In [1]:
import pandas as pd
import os
import logging

# Read the CSV dataset
file_path = './datasets/movies.csv'
df = pd.read_csv(file_path).head(100)

# Perform the data cleaning
def clean_dataset(text, sep_token=' \n '):
    if pd.isna(text):
        return ''
    else:
        text = str(text)
        text = text.strip()
    return text

df['summary'] = df['summary'].apply(clean_dataset)
df['title'] = df['title'].apply(clean_dataset)

df = df.fillna('')

df.head(5)

Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link
0,114709.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",10/30/95,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,https://images-na.ssl-images-amazon.com/images...
1,113497.0,Jumanji,When siblings Judy and Peter discover an encha...,12/15/95,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Com...",https://images-na.ssl-images-amazon.com/images...
2,113228.0,Grumpier Old Men,A family wedding reignites the ancient feud be...,12/22/95,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate",https://images-na.ssl-images-amazon.com/images...
3,114885.0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",12/22/95,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,https://images-na.ssl-images-amazon.com/images...
4,113041.0,Father of the Bride Part II,Just when George Banks has recovered from his ...,2/10/95,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures",https://images-na.ssl-images-amazon.com/images...


In [2]:
from openai import OpenAI
import openai

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

client = OpenAI(
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Generate the vector embeddings for selected columns
def get_embedding(text, model='text-embedding-3-small'):
    try:
        text = text.replace('\n', ' ')
        return client.embeddings.create(input = [text], model=model).data[0].embedding
    except Exception as e:
        logging.error(f'Error generating embeddings: {e}. Found issue in the following text: {text}.')
        text = 'No data available'
        return client.embeddings.create(input = [text], model=model).data[0].embedding

df['summary_vector'] = df['summary'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
df['title_vector'] = df['title'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
print('Conversion to vector embeddings has been completed.')

df.head(5)

ERROR:root:Error generating embeddings: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}. Found issue in the following text: .


Conversion to vector embeddings has been completed.


Unnamed: 0,imdb_id,title,summary,date,genres,runtime,rating,votes,budget,revenue,language,adult,production,poster_link,summary_vector,title_vector
0,114709.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",10/30/95,"Animation, Adventure, Comedy",81.0,8.3,5415.0,30000000,373554033.0,en,False,Pixar Animation Studios,https://images-na.ssl-images-amazon.com/images...,"[0.015744103118777275, -0.00884184055030346, -...","[-0.007819436490535736, -0.018978964537382126,..."
1,113497.0,Jumanji,When siblings Judy and Peter discover an encha...,12/15/95,"Action, Adventure, Family",104.0,6.9,2413.0,65000000,262797249.0,en,False,"TriStar Pictures, Teitler Film, Interscope Com...",https://images-na.ssl-images-amazon.com/images...,"[-0.003799290396273136, 0.05464206635951996, 0...","[-0.027284517884254456, 0.046195078641176224, ..."
2,113228.0,Grumpier Old Men,A family wedding reignites the ancient feud be...,12/22/95,"Comedy, Romance",101.0,6.6,92.0,0,0.0,en,False,"Warner Bros., Lancaster Gate",https://images-na.ssl-images-amazon.com/images...,"[-0.0046460190787911415, 0.06853807717561722, ...","[0.008933168835937977, 0.04662678390741348, -0..."
3,114885.0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",12/22/95,"Comedy, Drama, Romance",127.0,5.7,34.0,16000000,81452156.0,en,False,Twentieth Century Fox Film Corporation,https://images-na.ssl-images-amazon.com/images...,"[-0.006611499935388565, 0.05524144694209099, -...","[0.011703846044838428, -0.005784958600997925, ..."
4,113041.0,Father of the Bride Part II,Just when George Banks has recovered from his ...,2/10/95,"Comedy, Family, Romance",106.0,5.9,173.0,0,76578911.0,en,False,"Sandollar Productions, Touchstone Pictures",https://images-na.ssl-images-amazon.com/images...,"[0.027686908841133118, 0.028660569339990616, 0...","[0.005981959868222475, 0.0660114511847496, -0...."


In [3]:
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Initialize the Qdrant client
qdrant_client = QdrantClient(':memory:')

# Set the collection name and size
collection_name = 'movies'
vector_size = len(df['summary_vector'][0])

# Create a collection
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        'title': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
        'summary': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

# Add vectors to the collection
qdrant_client.upsert(
    collection_name=collection_name,
    points=[
        rest.PointStruct(
            id=k,
            vector={
                'title': v['title_vector'],
                'summary': v['summary_vector'],
            },
            payload=v.to_dict(),
        )
        for k, v in df.iterrows()
    ],
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [4]:
print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

collections=[CollectionDescription(name='movies')]


CountResult(count=100)

In [5]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name='title', top_k=5):
    # Creates embedding vector from user query
    completion = openai.embeddings.create(
        input=query,
        model='text-embedding-3-small'  # Be sure to use the same embedding model as the vectors in the collection
    )

    embedded_query = completion.data[0].embedding

    query_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=(
            vector_name, embedded_query
        ),
        limit=top_k,
    )
    
    return query_results

In [6]:
# Search for similar vectors and store a result
query_results = query_qdrant('movies about toys', collection_name)

for i, vector in enumerate(query_results):
    print(f"{i + 1}. {vector.payload['title']} (Score: {round(vector.score, 3)})")

1. Toy Story (Score: 0.604)
2. The City of Lost Children (Score: 0.407)
3. Lawnmower Man 2: Beyond Cyberspace (Score: 0.382)
4. Jumanji (Score: 0.37)
5. Kicking and Screaming (Score: 0.369)


In [7]:
# Search for similar vectors to a selected vector
query_results = query_qdrant('scary movies about monsters', collection_name, 'summary')

for i, vector in enumerate(query_results):
    print(f"{i + 1}. {vector.payload['title']} (Score: {round(vector.score, 3)})")

1. Screamers (Score: 0.323)
2. Vampire in Brooklyn (Score: 0.304)
3. Se7en (Score: 0.301)
4. Mortal Kombat (Score: 0.297)
5. Dracula: Dead and Loving It (Score: 0.283)
