In [1]:
import pandas as pd
import os
from openai import OpenAI
import openai
from dotenv import load_dotenv, find_dotenv
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Load environment variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# OpenAI client authenticated with the key read from the environment
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Load the movie dataset and replace missing cells with empty strings
# (append .head(100) after read_csv to work with a smaller sample)
file_path = '../datasets/movies_final.csv'
df = pd.read_csv(file_path).fillna('')

print(f"{len(df)} rows")
df.head()

10638 rows


Unnamed: 0,id,title,summary,year,genre,certificate,runtime,runtime_mins,rating,votes,sentiment_compound,sentiment_normalized,img,metadata,metadata_vector
0,15239678,Dune: Part Two,Paul Atreides unites with Chani and the Fremen...,2024,Adventure,PG-13,2h 46m,166,8.7,360000,-0.765,1,https://m.media-amazon.com/images/M/MV5BN2QyZG...,Dune: Part Two | Paul Atreides unites with Cha...,"[-0.011729336343705654, 0.04041394963860512, 0..."
1,14539740,Godzilla x Kong: The New Empire,"Two ancient titans, Godzilla and Kong, clash i...",2024,Adventure,PG-13,1h 55m,115,6.5,37000,-0.3818,2,https://m.media-amazon.com/images/M/MV5BY2QwOG...,Godzilla x Kong: The New Empire | Two ancient ...,"[0.007644626311957836, 0.03877980634570122, 0...."
2,23137904,Rebel Moon - Part Two: The Scargiver,Kora and surviving warriors prepare to defend ...,2024,Adventure,PG-13,2h 2m,122,5.2,17000,0.2023,3,https://m.media-amazon.com/images/M/MV5BYmQ2OD...,Rebel Moon - Part Two: The Scargiver | Kora an...,"[0.043467093259096146, 0.0680377185344696, 0.0..."
3,21692408,Kung Fu Panda 4,After Po is tapped to become the Spiritual Lea...,2024,Adventure,PG,1h 34m,94,6.4,30000,-0.8957,1,https://m.media-amazon.com/images/M/MV5BZDY0Yz...,Kung Fu Panda 4 | After Po is tapped to become...,"[-0.018895048648118973, 0.019839802756905556, ..."
4,1160419,Dune,A noble family becomes embroiled in a war for ...,2021,Adventure,PG-13,2h 35m,155,8.0,851000,0.489,4,https://m.media-amazon.com/images/M/MV5BMDQ0Nj...,Dune | A noble family becomes embroiled in a w...,"[-0.04253479093313217, 0.03401784226298332, 0...."


In [2]:
# For local testing, replace the connection below with:
#   qdrant_client = QdrantClient(':memory:')

# Connection settings for the cloud Qdrant instance are read from the environment
qdrant_url = os.getenv('QDRANT_URL')
qdrant_api_key = os.getenv('QDRANT_API_KEY')

qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

In [3]:
import ast

# Set the collection name and size
collection_name = 'movies'
# The stored vectors are stringified lists; parse the first row (by position, so
# this keeps working even if the frame's index labels no longer start at 0) and
# use its length as the vector dimensionality for the collection.
vector_size = len(ast.literal_eval(df['metadata_vector'].iloc[0]))

# Start from an empty collection: drop any previous copy, then create it anew.
# (`recreate_collection` is deprecated in recent qdrant-client releases.)
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={
        # Named vector 'metadata', compared with cosine distance
        'metadata': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    },
)

# Calculate the length of payload that is being inserted into the Qdrant collection
def calculate_payload_length(payload):
    """Return the combined character count of every value in ``payload``.

    Strings contribute their own length, lists contribute the length of
    ``str(item)`` for each item, nested dicts are measured recursively, and
    any other value contributes the length of its ``str()`` form. Keys are
    not counted.
    """
    def _measure(entry):
        # Size of a single payload value, following the rules above
        if isinstance(entry, dict):
            return calculate_payload_length(entry)
        if isinstance(entry, list):
            return sum(len(str(element)) for element in entry)
        if isinstance(entry, str):
            return len(entry)
        return len(str(entry))

    return sum(_measure(entry) for entry in payload.values())

# Add vectors to the collection
# Points are sent in batches instead of one request per row, which cuts the
# number of round trips to the Qdrant server for the whole dataset.
UPSERT_BATCH_SIZE = 100  # points per upsert request

request_length = 0
pending_points = []

for k, v in df.iterrows():
    # Take 'metadata_vector' out of the payload (it is stored as the point's
    # vector instead) and parse the stringified list exactly once per row.
    result_dict = v.to_dict()
    vector = ast.literal_eval(result_dict.pop('metadata_vector'))

    # Running total: payload character count plus number of vector components
    request_length += calculate_payload_length(result_dict) + len(vector)

    pending_points.append(
        rest.PointStruct(
            id=k,
            vector={
                'metadata': vector,
            },
            payload=result_dict,
        )
    )

    # Flush a full batch
    if len(pending_points) >= UPSERT_BATCH_SIZE:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=pending_points,
        )
        pending_points = []

# Flush whatever is left over after the last full batch
if pending_points:
    qdrant_client.upsert(
        collection_name=collection_name,
        points=pending_points,
    )

print(f"Payload & vector length for all points: {request_length}")

print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

Payload & vector length for all points: 21643336
collections=[CollectionDescription(name='songs'), CollectionDescription(name='movies')]


CountResult(count=10638)

In [4]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=5):
    """Embed ``query`` with OpenAI and search a Qdrant collection with it.

    Args:
        query: Text to embed and search for.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Name of the vector within the collection to compare against.
        top_k: Maximum number of results to return (default 5).

    Returns:
        The result of ``qdrant_client.search`` for the embedded query; callers
        in this notebook read ``.payload`` and ``.score`` from each item.
    """
    # Use the explicitly configured OpenAI client created at the top of the
    # notebook rather than the openai module's implicit default client, so the
    # API key is configured in exactly one place.
    embedding_response = client.embeddings.create(
        input=query,
        model='text-embedding-3-small'  # Be sure to use the same embedding model as the vectors in the collection
    )
    embedded_query = embedding_response.data[0].embedding

    # (vector_name, vector) selects which named vector of the collection to search
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [5]:
# Find the movies closest to a free-text query and list them by rank
query_results = query_qdrant('la la land', collection_name=collection_name, vector_name='metadata')

for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['img']} (Score: {round(hit.score, 3)})")

1. La La Land https://m.media-amazon.com/images/M/MV5BMzUzNDM2NzM2MV5BMl5BanBnXkFtZTgwNTM3NTg4OTE@.jpg (Score: 0.519)
2. Land https://m.media-amazon.com/images/M/MV5BMDk4YmZiZjYtMjgxMi00ZWU4LTk5MTQtMTBjOTZiOGUxOTM2XkEyXkFqcGdeQXVyODk4OTc3MTY@.jpg (Score: 0.423)
3. Promised Land https://m.media-amazon.com/images/M/MV5BMTQxNDYzNzgyOF5BMl5BanBnXkFtZTcwNTU0NTI1OA@@.jpg (Score: 0.385)
4. Cop Land https://m.media-amazon.com/images/M/MV5BNmNhMzI0NmQtMzU1OS00NzQzLTg0NzctZDJkODZlMjU3OTc5XkEyXkFqcGdeQXVyNDc2NjEyMw@@.jpg (Score: 0.374)
5. Inland Empire https://m.media-amazon.com/images/M/MV5BMTQ3NDQyNjM3NF5BMl5BanBnXkFtZTcwMzk5MDU0MQ@@.jpg (Score: 0.368)
