In [1]:
import pandas as pd
import os
from openai import OpenAI
import openai
from dotenv import load_dotenv, find_dotenv
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Load variables from the nearest .env file into the environment, then build the OpenAI client
load_dotenv(find_dotenv())
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Load the CSV dataset and replace missing values with empty strings
file_path = './datasets/movies_with_sentiment_and_embedding.csv'
df = pd.read_csv(file_path).fillna('')  # Insert .head(100) before .fillna('') to limit the number of rows

print(f'{len(df)} rows')

8646 rows


In [2]:
# qdrant_client = QdrantClient(':memory:') # Uncomment this for testing locally

# Connect to the cloud-hosted Qdrant instance; the endpoint and API key are read
# from the QDRANT_URL and QDRANT_API_KEY environment variables
qdrant_client = QdrantClient(
    api_key=os.environ.get('QDRANT_API_KEY'),
    url=os.environ.get('QDRANT_URL'),
)

In [3]:
import ast

# Target collection name
collection_name = 'movies'

# Vectors are stored in the CSV as stringified lists: parse the first row's vector
# and use its length as the dimensionality of the collection
first_vector = ast.literal_eval(df['metadata_vector'][0])
vector_size = len(first_vector)

# Recreate the collection with a single named vector, 'metadata', compared by cosine distance
metadata_vector_params = rest.VectorParams(
    size=vector_size,
    distance=rest.Distance.COSINE,
)
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={'metadata': metadata_vector_params},
)

def calculate_payload_length(payload):
    """Return the total character length of the values in ``payload``.

    Used to estimate how much data is being inserted into the Qdrant collection.

    Each value contributes as follows:
      * str  -> its length
      * list -> the summed length of ``str(item)`` for every item
      * dict -> the length of the nested dict, computed recursively
      * anything else -> the length of ``str(value)``

    Keys are not counted.
    """
    def _length_of(value):
        # Measure a single payload value according to its type
        if isinstance(value, dict):
            return calculate_payload_length(value)
        if isinstance(value, list):
            return sum(len(str(item)) for item in value)
        if isinstance(value, str):
            return len(value)
        return len(str(value))

    return sum(_length_of(value) for value in payload.values())

# Add vectors to the collection
# Points are sent in batches rather than one request per row, which cuts the number
# of round trips to the server. NOTE(review): the batch size is a conservative guess
# meant to keep each request small - tune it against the server's request-size limit.
UPSERT_BATCH_SIZE = 64

# Running total of payload character counts plus vector element counts over all points
request_length = 0
pending_points = []

for k, v in df.iterrows():
    # Split the row into its payload (every other column) and the stringified vector.
    # Removing 'metadata_vector' from the payload reduces the payload length.
    result_dict = v.to_dict()
    # Parse the stringified list once and reuse it for both the size accounting and the point
    metadata_vector = ast.literal_eval(result_dict.pop('metadata_vector'))

    payload_length = calculate_payload_length(result_dict)
    vector_length = len(metadata_vector)
    total_length = payload_length + vector_length

    request_length = request_length + total_length

    pending_points.append(
        rest.PointStruct(
            id=k,
            vector={
                'metadata': metadata_vector,
            },
            payload=result_dict,
        )
    )

    # Flush a full batch
    if len(pending_points) >= UPSERT_BATCH_SIZE:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=pending_points,
        )
        pending_points = []

# Flush whatever is left over from the last, partially filled batch
if pending_points:
    qdrant_client.upsert(
        collection_name=collection_name,
        points=pending_points,
    )

print(f"Payload & vector length for all points: {request_length}")

print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

Payload & vector length for all points: 21308659
collections=[CollectionDescription(name='movies')]


CountResult(count=8646)

In [4]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=5, embedding_model='text-embedding-3-small'):
    """Embed ``query`` with OpenAI and return the closest points in a Qdrant collection.

    Args:
        query: Text to search for.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Name of the named vector in the collection to compare against.
        top_k: Maximum number of results to return.
        embedding_model: OpenAI embedding model used for the query. Be sure to use
            the same embedding model as the vectors in the collection.

    Returns:
        Whatever ``qdrant_client.search`` returns for the embedded query.
    """
    # Use the explicitly configured OpenAI client instead of the openai module's
    # implicit default client, so the credentials set up at the top are the ones used
    completion = client.embeddings.create(
        input=query,
        model=embedding_model,
    )
    embedded_query = completion.data[0].embedding

    # A (name, vector) tuple selects which named vector the search runs against
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [5]:
# Run a sample search against the 'metadata' vector and keep the results
query_results = query_qdrant(
    'movie about two people fall in love',
    collection_name=collection_name,
    vector_name='metadata',
)

# Print a ranked list: title, poster link and similarity score rounded to 3 decimals
for rank, hit in enumerate(query_results, start=1):
    title = hit.payload['title']
    poster_link = hit.payload['poster_link']
    print(f"{rank}. {title} {poster_link} (Score: {round(hit.score, 3)})")

1. Falling in Love (1984) https://images-na.ssl-images-amazon.com/images/M/MV5BYmUxNDMyOGQtNTQ1MS00NzhiLWE5ZDUtZGFmMDcyODVmYzU5XkEyXkFqcGdeQXVyMTA0MjU0Ng@@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.547)
2. Two Lovers (2008) https://images-na.ssl-images-amazon.com/images/M/MV5BMTIzNzIzMzc2Ml5BMl5BanBnXkFtZTcwNDE2MjAyMg@@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.542)
3. Bandits (2001) https://images-na.ssl-images-amazon.com/images/M/MV5BMTkyMzA3OTI3NV5BMl5BanBnXkFtZTYwMjU0ODM3._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.495)
4. Down with Love (2003) https://images-na.ssl-images-amazon.com/images/M/MV5BYjY0OTMyNmMtODEwZS00ZWJiLWI0ZjctNmNiNmJjMTMzYWM1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.481)
5. Freier Fall (2013) https://images-na.ssl-images-amazon.com/images/M/MV5BMTc3MjgwNTEzNV5BMl5BanBnXkFtZTcwNTM5MzY5OQ@@._V1_UY268_CR3,0,182,268_AL_.jpg (Score: 0.478)
