In [13]:
import pandas as pd
import os
from openai import OpenAI
import openai
from dotenv import load_dotenv, find_dotenv
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Load variables from the .env file located by find_dotenv() into the environment,
# then build the OpenAI client from the key found there.
load_dotenv(find_dotenv())
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Read the CSV dataset and replace missing values with empty strings.
# Add .head(100) after read_csv(...) if you want to limit the number of rows.
file_path = '../datasets/movies_051424.csv'
df = pd.read_csv(file_path).fillna('')

print(f'{len(df)} rows')
df.head()

10638 rows


Unnamed: 0,id,title,summary,year,certificate,runtime,runtime_mins,rating,votes,director_1,...,sentiment_score,sentiment_reason,recommended_audience,genres,directors,writers,casts,img,metadata,metadata_vector
0,15239678,Dune: Part Two,Paul Atreides unites with Chani and the Fremen...,2024,PG-13,2h 46m,166,8.7,360000,Denis Villeneuve,...,3,The movie 'Dune: Part Two' showcases themes of...,This movie is recommended for viewers who appr...,"['Action', 'Adventure', 'Drama']",['Denis Villeneuve'],"['Denis Villeneuve', 'Jon Spaihts', 'Frank Her...","['Timothée Chalamet', 'Zendaya', 'Rebecca Ferg...",https://m.media-amazon.com/images/M/MV5BN2QyZG...,Dune: Part Two | Denis Villeneuve | Timothée C...,"[-0.022296011447906494, 0.044338658452034, -0...."
1,14539740,Godzilla x Kong: The New Empire,"Two ancient titans, Godzilla and Kong, clash i...",2024,PG-13,1h 55m,115,6.5,37000,Adam Wingard,...,7,The movie ends on a positive note with Godzill...,This film is recommended for audiences who enj...,"['Action', 'Adventure', 'Fantasy']",['Adam Wingard'],"['Terry Rossio', 'Simon Barrett', 'Jeremy Slat...","['Rebecca Hall', 'Brian Tyree Henry', 'Dan Ste...",https://m.media-amazon.com/images/M/MV5BY2QwOG...,Godzilla x Kong: The New Empire | Adam Wingard...,"[-0.004718017764389515, 0.05989082157611847, 0..."
2,23137904,Rebel Moon - Part Two: The Scargiver,Kora and surviving warriors prepare to defend ...,2024,PG-13,2h 2m,122,5.2,17000,Zack Snyder,...,7,"The synopsis highlights themes of unity, resil...",This movie is recommended for audiences who en...,"['Action', 'Adventure', 'Drama']",['Zack Snyder'],"['Zack Snyder', 'Kurt Johnstad', 'Shay Hatten']","['Sofia Boutella', 'Djimon Hounsou', 'Ed Skrein']",https://m.media-amazon.com/images/M/MV5BYmQ2OD...,Rebel Moon - Part Two: The Scargiver | Zack Sn...,"[0.024749718606472015, 0.075119748711586, 0.01..."
3,21692408,Kung Fu Panda 4,After Po is tapped to become the Spiritual Lea...,2024,PG,1h 34m,94,6.4,30000,Mike Mitchell,...,8,The synopsis portrays a heartwarming story of ...,This movie is recommended for audiences seekin...,"['Animation', 'Action', 'Adventure']","['Mike Mitchell', 'Stephanie Stine']","['Jonathan Aibel', 'Glenn Berger', 'Darren Lem...","['Jack Black', 'Awkwafina', 'Viola Davis']",https://m.media-amazon.com/images/M/MV5BZDY0Yz...,"Kung Fu Panda 4 | Mike Mitchell | Jack Black, ...","[-0.019859328866004944, 0.011722062714397907, ..."
4,1160419,Dune,A noble family becomes embroiled in a war for ...,2021,PG-13,2h 35m,155,8.0,851000,Denis Villeneuve,...,6,The synopsis portrays a mix of challenges and ...,Recommended for viewers who appreciate epic ta...,"['Action', 'Adventure', 'Drama']",['Denis Villeneuve'],"['Jon Spaihts', 'Denis Villeneuve', 'Eric Roth']","['Timothée Chalamet', 'Rebecca Ferguson', 'Zen...",https://m.media-amazon.com/images/M/MV5BMDQ0Nj...,"Dune | Denis Villeneuve | Timothée Chalamet, R...","[-0.030934041365981102, 0.032390281558036804, ..."


In [14]:
# qdrant_client = QdrantClient(':memory:') # Uncomment this for testing locally

# Connect to the cloud version of Qdrant; the endpoint and API key are read from
# the QDRANT_URL and QDRANT_API_KEY environment variables.
qdrant_client = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))

# Listing the existing collections doubles as a quick connectivity check.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='songs')]


In [16]:
import ast

# Collection name, plus the vector dimensionality taken from the data itself:
# 'metadata_vector' stores each embedding as a stringified list, so parse the
# first row back into a list and measure it.
collection_name = 'movies'
first_vector = ast.literal_eval(df['metadata_vector'][0])
vector_size = len(first_vector)

# (Re)create the collection with a single named vector, 'metadata', that is
# compared using cosine distance.
metadata_vector_params = rest.VectorParams(
    size=vector_size,
    distance=rest.Distance.COSINE,
)
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={'metadata': metadata_vector_params},
)

# Calculate the length of payload that is being inserted into the Qdrant collection
def calculate_payload_length(payload):
    """Return the total character count of the values in ``payload``.

    Keys are not counted. Each value contributes as follows:

    * ``str``  -> its own length
    * ``list`` -> the summed length of ``str(item)`` for every item
    * ``dict`` -> the result of measuring it recursively
    * anything else -> the length of its ``str()`` representation
    """
    def _value_length(value):
        # Order of the checks matters only for exotic subclasses; it mirrors
        # str -> list -> dict -> fallback.
        if isinstance(value, str):
            return len(value)
        if isinstance(value, list):
            return sum(len(str(item)) for item in value)
        if isinstance(value, dict):
            return calculate_payload_length(value)
        return len(str(value))

    return sum(_value_length(value) for value in payload.values())

# Add vectors to the collection
# Points are sent in modest batches instead of one HTTP request per row; tune the
# batch size if requests become too large or too slow for your cluster.
UPSERT_BATCH_SIZE = 64

request_length = 0
pending_points = []

for row_index, row in df.iterrows():
    payload = row.to_dict()
    # The embedding is stored as the named vector 'metadata', so remove its
    # stringified copy from the payload (reduces the payload length) and parse
    # it into a list exactly once.
    vector = ast.literal_eval(payload.pop('metadata_vector'))

    # Diagnostic only: payload characters plus the number of vector components.
    request_length += calculate_payload_length(payload) + len(vector)

    pending_points.append(
        rest.PointStruct(
            id=row_index,
            vector={'metadata': vector},
            payload=payload,
        )
    )

    if len(pending_points) >= UPSERT_BATCH_SIZE:
        qdrant_client.upsert(collection_name=collection_name, points=pending_points)
        pending_points = []

# Flush whatever is left over after the last full batch.
if pending_points:
    qdrant_client.upsert(collection_name=collection_name, points=pending_points)

print(f"Payload & vector length for all points: {request_length}")

print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

Payload & vector length for all points: 28554374
collections=[CollectionDescription(name='songs'), CollectionDescription(name='movies')]


CountResult(count=10638)

In [17]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=5, embedding_model='text-embedding-3-small'):
    """Embed ``query`` with OpenAI and search a Qdrant collection with the result.

    Args:
        query: Text to embed and search for.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Name of the named vector in the collection to compare against.
        top_k: Maximum number of results to return.
        embedding_model: OpenAI embedding model used for the query. Be sure to use
            the same embedding model as the vectors in the collection.

    Returns:
        The result of ``qdrant_client.search`` for the embedded query; the notebook
        reads ``.payload`` and ``.score`` from each returned item.
    """
    # Use the OpenAI client that was explicitly configured with the API key in the
    # setup cell, rather than the implicit module-level client.
    response = client.embeddings.create(input=query, model=embedding_model)
    embedded_query = response.data[0].embedding

    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [19]:
# Run a free-text similarity search against the 'metadata' vectors and keep the result
collection_name = 'movies'
query_results = query_qdrant('paris texas', collection_name=collection_name, vector_name='metadata')

# Print a ranked list: title, image URL and similarity score rounded to 3 decimals.
for rank, hit in enumerate(query_results, start=1):
    print(f"{rank}. {hit.payload['title']} {hit.payload['img']} (Score: {round(hit.score, 3)})")

1. Paris, Texas https://m.media-amazon.com/images/M/MV5BM2RjMmU3ZWItYzBlMy00ZmJkLWE5YzgtNTVkODdhOWM3NGZhXkEyXkFqcGdeQXVyNDA5Mjg5MjA@.jpg (Score: 0.596)
2. Paris, I Love You https://m.media-amazon.com/images/M/MV5BMTc1MDgwNDE4MF5BMl5BanBnXkFtZTcwMTQzMzc0MQ@@.jpg (Score: 0.414)
3. Paris Is Burning https://m.media-amazon.com/images/M/MV5BNzk1ODAzNTQtYTQyOC00ZTcxLWE3Y2ItZmJhNGI2MGM5NjkwXkEyXkFqcGdeQXVyMTQxNzMzNDI@.jpg (Score: 0.388)
4. Happy, Texas https://m.media-amazon.com/images/M/MV5BY2U4YWYzY2YtZDczNS00YTIxLWI4YzItZWE3YzIwZmJkYTAyXkEyXkFqcGdeQXVyMTcwOTQzOTYy._V1_QL75_UX140_CR0,0,140,207_.jpg@.jpg (Score: 0.383)
5. A Monster in Paris https://m.media-amazon.com/images/M/MV5BYWY0OWI4OWYtM2U5OS00ZDZmLTk3MTUtNDg2MDhmYjUxNGY2XkEyXkFqcGdeQXVyMTU3NDg0OTgx._V1_QL75_UX140_CR0,1,140,207_.jpg@.jpg (Score: 0.377)


In [55]:
def filter_qdrant(year=1996, genres=('Action',), collection_name='movies', limit=5):
    """Scroll a Qdrant collection using payload filters only (no vector search).

    The filter has a ``must`` condition on the ``year`` payload field and
    ``should`` conditions that match ``genre_1``, ``genre_2`` and ``genre_3``
    against any of ``genres``.

    Args:
        year: Value the ``year`` payload field has to equal.
        genres: Iterable of genre names passed to ``MatchAny`` for each genre field.
        collection_name: Name of the Qdrant collection to scroll.
        limit: Maximum number of records to return.

    Returns:
        The result of ``qdrant_client.scroll``; callers in this notebook read the
        matching records from index 0 of that result.
    """
    # No embedding is needed here: scroll() filters on payload fields only, so the
    # previous (unused) OpenAI embedding request has been removed.
    genre_values = list(genres)

    return qdrant_client.scroll(
        collection_name=collection_name,
        scroll_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="year",
                    match=models.MatchValue(value=year),
                ),
            ],
            should=[
                models.FieldCondition(
                    key=genre_key,
                    match=models.MatchAny(any=genre_values),
                )
                for genre_key in ("genre_1", "genre_2", "genre_3")
            ],
        ),
        limit=limit,
    )

In [56]:
import json
query_results = filter_qdrant()

# The records are read from index 0 of the scroll result.
records = query_results[0]

# Payload fields to keep for each record, in display order.
SUMMARY_FIELDS = ("id", "title", "year", "genre_1", "genre_2", "genre_3", "genres")

results = [
    {field: record.payload[field] for field in SUMMARY_FIELDS}
    for record in records
]
# The selected payload values can be printed directly; no JSON round-trip is needed.
print(results)

[{'id': 116629, 'title': 'Independence Day', 'year': 1996, 'genre_1': 'Action', 'genre_2': 'Adventure', 'genre_3': 'Sci-Fi', 'genres': "['Action', 'Adventure', 'Sci-Fi']"}, {'id': 117998, 'title': 'Twister', 'year': 1996, 'genre_1': 'Action', 'genre_2': 'Adventure', 'genre_3': 'Thriller', 'genres': "['Action', 'Adventure', 'Thriller']"}, {'id': 117500, 'title': 'The Rock', 'year': 1996, 'genre_1': 'Action', 'genre_2': 'Adventure', 'genre_3': 'Thriller', 'genres': "['Action', 'Adventure', 'Thriller']"}, {'id': 117060, 'title': 'Mission: Impossible', 'year': 1996, 'genre_1': 'Action', 'genre_2': 'Adventure', 'genre_3': 'Thriller', 'genres': "['Action', 'Adventure', 'Thriller']"}, {'id': 117705, 'title': 'Space Jam', 'year': 1996, 'genre_1': 'Animation', 'genre_2': 'Adventure', 'genre_3': 'Comedy', 'genres': "['Animation', 'Adventure', 'Comedy']"}]
