In [1]:
import pandas as pd
import os
from openai import OpenAI
import openai
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

# Load environment variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Load the CSV dataset and replace missing values with empty strings
file_path = './datasets/movies_embeddings.csv'
df = pd.read_csv(file_path).fillna('')  # Insert .head(100) before .fillna('') to limit the number of rows

print(f'{len(df)} rows')

37918 rows


In [2]:
import ast

# Qdrant client backed by the ':memory:' location
qdrant_client = QdrantClient(':memory:')

# Collection name, and the vector dimensionality taken from the first row
collection_name = 'movies'
vector_size = len(ast.literal_eval(df['metadata_vector'][0]))  # the column holds each vector as a string


def _row_to_point(row_id, row):
    """Build one Qdrant point from a dataframe row: parsed vector plus the full row as payload."""
    return rest.PointStruct(
        id=row_id,
        vector={'metadata': ast.literal_eval(row['metadata_vector'])},  # string -> list
        payload=row.to_dict(),
    )


# (Re)create the collection with a single named vector, 'metadata', compared by cosine distance
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        'metadata': rest.VectorParams(size=vector_size, distance=rest.Distance.COSINE),
    },
)

# Insert one point per dataframe row
qdrant_client.upsert(
    collection_name=collection_name,
    points=[_row_to_point(row_id, row) for row_id, row in df.iterrows()],
)

# Sanity check: list the collections, then show the point count as the cell output
print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

collections=[CollectionDescription(name='movies')]


CountResult(count=37918)

In [3]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=10, model='text-embedding-3-small'):
    """Embed ``query`` with OpenAI and return the closest points from Qdrant.

    Args:
        query: Text to embed and search for.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Name of the named vector to compare against (e.g. 'metadata').
        top_k: Maximum number of hits to return.
        model: OpenAI embedding model. Be sure to use the same embedding model
            as the vectors stored in the collection.

    Returns:
        The result of ``qdrant_client.search`` for the embedded query.
    """
    # Use the explicitly configured `client` from the setup cell rather than the
    # implicit module-level default client, so the API key handling lives in one place.
    embedding_response = client.embeddings.create(input=query, model=model)
    embedded_query = embedding_response.data[0].embedding

    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [4]:
# Filter by conditions in Qdrant
def filter_qdrant(adult, date, collection_name, vector_name, top_k=10):
    """Scroll through points whose payload matches a date and adult-flag filter.

    Args:
        adult: Value the payload field "adult" must equal.
        date: Lower bound (exclusive) for the payload field "date",
            e.g. '2005-02-08T10:49:00Z'.
        collection_name: Name of the Qdrant collection to scroll.
        vector_name: Unused; no vector search is performed here. Kept so the
            signature stays parallel to ``query_qdrant``.
        top_k: Maximum number of records to return.

    Returns:
        The value returned by ``qdrant_client.scroll``; callers in this
        notebook read the matching records from index 0 of that result.
    """
    payload_filter = models.Filter(
        must=[
            # Only "greater than" is used; gte / lt / lte are left unset.
            models.FieldCondition(
                key="date",
                range=models.DatetimeRange(gt=date),
            ),
            models.FieldCondition(
                key="adult",
                match=models.MatchValue(value=adult),
            ),
        ]
    )

    return qdrant_client.scroll(
        collection_name=collection_name,
        scroll_filter=payload_filter,
        limit=top_k,  # previously top_k was accepted but never forwarded
    )

In [9]:
# Non-adult titles dated after 2005-02-08; the matching records sit at index 0 of the result
query_results = filter_qdrant(
    adult=False,
    date='2005-02-08T10:49:00Z',
    collection_name=collection_name,
    vector_name='metadata',
)
matching_records = query_results[0]

for position, record in enumerate(matching_records, start=1):
    print(f"{position}. {record.payload['title']} ")

1. A Sound of Thunder (2005) 
2. The Jacket (2005) 
3. The Interpreter (2005) 
4. The Ring Two (2005) 
5. A Scanner Darkly (2006) 
6. Charlie and the Chocolate Factory (2005) 
7. Underclassman (2005) 
8. Hitch (2005) 
9. Pooh's Heffalump Movie (2005) 
10. Inside Deep Throat (2005) 


In [10]:
# Similarity search against the 'metadata' vectors for a single keyword
query_results = query_qdrant(
    'consultant',
    collection_name=collection_name,
    vector_name='metadata',
)

for position, hit in enumerate(query_results, start=1):
    rounded_score = round(hit.score, 3)
    print(f"{position}. {hit.payload['title']} {hit.payload['poster_link']} (Score: {rounded_score})")

1. The Specialist (1994) https://images-na.ssl-images-amazon.com/images/M/MV5BYjMwZDMwZTItMTc2MC00NDRlLWI3YmUtNTg0ZmQ3MzdhNDJmXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_UY268_CR3,0,182,268_AL_.jpg (Score: 0.24)
2. Temptation: Confessions of a Marriage Counselor (2013) https://images-na.ssl-images-amazon.com/images/M/MV5BMTg3MzExNjU1N15BMl5BanBnXkFtZTcwMzk0ODU5OA@@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.22)
3. Advise & Consent (1962) https://images-na.ssl-images-amazon.com/images/M/MV5BMTcxMjU3OTA3NV5BMl5BanBnXkFtZTgwMTUxNTIxMDE@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.218)
4. The Expert (1995) https://images-na.ssl-images-amazon.com/images/M/MV5BMjA4MTQ5NDc5MV5BMl5BanBnXkFtZTcwNjQ2MzgxMQ@@._V1_UY268_CR8,0,182,268_AL_.jpg (Score: 0.216)
5. The Counselor (2013) https://images-na.ssl-images-amazon.com/images/M/MV5BMTc3ODk0MTY0N15BMl5BanBnXkFtZTgwOTU2MTEzMDE@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.214)
6. Guru (2007) https://images-na.ssl-images-amazon.com/images/M/MV5BMjAzMTkyMTQ0MF5BMl5Ban

In [11]:
# Format the response as json
import json
from datetime import datetime
import locale
import ast

def format_time_to_minutes(minutes_float):
    """Render a duration given in minutes as 'Hh Mm', or just 'Mm' when under an hour.

    The input is truncated to a whole number of minutes with ``int`` first.
    """
    hours, minutes = divmod(int(minutes_float), 60)

    # The hour part is shown only when there is at least one full hour
    if hours > 0:
        return f"{hours}h {minutes}m"
    return f"{minutes}m"

def format_as_dollars(number):
    """Format a number as a US-dollar string, e.g. 7000000 -> '$7,000,000.00'.

    The value is truncated to a whole number with ``int`` before formatting,
    so the cents part is always '.00'.

    The formatting is done directly instead of through the ``locale`` module:
    the previous implementation reset the process-wide locale on every call,
    raised ``ValueError`` under the C/POSIX locale, and produced a
    locale-dependent layout on machines not configured for en_US.

    Args:
        number: Numeric value (or anything ``int()`` accepts).

    Returns:
        A string such as '$793,775.00'; negative values are rendered as '-$1,234.00'.
    """
    whole_dollars = int(number)
    sign = "-" if whole_dollars < 0 else ""
    # Integer formatting with ',' keeps arbitrarily large values exact
    return f"{sign}${abs(whole_dollars):,}.00"

def convert_utc_to_mm_dd_yyyy(utc_datetime_str):
    """Turn a 'YYYY-MM-DDTHH:MM:SSZ' timestamp string into 'MM-DD-YYYY'.

    Raises ValueError (from ``datetime.strptime``) if the input does not
    match that exact layout.
    """
    input_layout = "%Y-%m-%dT%H:%M:%SZ"
    output_layout = "%m-%d-%Y"
    return datetime.strptime(utc_datetime_str, input_layout).strftime(output_layout)

# Function to search for similar vectors
def search_movies_in_qdrant(query):
    """Run a similarity search on the 'metadata' vectors and return JSON-ready dicts.

    Each hit's payload is reshaped into a plain dict: the date is reformatted
    to MM-DD-YYYY, runtime / budget / revenue are turned into display strings,
    and the 'genres' and 'production' fields (stored as string
    representations of lists) are parsed back into lists. 'rank' is the
    zero-based position of the hit in the search results.
    """
    hits = query_qdrant(query, collection_name, 'metadata')

    def to_result(rank, hit):
        payload = hit.payload
        return {
            "rank": rank,
            "title": payload["title"],
            "summary": payload["summary"],
            "date": convert_utc_to_mm_dd_yyyy(payload["date"]),  # MM-DD-YYYY
            "genres": ast.literal_eval(payload["genres"]),
            "runtime": format_time_to_minutes(payload["runtime"]),
            "rating": payload["rating"],
            "votes": int(payload["votes"]),
            "budget": format_as_dollars(payload["budget"]),
            "revenue": format_as_dollars(payload["revenue"]),
            "language": payload["language"],
            "adult": payload["adult"],
            "production": ast.literal_eval(payload["production"]),
            "poster_link": payload["poster_link"],
        }

    return [to_result(rank, hit) for rank, hit in enumerate(hits)]

In [12]:
# NOTE: the "before year 1998" part of the query is only text to embed; results are
# ranked by vector similarity alone (the output below includes a 2008 title).
query = 'scary movies about monsters before year 1998'
response = search_movies_in_qdrant(query)

# Pretty-print the formatted hits as JSON
json_string = json.dumps(response, indent=2)
print(json_string)

[
  {
    "rank": 0,
    "title": "Little Monsters (1989)",
    "summary": "A young boy is scared of the monster under his bed. He asks his 6th grade brother to swap rooms for the night as a bet that the monster really exists. Soon the brother becomes friends with the monster and discovers a whole new world of fun and games under his bed where pulling pranks on kids and other monsters is the main attraction",
    "date": "08-25-1989",
    "genres": [
      "Adventure",
      "Comedy",
      "Family"
    ],
    "runtime": "1h 42m",
    "rating": 6.1,
    "votes": 53,
    "budget": "$7,000,000.00",
    "revenue": "$793,775.00",
    "language": "English",
    "adult": false,
    "production": [
      ""
    ],
    "poster_link": "https://images-na.ssl-images-amazon.com/images/M/MV5BM2VjOThmNjktNGE1ZS00NWY3LThlMzItOWEyZjMwOGI4ZDNmXkEyXkFqcGdeQXVyMzM4MjM0Nzg@._V1_UX182_CR0,0,182,268_AL_.jpg"
  },
  {
    "rank": 1,
    "title": "Monster (2008)",
    "summary": "Two women, aspiring documenta

In [13]:
# Free-text query describing origin and plot
query = 'british movie about two people fall in love'
response = search_movies_in_qdrant(query)

# Pretty-print the formatted hits as JSON
json_string = json.dumps(response, indent=2)
print(json_string)

[
  {
    "rank": 0,
    "title": "Born Romantic (2000)",
    "summary": "In modern-day London, three men (Craig Ferguson, Jimi Mistry and David Morrissey) and three women (Olivia Williams, Jane Horrocks and Catherine McCormack) fall in and out of love and back again, to the Greek-chorus accompaniment of two cab drivers, who engage in an ongoing conversation about sex. A winning romantic comedy, Born Romantic is the second feature by British writer-director David Kane of This Year's Love fame.",
    "date": "09-14-2000",
    "genres": [
      "Comedy"
    ],
    "runtime": "1h 36m",
    "rating": 6.5,
    "votes": 8,
    "budget": "$0.00",
    "revenue": "$0.00",
    "language": "English",
    "adult": false,
    "production": [
      "Kismet Film Company",
      "British Broadcasting Corporation (BBC)",
      "Harvest Pictures",
      "Random Harvest Pictures"
    ],
    "poster_link": "https://images-na.ssl-images-amazon.com/images/M/MV5BMTk5ODUxMjIzNV5BMl5BanBnXkFtZTcwMDI1MTMyMQ@@._

In [14]:
# Free-text query on a single topic
query = 'movie about eugenics'
response = search_movies_in_qdrant(query)

# Pretty-print the formatted hits as JSON
json_string = json.dumps(response, indent=2)
print(json_string)

[
  {
    "rank": 0,
    "title": "Gattaca (1997)",
    "summary": "Science fiction drama about a future society in the era of indefinite eugenics where humans are set on a life course depending on their DNA. The young Vincent Freeman is born with a condition that would prevent him from space travel, yet he is determined to infiltrate the GATTACA space program.",
    "date": "09-07-1997",
    "genres": [
      "Drama",
      "Sci-Fi",
      "Thriller"
    ],
    "runtime": "1h 46m",
    "rating": 7.8,
    "votes": 1846,
    "budget": "$36,000,000.00",
    "revenue": "$12,532,777.00",
    "language": "English",
    "adult": false,
    "production": [
      "Columbia Pictures",
      "Jersey Films"
    ],
    "poster_link": "https://images-na.ssl-images-amazon.com/images/M/MV5BNDQxOTc0MzMtZmRlOS00OWQ5LWI2ZDctOTAwNmMwOTYxYzlhXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg"
  },
  {
    "rank": 1,
    "title": "Errors of the Human Body (2012)",
    "summary": "Canadian scienti