In [87]:
import pandas as pd
import os
# import logging
from openai import OpenAI
import openai
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record
from sentence_transformers import SentenceTransformer

load_dotenv(find_dotenv())
client = OpenAI(
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Read the CSV dataset
file_path = './datasets/movies_embedding.csv'
df = pd.read_csv(file_path) # Add .head(100) if you want to limit the number of rows

df = df.fillna('')

from datetime import datetime, timezone

def convert_to_utc(date_str):
    date_obj = datetime.strptime(date_str, '%m/%d/%Y')
    utc_date_obj = date_obj.replace(tzinfo=timezone.utc)
    utc_date_str = utc_date_obj.strftime('%Y-%m-%dT%H:%M:%SZ')
    return utc_date_str

# Apply the conversion function to all values in the date column
df['date'] = df['date'].apply(convert_to_utc)

print(str(len(df)) + ' rows')

ValueError: time data '10/30/95' does not match format '%m/%d/%Y'

In [57]:
import ast

# Initialize the Qdrant client
qdrant_client = QdrantClient(':memory:')

# Set the collection name and size
collection_name = 'movies'
vector_size = len(ast.literal_eval(df['metadata_vector'][0]))  # Convert string to list and get its length

# Create a collection
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        'metadata': rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

# Add vectors to the collection
qdrant_client.upsert(
    collection_name=collection_name,
    points=[
        rest.PointStruct(
            id=k,
            vector={
                'metadata': ast.literal_eval(v['metadata_vector']),  # Convert string to list
            },
            payload=v.to_dict(),
        )
        for k, v in df.iterrows()
    ],
)

print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

collections=[CollectionDescription(name='movies')]


CountResult(count=40179)

In [58]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=20):
    # Creates embedding vector from user query
    completion = openai.embeddings.create(
        input=query,
        model='text-embedding-3-small'  # Be sure to use the same embedding model as the vectors in the collection
    )

    embedded_query = completion.data[0].embedding

    query_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=(
            vector_name, embedded_query
        ),
        limit=top_k,
    )
    
    return query_results

In [84]:
# Filter by conditions in Qdrant
def filter_qdrant(adult, date, collection_name, vector_name, top_k=5):
    query_results = qdrant_client.scroll(
        collection_name=collection_name,
        scroll_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="date",
                    range=models.DatetimeRange(
                        gt=convert_to_utc(date), # greater than
                        gte=None, # greater than or equal
                        lt=None, # less than
                        lte=None, # less than or equal
                    ),
                ),
                models.FieldCondition(
                    key="adult",
                    match=models.MatchValue(value=adult),
                ),
            ]
        ),
    )
    return query_results

In [85]:
query_results = filter_qdrant(adult=False, date='01/01/2000', collection_name=collection_name, vector_name='metadata')

# print(query_results)

for i, vector in enumerate(query_results[0]):
    print(f"{i + 1}. {vector.payload['title']} ")

In [61]:
# Search for similar vectors and store a result
query_results = query_qdrant('consultant', collection_name=collection_name, vector_name='metadata')

for i, vector in enumerate(query_results):
    print(f"{i + 1}. {vector.payload['title']} {vector.payload['poster_link']} (Score: {round(vector.score, 3)})")

1. The Specialist (1994) https://images-na.ssl-images-amazon.com/images/M/MV5BYjMwZDMwZTItMTc2MC00NDRlLWI3YmUtNTg0ZmQ3MzdhNDJmXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_UY268_CR3,0,182,268_AL_.jpg (Score: 0.24)
2. Temptation: Confessions of a Marriage Counselor (2013) https://images-na.ssl-images-amazon.com/images/M/MV5BMTg3MzExNjU1N15BMl5BanBnXkFtZTcwMzk0ODU5OA@@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.22)
3. Advise & Consent (1962) https://images-na.ssl-images-amazon.com/images/M/MV5BMTcxMjU3OTA3NV5BMl5BanBnXkFtZTgwMTUxNTIxMDE@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.218)
4. The Expert (1995) https://images-na.ssl-images-amazon.com/images/M/MV5BMjA4MTQ5NDc5MV5BMl5BanBnXkFtZTcwNjQ2MzgxMQ@@._V1_UY268_CR8,0,182,268_AL_.jpg (Score: 0.216)
5. The Counselor (2013) https://images-na.ssl-images-amazon.com/images/M/MV5BMTc3ODk0MTY0N15BMl5BanBnXkFtZTgwOTU2MTEzMDE@._V1_UX182_CR0,0,182,268_AL_.jpg (Score: 0.214)
6. Guru (2007) https://images-na.ssl-images-amazon.com/images/M/MV5BMjAzMTkyMTQ0MF5BMl5Ban

In [62]:
# Format the response as JSON
import json
from datetime import datetime
import locale

# Assume year in 2000's if between 00 to 24, otherwise, in 1900's
def convert_date_format(date_str):
    month, day, year = map(int, date_str.split('/'))
    
    if year >= 0 and year <= 24:
        year += 2000
    else:
        year += 1900

    date_obj = datetime(year, month, day)
    formatted_date = date_obj.strftime('%m/%d/%Y')
    
    return formatted_date

def string_to_array(str):
    arr = str.split(',')
    arr = [arr.strip() for arr in arr]

    return arr

def format_time(minutes_float):
    minutes_int = int(minutes_float)

    hours = minutes_int // 60
    minutes = minutes_int % 60
    
    if hours > 0:
        time_string = f"{hours}h {minutes}m"
    else:
        time_string = f"{minutes}m"
    
    return time_string

def format_as_dollars(number):
    locale.setlocale(locale.LC_ALL, '')

    number = int(number)
    formatted_number = locale.currency(number, grouping=True)
    formatted_number = formatted_number.replace(locale.localeconv()['currency_symbol'], "$")

    return formatted_number

def get_country_name(abbreviation):
    country_names = {'en': 'English', 'fr': 'French', 'zh': 'Chinese', 'it': 'Italian', 'fa': 'Persian', 'nl': 'Dutch', 'de': 'German', 'cn': 'Chinese', 'ar': 'Arabic', 'es': 'Spanish', 'ru': 'Russian', 'sv': 'Swedish', 'ja': 'Japanese', 'ko': 'Korean', 'sr': 'Serbian', 'bn': 'Bengali', 'he': 'Hebrew', 'pt': 'Portuguese', 'wo': 'Wolof', 'ro': 'Romanian', 'hu': 'Hungarian', 'cy': 'Welsh', 'vi': 'Vietnamese'}
    return country_names.get(abbreviation, '')

# Function to search for similar vectors
def search_movies_in_qdrant(query):
    query_results = query_qdrant(query, collection_name, 'metadata')

    results = []
    
    for i, vector in enumerate(query_results):
        tmp = {
            "rank": i,
            "title": vector.payload["title"],
            "summary": vector.payload["summary"],
            "date": convert_date_format(vector.payload["date"]),
            "genres": string_to_array(vector.payload["genres"]),
            "runtime": format_time(vector.payload["runtime"]),
            "rating": vector.payload["rating"],
            "votes": int(vector.payload["votes"]),
            "budget": format_as_dollars(vector.payload["budget"]),
            "revenue": format_as_dollars(vector.payload["revenue"]),
            "language": get_country_name(vector.payload["language"]),
            "adult": vector.payload["adult"],
            "production": string_to_array(vector.payload["production"]),
            "poster_link": vector.payload["poster_link"]
        }
        results.append(tmp)

    return results

In [63]:
query = 'scary movies about monsters after year 2003'
response = (search_movies_in_qdrant(query))

json_string = json.dumps(response, indent=2)
print(json_string)

[
  {
    "rank": 0,
    "title": "Monster (2008)",
    "summary": "Two women, aspiring documentary filmmakers, find themselves trapped in a monster-plagued Toyko in 2003.",
    "date": "01/18/2008",
    "genres": [
      "Action",
      "Horror",
      "Thriller"
    ],
    "runtime": "1h 26m",
    "rating": 2.1,
    "votes": 7,
    "budget": "$0.00",
    "revenue": "$0.00",
    "language": "English",
    "adult": false,
    "production": [
      "Asylum",
      "The"
    ],
    "poster_link": "https://images-na.ssl-images-amazon.com/images/M/MV5BMTQ2NjIxNTg5N15BMl5BanBnXkFtZTcwNzE0NzAzOA@@._V1_UY268_CR1,0,182,268_AL_.jpg"
  },
  {
    "rank": 1,
    "title": "Monster House (2006)",
    "summary": "Monsters under the bed are scary enough, but what happens when an entire house is out to get you? Three teens aim to find out when they go up against a decrepit neighboring home and unlock its frightening secrets.",
    "date": "07/21/2006",
    "genres": [
      "Animation",
      "Adventu