In [1]:
from openai import OpenAI
import openai
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment,
# then build the OpenAI client from the API key found there.
load_dotenv(find_dotenv())
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [2]:
from qdrant_client import models, QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import Record

# Connect to the Qdrant cloud using the URL and API key from the environment
qdrant_client = QdrantClient(
    url=os.environ.get('QDRANT_URL'),
    api_key=os.environ.get('QDRANT_API_KEY'),
)

# Collection and named vector used by the search cells below
collection_name, vector_name = 'movies', 'metadata'

# Sanity check: print the available collections, then display the point count
# of the target collection as the cell's output.
print(qdrant_client.get_collections())
qdrant_client.count(collection_name=collection_name)

collections=[CollectionDescription(name='songs'), CollectionDescription(name='movies')]


CountResult(count=10637)

In [3]:
# Generate a query embedding and search in Qdrant
def query_qdrant(query, collection_name, vector_name, top_k=5):
    """Embed ``query`` with OpenAI and return its nearest neighbours from Qdrant.

    Args:
        query: Free-text search string to embed.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Named vector inside the collection to compare against.
        top_k: Maximum number of hits to return (default 5).

    Returns:
        Whatever ``qdrant_client.search`` returns for the embedded query.
    """
    # Use the explicitly configured `client` (the same one the chat calls use)
    # instead of the module-level `openai` default client, so every OpenAI
    # request in this notebook shares one configuration.
    embedding_response = client.embeddings.create(
        input=query,
        model='text-embedding-3-small',  # Be sure to use the same embedding model as the vectors in the collection
    )
    embedded_query = embedding_response.data[0].embedding

    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [6]:
# Search for similar vectors and store a result
query_results = query_qdrant(
    "Godzilla x Kong: The New Empire | Two ancient titans, Godzilla and Kong, clash in an epic battle as humans unravel their intertwined origins and connection to Skull Island's mysteries.",
    collection_name=collection_name,
    vector_name=vector_name,
)

# One line per hit: rank, title, poster URL and similarity score (3 decimals)
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['img']} (Score: {round(hit.score, 3)})")

1. Godzilla x Kong: The New Empire https://m.media-amazon.com/images/M/MV5BY2QwOGE2NGQtMWQwNi00M2IzLThlNWItYWMzNGQ5YWNiZDA4XkEyXkFqcGdeQXVyNTE1NjY5Mg@@.jpg (Score: 1.0)
2. Godzilla vs. Kong https://m.media-amazon.com/images/M/MV5BOTI3NzNhNDItYWY0My00ZmE3LTgxMmYtYzQzZWU2NDcxYzVjXkEyXkFqcGdeQXVyODE5NzE3OTE@.jpg (Score: 0.753)
3. Kong: Skull Island https://m.media-amazon.com/images/M/MV5BMWUxNjZiNTAtZmMwMi00MmYxLTkxZGEtZWE1MzU0OWFjOTNhXkEyXkFqcGdeQXVyNzk1MzI3MTI@.jpg (Score: 0.685)
4. King Kong vs. Godzilla https://m.media-amazon.com/images/M/MV5BNGExNmFhMDItZDBiZC00MWUyLWEwYjEtYTgyYzIzNTQyZTYwXkEyXkFqcGdeQXVyMTIyNjQ1OTMy._V1_QL75_UX140_CR0,2,140,207_.jpg@.jpg (Score: 0.665)
5. Godzilla: King of the Monsters https://m.media-amazon.com/images/M/MV5BOGFjYWNkMTMtMTg1ZC00Y2I4LTg0ZTYtN2ZlMzI4MGQwNzg4XkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg (Score: 0.634)


In [26]:
import json

def parse_user_query(user_query):
    """Ask the chat model to turn a free-text movie request into structured filters.

    The prompt embeds ``user_query`` together with the allowed genre and
    language lists and asks for a JSON object with ``date``, ``genres``,
    ``runtime``, ``rating``, ``budget``, ``revenue``, ``sentiment`` and
    ``language`` entries.

    Args:
        user_query: The user's natural-language search request.

    Returns:
        The model's reply decoded with ``json.loads``.
    """
    # Values offered to the model for the "language" field.
    languages = ['English', 'French', 'Chinese', 'Italian', 'Persian', 'Dutch', 'German', 'Arabic', 'Spanish', 'Russian', 'Swedish', 'Japanese', 'Korean', 'Serbian', 'Bengali', 'Hebrew', 'Portuguese', 'Wolof', 'Romanian', 'Hungarian', 'Welsh', 'Vietnamese']

    # Values offered to the model for the "genres" field.
    genres = ['Animation', 'Action', 'Comedy', 'Biography', 'Crime', 'Drama', 'Adventure', 'Fantasy', 'Mystery', 'Sci-Fi', 'Documentary', 'Horror', 'Family', 'Romance', 'Film-Noir', 'Western', 'Musical', 'Thriller', 'War', 'Short', 'Music']

    prompt_template = f"""
        Your task is to parse the following query "{user_query}" provided by a user and generate JSON output according to the template provided later.\n
        The explanation of each value in the JSON template is as follows:\n
        "date": "This field represents the release year, month, and/or date of the movie. You also need to provide the condition to indicate if user wants to query a movie before, after, or between specific range of date(s). If any of these values are provided in the query, format it as follows: 2005-02-08T10:49:00Z. If not specified, leave empty.",
        "genres": "This field represents the genre of the movie. If user mention one the following genres in the list '{genres}', add it as a value. If not specified, please leave empty.",
        "runtime": "The runtime should be an integer value representing the duration of the movie in minutes. You also need to provide the condition to indicate if user wants to query a movie greater than, less than, or between specific range of time or timeframe. If not specified, leave empty.",
        "rating": "The rating should be an integer value representing the movie's rating. The rating should be between 1 to 9, but when user says high rating or higher rating, it means rating is greater than 8. You also need to provide the condition to indicate if user wants to query a movie greater than, less than, or between specific range of rating(s). If not specified, leave empty.",
        "budget": "The budget should be an integer value representing the movie's budget. You also need to provide the condition to indicate if user wants to query a movie greater than, less than, or between specific range of budget. If not specified, leave empty.",
        "revenue": "The revenue should be an integer value representing the movie's revenue. You also need to provide the condition to indicate if user wants to query a movie greater than, less than, or between specific range of revenue. If not specified, leave empty.",
        "sentiment": "Please analyze the sentiment of the movie. If the user mentions terms like 'sad' or 'bad', consider it as 'negative'. If terms like 'happy' or 'good' are mentioned, consider it as 'positive'. If sentiment is not specified in the user query, leave it empty.",
        "language": "The language should be from the following list: {languages}. If not specified, leave empty."\n
        Below is the JSON tempalte:
        {{
            "date": {{
                "value_1": "2005-02-08T10:49:00Z",
                "value_2": "2005-02-08T10:49:00Z (only fill in this value when the condition is between)",
                "condition": "one of the following: before, after, between"
            }},
            "genres":  "one of the values from the following list '{genres}'. If not specified, leave empty.",
            "runtime": {{
                "value_1": 180,
                "value_2": null (only fill in this value when the condition is between),
                "condition": "one of the following: greater_than, less_than, between. If not specified, leave empty."
            }},
            "rating": {{
                "value_1": null,
                "value_2": null (only fill in this value when the condition is between),
                "condition": "one of the following: greater_than, less_than, between. If not specified, leave empty."
            }},
            "budget": {{
                "value_1": 3500000,
                "value_2": 6500000 (only fill in this value when the condition is between),
                "condition": "one of the following: greater_than, less_than, between. If not specified, leave empty."
            }},
            "revenue": {{
                "value_1": 3500000,
                "value_2": 6500000 (only fill in this value when the condition is between),
                "condition": "one of the following: greater_than, less_than, between. If not specified, leave empty."
            }},
            "sentiment":  "If the user mentions terms like 'sad' or 'bad', consider it as 'negative'. If terms like 'happy' or 'good' are mentioned, consider it as 'positive'. If sentiment is not specified in the user query, leave it empty.",
            "language":  "one of the values from the following list '{languages}'. If not specified, leave empty."
        }}
    """

    # Single non-streamed request; `response_format` asks for a JSON-object reply.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant designed to output only in JSON format. No other text or explanation.",
            },
            {"role": "user", "content": prompt_template},
        ],
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "json_object"},
    )

    reply_text = response.choices[0].message.content
    return json.loads(reply_text)

In [27]:
# Minimum number of votes a movie needs whenever a rating filter is applied.
_MIN_VOTES_FOR_RATING = 2000


def _is_blank(value):
    """Return True when a parsed field was left empty (None or a blank string)."""
    return value is None or (isinstance(value, str) and value.strip() == '')


def _range_condition(key, spec, range_cls, lower_keyword, upper_keyword):
    """Turn one parsed ``{value_1, value_2, condition}`` entry into a range FieldCondition, or None."""
    if not isinstance(spec, dict):
        return None

    condition = spec.get('condition')
    first, second = spec.get('value_1'), spec.get('value_2')
    # A condition without a usable first bound cannot be expressed as a range.
    if _is_blank(condition) or _is_blank(first):
        return None
    if isinstance(condition, str):
        condition = condition.strip().lower()

    if condition == lower_keyword:
        bounds = {'gt': first}
    elif condition == upper_keyword:
        bounds = {'lt': first}
    elif condition == 'between' and not _is_blank(second):
        bounds = {'gt': first, 'lt': second}
    else:
        # Unknown keyword, or "between" with no upper bound: skip the filter.
        return None

    return models.FieldCondition(key=key, range=range_cls(**bounds))


def _match_condition(key, value):
    """Build an exact-match FieldCondition for ``value``, or None when it is empty."""
    if isinstance(value, (list, tuple)):
        # The model sometimes answers with a list; match any of the given values.
        choices = [item for item in value if not _is_blank(item)]
        if not choices:
            return None
        return models.FieldCondition(key=key, match=models.MatchAny(any=choices))
    if _is_blank(value):
        return None
    return models.FieldCondition(key=key, match=models.MatchValue(value=value))


def create_filter(user_query):
    """Parse ``user_query`` with the LLM and translate the result into Qdrant filter conditions.

    The parsed JSON is printed (prefixed by ``Filters:``) and then mapped to
    payload conditions in this order: ``date``, ``runtime``, ``votes`` plus
    ``rating``, ``budget``, ``revenue``, ``genres``, ``sentiment_score`` and
    ``language``.

    Entries the model omitted, left empty, or filled inconsistently (a
    condition without a value, ``between`` without a second value, an unknown
    condition keyword) are skipped instead of raising or producing an empty
    range. The vote-count floor is only added when a rating range is applied.

    Args:
        user_query: The user's natural-language search request.

    Returns:
        list: ``models.FieldCondition`` objects, intended for a filter's
        ``must`` clause. Empty when nothing could be extracted.
    """
    data = parse_user_query(user_query)

    print('Filters:')
    print(json.dumps(data))

    filter_conditions = []

    for key, range_cls, lower_keyword, upper_keyword in (
        ('date', models.DatetimeRange, 'after', 'before'),
        ('runtime', models.Range, 'greater_than', 'less_than'),
        ('rating', models.Range, 'greater_than', 'less_than'),
        ('budget', models.Range, 'greater_than', 'less_than'),
        ('revenue', models.Range, 'greater_than', 'less_than'),
    ):
        condition = _range_condition(key, data.get(key), range_cls, lower_keyword, upper_keyword)
        if condition is None:
            continue
        if key == 'rating':
            # Ratings backed by very few votes are unreliable, so require a
            # minimum vote count alongside any rating filter.
            filter_conditions.append(models.FieldCondition(
                key="votes",
                range=models.Range(gt=_MIN_VOTES_FOR_RATING),
            ))
        filter_conditions.append(condition)

    genres_condition = _match_condition('genres', data.get('genres'))
    if genres_condition is not None:
        filter_conditions.append(genres_condition)

    # Sentiment maps onto the sign of the stored sentiment score.
    sentiment = data.get('sentiment')
    if isinstance(sentiment, str):
        sentiment = sentiment.strip().lower()
    if sentiment == 'positive':
        filter_conditions.append(models.FieldCondition(
            key="sentiment_score",
            range=models.Range(gt=0),
        ))
    elif sentiment == 'negative':
        filter_conditions.append(models.FieldCondition(
            key="sentiment_score",
            range=models.Range(lt=0),
        ))

    language_condition = _match_condition('language', data.get('language'))
    if language_condition is not None:
        filter_conditions.append(language_condition)

    return filter_conditions

In [28]:
def search_filtered_vector(user_query, collection_name, vector_name, top_k=5):
    """Run a vector search for ``user_query`` restricted by filters parsed from the same query.

    Args:
        user_query: The user's natural-language search request. It is used both
            to derive payload filters (via ``create_filter``) and as the text
            that gets embedded.
        collection_name: Name of the Qdrant collection to search.
        vector_name: Named vector inside the collection to compare against.
        top_k: Maximum number of hits to return (default 5).

    Returns:
        Whatever ``qdrant_client.search`` returns for the filtered query.
    """
    filter_conditions = create_filter(user_query)

    # Use the explicitly configured `client` (the same one the chat calls use)
    # instead of the module-level `openai` default client, so every OpenAI
    # request in this notebook shares one configuration.
    embedding_response = client.embeddings.create(
        input=user_query,
        model='text-embedding-3-small',  # Be sure to use the same embedding model as the vectors in the collection
    )
    embedded_query = embedding_response.data[0].embedding

    return qdrant_client.search(
        collection_name=collection_name,
        # Every parsed condition must hold for a point to be returned.
        query_filter=models.Filter(must=filter_conditions),
        search_params=models.SearchParams(hnsw_ef=128, exact=False),
        query_vector=(vector_name, embedded_query),
        limit=top_k,
    )

In [33]:
query_results = search_filtered_vector(
    user_query='movie about eugenics after 1995 that made more than $5000',
    collection_name=collection_name,
    vector_name=vector_name,
)

# One line per hit: rank, title and vote count
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['votes']} ")

Filters:
{"date": {"value_1": "1996-01-01T00:00:00Z", "value_2": "", "condition": "after"}, "genres": "", "runtime": {"value_1": null, "value_2": null, "condition": ""}, "rating": {"value_1": null, "value_2": null, "condition": ""}, "budget": {"value_1": 5000, "value_2": null, "condition": "greater_than"}, "revenue": {"value_1": 5000, "value_2": null, "condition": "greater_than"}, "sentiment": "", "language": ""}
1. Gattaca (1997) 1846 
2. Idiocracy (2006) 681 
3. Ultraviolet (2006) 468 
4. Elysium (2013) 3510 
5. Equilibrium (2002) 1584 


In [34]:
query_results = search_filtered_vector(
    user_query='godfather',
    collection_name=collection_name,
    vector_name='metadata',
)

# One line per hit: rank and title
for rank, hit in enumerate(query_results, start=1):
    print(f"{rank}. {hit.payload['title']} ")

Filters:
{"date": {"value_1": "", "value_2": "", "condition": ""}, "genres": "", "runtime": {"value_1": null, "value_2": null, "condition": ""}, "rating": {"value_1": null, "value_2": null, "condition": ""}, "budget": {"value_1": "", "value_2": "", "condition": ""}, "revenue": {"value_1": "", "value_2": "", "condition": ""}, "sentiment": "", "language": ""}
1. The Godfather (1972) 
2. The Godfather: Part III (1990) 
3. Mafia! (1998) 
4. The Godfather: Part II (1974) 
5. Goodfellas (1990) 


In [35]:
query_results = search_filtered_vector(
    user_query='sad movie about a couple break up from relationship',
    collection_name=collection_name,
    vector_name='metadata',
)

# One line per hit: rank, title, sentiment score and summary
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['sentiment_score']} {payload['summary']} ")

Filters:
{"date": {"value_1": "", "value_2": "", "condition": ""}, "genres": "", "runtime": {"value_1": "", "value_2": null, "condition": ""}, "rating": {"value_1": "", "value_2": null, "condition": ""}, "budget": {"value_1": "", "value_2": "", "condition": ""}, "revenue": {"value_1": "", "value_2": "", "condition": ""}, "sentiment": "negative", "language": ""}
1. Wristcutters: A Love Story (2006) -0.9312 Zia, distraught over breaking up with his girlfriend, decides to end it all. Unfortunately, he discovers that there is no real ending, only a run-down afterlife that is strikingly similar to his old one, just a bit worse. Discovering that his ex-girlfriend has also "offed" herself, he sets out on a road trip, with his Russian rocker friend, to find her. Their journey takes them through an absurd purgatory where they discover that being dead doesn't mean you have to stop livin'! 
2. Two Lovers (2008) -0.7906 A depressed man moves back in with his parents following a recent heartbreak. 

In [38]:
query_results = search_filtered_vector(
    user_query='happy movie about a couple break up from relationship',
    collection_name=collection_name,
    vector_name='metadata',
)

# One line per hit: rank, title, sentiment score and summary
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['sentiment_score']} {payload['summary']} ")

Filters:
{"date": {"value_1": "", "value_2": "", "condition": ""}, "genres": "", "runtime": {"value_1": "", "value_2": null, "condition": ""}, "rating": {"value_1": "", "value_2": null, "condition": ""}, "budget": {"value_1": "", "value_2": "", "condition": ""}, "revenue": {"value_1": "", "value_2": "", "condition": ""}, "sentiment": "positive", "language": ""}
1. Happythankyoumoreplease (2010) 0.9643 Captures a generational moment - young people on the cusp of truly growing up, tiring of their reflexive cynicism, each in their own ways struggling to connect and define what it means to love and be loved. Six New Yorkers juggle love, friendship, and the keenly challenging specter of adulthood. Sam Wexler is a struggling writer who's having a particularly bad day. When a young boy gets separated from his family on the subway, Sam makes the questionable decision to bring the child back to his apartment and thus begins a rewarding, yet complicated, friendship. Sam's life revolves around hi

In [10]:
# Format the response as json
import json
from datetime import datetime
import locale
import ast

def format_time_to_minutes(minutes_float):
    """Render a duration given in minutes as ``"<H>h <M>m"``, or ``"<M>m"`` when under an hour.

    Args:
        minutes_float: Duration in minutes; it is truncated with ``int()``
            before formatting.

    Returns:
        str: The formatted duration.
    """
    hours, minutes = divmod(int(minutes_float), 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"

def format_as_dollars(number):
    """Format a number as a US-dollar string such as ``"$3,500,000.00"``.

    The value is truncated to whole dollars with ``int()`` and rendered with
    comma thousands separators and two decimals. Formatting is done directly
    rather than through ``locale.currency``: that call depends on the host
    locale (it raises ``ValueError`` under the C/POSIX locale and uses other
    separators elsewhere) and required resetting the process-wide locale on
    every call.

    Args:
        number: Amount to format; anything ``int()`` accepts.

    Returns:
        str: The formatted amount; negative amounts are written ``-$1,234.00``.
    """
    whole_dollars = int(number)
    sign = "-" if whole_dollars < 0 else ""
    return f"{sign}${abs(whole_dollars):,.2f}"

def convert_utc_to_mm_dd_yyyy(utc_datetime_str):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp string to ``MM-DD-YYYY``.

    Args:
        utc_datetime_str: Timestamp text in exactly the
            ``%Y-%m-%dT%H:%M:%SZ`` layout.

    Returns:
        str: The date part formatted as ``MM-DD-YYYY``.

    Raises:
        ValueError: If the input does not match the expected layout.
    """
    input_layout, output_layout = "%Y-%m-%dT%H:%M:%SZ", "%m-%d-%Y"
    return datetime.strptime(utc_datetime_str, input_layout).strftime(output_layout)

In [13]:
# Function to search for similar vectors
def search_movies_in_qdrant(user_query):
    """Search the movie collection for ``user_query`` and return display-ready records.

    Runs ``search_filtered_vector`` against the notebook-level
    ``collection_name`` / ``vector_name`` and reshapes each hit's payload.

    Args:
        user_query: The user's natural-language search request.

    Returns:
        list[dict]: One dict per hit, in result order, with a zero-based
        ``rank`` plus formatted date, runtime, budget and revenue fields.
    """
    hits = search_filtered_vector(user_query, collection_name, vector_name)

    def to_record(position, payload):
        # Reshape a raw payload into the fields shown to the user.
        return {
            "rank": position,
            "title": payload["title"],
            "summary": payload["summary"],
            "date": convert_utc_to_mm_dd_yyyy(payload["date"]),  # shown as MM-DD-YYYY
            "genres": payload["genres"],
            "runtime": format_time_to_minutes(payload["runtime"]),
            "rating": payload["rating"],
            "votes": int(payload["votes"]),
            "budget": format_as_dollars(payload["budget"]),
            "revenue": format_as_dollars(payload["revenue"]),
            "language": payload["language"],
            "production": payload["production"],
            "poster_link": payload["poster_link"],
        }

    return [to_record(position, hit.payload) for position, hit in enumerate(hits)]

In [15]:
# Run one end-to-end search and pretty-print the formatted records as JSON
query = 'movie about eugenics produced after 2000'
response = search_movies_in_qdrant(query)

json_string = json.dumps(
    response,
    indent=2,
)
print(json_string)

Filters:
{"date": {"value_1": "2000-01-01T00:00:00Z", "value_2": null, "condition": "after"}, "runtime": {"value_1": null, "value_2": null, "condition": ""}, "rating": {"value_1": null, "value_2": null, "condition": ""}, "budget": {"value_1": null, "value_2": null, "condition": ""}, "revenue": {"value_1": null, "value_2": null, "condition": ""}, "language": ""}
[
  {
    "rank": 0,
    "title": "Ultraviolet (2006)",
    "summary": "In the late 21st century, a subculture of humans have emerged who have been modified genetically by a vampire-like disease, giving them enhanced speed, incredible stamina and acute intelligence. As they are set apart from \"normal\" and \"healthy\" humans, the world is pushed to the brink of worldwide civil war  aimed at the destruction of the \"diseased\" population. In the middle of this crossed-fire is - an infected woman - Ultraviolet, who finds herself protecting a nine-year-old boy who has been marked for death by the human government as he is believed

In [43]:
query_results = search_filtered_vector(
    user_query='women empowerment after 2005',
    collection_name=collection_name,
    vector_name='metadata',
)

# One line per hit: rank, title, sentiment score and summary
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['sentiment_score']} {payload['summary']} ")

Filters:
{"date": {"value_1": "2005-01-01T00:00:00Z", "value_2": null, "condition": "after"}, "genres": "", "runtime": {"value_1": null, "value_2": null, "condition": ""}, "rating": {"value_1": null, "value_2": null, "condition": ""}, "budget": {"value_1": null, "value_2": null, "condition": ""}, "revenue": {"value_1": null, "value_2": null, "condition": ""}, "sentiment": "", "language": ""}
1. Suffragette (2015) -0.0516 Based on true events about the foot soldiers of the early feminist movement who were forced underground to evade the State. 
2. The Women (2008) 0.1779 The story centers on a group of gossipy, high-society women who spend their days at the beauty salon and haunting fashion shows. The sweet, happily-wedded Mary Haines finds her marriage in trouble when shop girl Crystal Allen gets her hooks into Mary's man. 
3. North Country (2005) 0.2568 A fictionalized account of the first major successful sexual harassment case in the United States -- Jenson vs. Eveleth Mines, where 

In [44]:
query_results = search_filtered_vector(
    user_query='evil desert wizard',
    collection_name=collection_name,
    vector_name='metadata',
)

# One line per hit: rank, title, sentiment score and summary
for rank, hit in enumerate(query_results, start=1):
    payload = hit.payload
    print(f"{rank}. {payload['title']} {payload['sentiment_score']} {payload['summary']} ")

Filters:
{"date": {"value_1": "", "value_2": "", "condition": ""}, "genres": "", "runtime": {"value_1": "", "value_2": null, "condition": ""}, "rating": {"value_1": null, "value_2": null, "condition": ""}, "budget": {"value_1": "", "value_2": "", "condition": ""}, "revenue": {"value_1": "", "value_2": "", "condition": ""}, "sentiment": "", "language": ""}
1. Wishmaster 2: Evil Never Dies (1999) -0.9042 During a failed art heist, the Djinn is once again liberated. This time, to complete the 1001 wishes that he needs before the final 3, he lets himself go to prison, where he starts his evil reign twisting the hopes of the prisoners. Meanwhile, the woman who set him free accidentally, Morgana, tries to find a way to stop him, aided by a young priest. 
2. The Devil's Tomb (2009) -0.9628 Captain Mack leads an elite military unit on a search for a missing scientist, and comes face-to-face with an an ancient evil lying beneath the Middle Eastern desert. Evil that is not of this world. Evil th