In [3]:
import pandas as pd
from utils import process_list_columns

# Read the movie table (including descriptions) from disk.
data = pd.read_csv('dataset/movies_with_description.csv')

# These columns are handed to the project helper `process_list_columns`;
# per the original note it converts string-encoded lists into real lists.
data = process_list_columns(
    data,
    [
        'directors',
        'writers',
        'stars',
        'genres',
        'countries_origin',
        'Languages',
        'production_companies',
    ],
)

# Tally the missing values per column and show the totals.
null_counts = data.isna().sum()
print(null_counts)

id                           0
Title                        0
Movie Link                   0
Year                         0
Duration                   221
MPA                       7976
Rating                     138
Votes                      138
budget                   21785
grossWorldWide           15378
gross_US_Canada          16029
opening_weekend_Gross    18077
directors                    0
writers                      0
stars                        0
genres                       0
countries_origin             0
filming_locations         6729
production_companies         0
Languages                    0
wins                         0
nominations                  0
oscars                       0
description                401
dtype: int64


In [9]:
# Keep only the movies that have a description; a missing description
# suggests there is not much information about the title on IMDb.
data_all_with_descriptions = data.dropna(subset=['description'])

# Persist the filtered table, omitting the index column.
data_all_with_descriptions.to_csv(
    'dataset/movies_all_description.csv',
    index=False,
)

In [18]:
# HuggingFaceEmbeddings is imported from langchain_community (already used
# for Chroma below) instead of the deprecated `langchain.embeddings` path.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import torch

# Prefer Apple's Metal (MPS) backend when it is available; otherwise fall
# back to the CPU so this cell also runs on machines without MPS.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

# NOTE(review): the query has to be embedded with the same model that was
# used to populate the collection — the indexing code is not visible here,
# so confirm it also used all-mpnet-base-v2.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': device},
)

# Open the 'movie_descriptions' collection stored under ./chroma_db.
vector_store = Chroma(
    persist_directory="chroma_db",
    collection_name='movie_descriptions',
    embedding_function=embedding_model,
)

# Perform a similarity search: top 3 documents for the query, each paired
# with the score reported by the vector store.
query = "The biggest heist in history"
results = vector_store.similarity_search_with_score(query, k=3)

# Display the results
for rank, (document, distance) in enumerate(results, start=1):
    print(f"Result {rank}:")
    print(f"  Title: {document.metadata['Title']}")
    print(f"  Description: {document.page_content}")
    # NOTE(review): similarity_search_with_score returns a distance (lower
    # means more similar). `1 - distance` is only a true cosine similarity
    # if the collection was created with the cosine metric — TODO confirm
    # against the code that built the collection.
    print(f"  Similarity Score: {1 - distance}")
    print()

Result 1:
  Title: Vault
  Description: In 1975, a group of small time criminals attempt to pull off the biggest heist in American history: stealing over $30 million from the Mafia in the smallest state in the union, Rhode Island.
  Similarity Score: 0.4812847971916199

Result 2:
  Title: This Thing of Ours
  Description: Using the Internet and global satellites, a group of gangsters pull off the biggest bank heist in the Mafia's history.
  Similarity Score: 0.438146710395813

Result 3:
  Title: Masterminds
  Description: A guard at an armored car company in the Southern U.S. organizes one of the biggest bank heists in American history. Based on the October 1997 Loomis Fargo robbery.
  Similarity Score: 0.42707836627960205

