In [177]:
# !pip install sentence-transformers
# !pip install chromadb


In [122]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

In [97]:
# Load the movie table from 'imdb_top_1000.csv' and the sentence-embedding model
# that the later cells use to encode text columns.
movie_df = pd.read_csv(filepath_or_buffer = 'imdb_top_1000.csv')
model = SentenceTransformer('all-MiniLM-L6-v2')



In [98]:
# Sanity check on the loaded table: print the row count, then display (rows, columns).
print('Movie Count:', len(movie_df))
movie_df.shape

Movie Count: 1000


(1000, 16)

In [184]:
# Encode every movie overview and store one embedding vector per row.
descriptions = movie_df['Overview'].tolist()
embeddings = model.encode(descriptions)
# list(...) splits the encoded batch into one entry per row for the new column.
movie_df['Overview_embeddings'] = list(embeddings)
# Only the last expression of a cell is displayed, so the former bare
# `embeddings.shape` line was a no-op and has been removed.
movie_df.shape

(1000, 20)

In [101]:
# Take the first 20 movies, label each as "<title>-<genre>", and reduce their
# overview embeddings to two PCA components for plotting.
movies_to_visualize = movie_df.head(20)
titles = (movies_to_visualize['Series_Title'] + "-" + movies_to_visualize['Genre']).tolist()
embeddings = movies_to_visualize['Overview_embeddings'].tolist()
pca = PCA(n_components = 2)
embeddings_2d = pca.fit_transform(embeddings)

In [179]:
# Disabled cell: scatter plot of `embeddings_2d`, one point per movie, each
# annotated with its entry from `titles`.
# plt.figure(figsize = (10,8))
# for i, (embedding, title) in enumerate(zip(embeddings_2d, titles)):
#     x,y = embedding
#     plt.scatter(x,y)
#     plt.text(x, y, title, fontsize=9)
    
# plt.title('2D Visualization of Movie Description Embeddings')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.grid(True)
# plt.show()

In [111]:
# Encode the raw genre strings, store one vector per row, then stage the first
# 20 movies (labels + genre vectors) for the PCA cell that follows.
genres = list(movie_df['Genre'])
embeddings = model.encode(genres)
movie_df['genre_embeddings'] = [vector for vector in embeddings]
movies_to_visualize = movie_df.head(20)
titles = (movies_to_visualize['Series_Title'] + "-" + movies_to_visualize['Genre']).tolist()
embeddings = movies_to_visualize['genre_embeddings'].tolist()

In [117]:
# Fit a fresh 2-component PCA on whatever the shared `embeddings` variable
# currently holds and project it to 2-D.
# NOTE(review): the result depends on which cell last assigned `embeddings`;
# in listed order that is the genre-embedding cell — confirm on a fresh run.
pca = PCA(n_components = 2)
embeddings_2d = pca.fit_transform(embeddings)

In [190]:
# Disabled cell: scatter plot of `embeddings_2d`, one point per movie, each
# annotated with its entry from `titles`.
# NOTE(review): the title below still reads 'Movie Description Embeddings', but in
# listed cell order `embeddings_2d` here is built from the genre embeddings —
# confirm and retitle before re-enabling.
# plt.figure(figsize = (10,8))
# for i, (embedding, title) in enumerate(zip(embeddings_2d, titles)):
#     x,y = embedding
#     plt.scatter(x,y)
#     plt.text(x, y, title, fontsize=9)
    
# plt.title('2D Visualization of Movie Description Embeddings')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.grid(True)
# plt.show()

### Build a simple search

In [124]:
def find_top_n_matches(query, df, embedding_field, n = 5, encoder = None):
    """
    Find the top N rows of df whose stored embedding is most similar to the query.

    The query is embedded and compared against every vector in
    ``df[embedding_field]`` using cosine similarity
    (``1 - scipy.spatial.distance.cosine``). The ``n`` highest-scoring rows are
    returned, best match first. ``df`` itself is never modified: the scores are
    attached to a temporary copy only.

    :param query: The search query string.
    :param df: The DataFrame holding one embedding vector per row.
    :param embedding_field: Name of the column in df that holds the embeddings to
        compare against (e.g. 'Overview_embeddings' or 'genre_embeddings').
    :param n: Number of top matches to return.
    :param encoder: Optional object exposing ``encode(list_of_str)``, used to embed
        the query. When omitted, the notebook-level ``model`` is used.
    :return: A DataFrame with the top N matches; it carries no 'similarity' column.
    """
    # Fall back to the notebook's global model only when no encoder is supplied.
    active_encoder = model if encoder is None else encoder

    # Calculate the embedding for the query
    query_embedding = active_encoder.encode([query])[0]

    # Calculate the similarity with every stored embedding (kept as a standalone
    # Series so the caller's DataFrame does not gain a column as a side effect)
    similarity = df[embedding_field].apply(lambda emb: 1 - cosine(query_embedding, emb))

    # assign() returns a new frame, so sorting happens on a copy of df
    ranked = df.assign(similarity = similarity).sort_values(by = 'similarity', ascending = False)

    # Drop the similarity column for the final output
    return ranked.head(n).drop(columns = ['similarity'])

In [188]:
# Search the genre-only embeddings and display the single best match.
example_query = 'happy'
find_top_n_matches(
    query = example_query,
    df = movie_df,
    embedding_field = 'genre_embeddings',
    n = 1,
)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Overview_embeddings,genre_embeddings,compound_embedding
750,https://m.media-amazon.com/images/M/MV5BNGQwZj...,The Hangover,2009,UA,100 min,Comedy,7.7,Three buddies wake up from a bachelor party in...,73.0,Todd Phillips,Zach Galifianakis,Bradley Cooper,Justin Bartha,Ed Helms,717559,277322503,"[0.02279045, 0.00785676, -0.029767714, 0.02614...","[-0.042611323, -0.04620106, -0.034571048, 0.01...","[0.0052326643, -0.03675479, -0.058400933, 0.00..."


### Combine Genre and Description Embeddings

In [159]:
# Encode "<Genre>,<Overview>" for every movie so a single vector reflects both
# fields, then store one vector per row.
embeddings = model.encode(
    list(movie_df['Genre'] + ',' + movie_df['Overview'])
)
movie_df['compound_embedding'] = [vector for vector in embeddings]

In [186]:
# Run the same query against the combined genre + overview embeddings and
# display the single best match.
example_query = 'happy'
find_top_n_matches(
    query = example_query,
    df = movie_df,
    embedding_field = 'compound_embedding',
    n = 1,
)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Overview_embeddings,genre_embeddings,compound_embedding
135,https://m.media-amazon.com/images/M/MV5BMWYwOT...,Klaus,2019,PG,96 min,"Animation, Adventure, Comedy",8.2,A simple act of kindness always sparks another...,65.0,Sergio Pablos,Carlos Martínez López,Jason Schwartzman,J.K. Simmons,Rashida Jones,104761,,"[-0.07809585, 0.061933506, 0.069953576, -0.009...","[-0.038672104, -0.0037281357, -0.017469369, 0....","[-0.065673426, 0.045164164, 0.046229813, -0.02..."


### Build the ChromaDB