# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
# Display the directory the kernel is currently running in
os.getcwd()

'd:\\Projects_D\\Movie_Recommender\\notebooks'

In [2]:
# Move from the notebooks/ folder up to its parent so that relative paths
# such as "data/raw/..." resolve. The guard makes the cell idempotent:
# re-running it no longer climbs one directory higher on every execution.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
os.getcwd()

'd:\\Projects_D\\Movie_Recommender'

In [3]:
import torch
# Record the installed PyTorch build in the notebook output
print(torch.__version__)

2.0.1+cu117


In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


# Load dataframe

In [5]:
# Read the movies dataset into a DataFrame.
# The CSV is expected at "data/raw/movies.csv", relative to the working directory.
data_path = "data/raw/movies.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Point at the expected location, then let the original error propagate
    print(f"Error: Could not find file at {data_path}")
    raise
else:
    print(f"Successfully loaded {len(df)} movies from {data_path}")

Successfully loaded 8551 movies from data/raw/movies.csv


In [6]:
# Summarize columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8551 entries, 0 to 8550
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    8551 non-null   int64  
 1   id            8551 non-null   int64  
 2   title         8551 non-null   object 
 3   overview      8531 non-null   object 
 4   release_date  8551 non-null   object 
 5   popularity    8551 non-null   float64
 6   vote_average  8551 non-null   float64
 7   vote_count    8551 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 534.6+ KB


In [7]:
# (rows, columns) of the loaded frame
df.shape

(8551, 8)

In [8]:
# Count missing values per column
df.isna().sum()

Unnamed: 0       0
id               0
title            0
overview        20
release_date     0
popularity       0
vote_average     0
vote_count       0
dtype: int64

In [9]:
# Peek at the first rows whose overview is missing
missing_overview_mask = df["overview"].isna()
df[missing_overview_mask].head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
270,270,160885,Tel chi el telùn,,1999-05-12,4.733,8.0,272
394,394,564427,Minha Vida em Marte,,2018-12-27,5.243,7.9,244
2111,2111,56825,Classmates,,1988-12-12,5.395,7.2,246
3714,3714,53957,La matassa,,2009-03-13,5.562,6.7,381
4844,4844,26285,Fantozzi Still Suffers,,1983-01-01,7.195,6.4,343


In [10]:
# Drop rows with missing overviews, then renumber the index 0..n-1.
# The embeddings computed later are addressed by row position, so a gap-free
# index keeps df.loc / df.index labels in agreement with those positions.
df = df.dropna(subset=['overview']).reset_index(drop=True)
df.shape

(8531, 8)

# Cosine similarity


In [11]:
# Load a pre-trained SentenceTransformer model, selected by name
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# Encode every movie overview into an embedding vector
# (encoding displays a progress bar)
print("Computing embeddings for movie descriptions...")
overview_texts = df['overview'].tolist()
embeddings = model.encode(overview_texts, show_progress_bar=True)

Computing embeddings for movie descriptions...


Batches: 100%|██████████| 267/267 [00:17<00:00, 15.03it/s]


In [13]:
# Define a function to get the top-k most similar movies based on cosine similarity
def get_similar_movies(movie_index, embeddings, top_k=5):
    """
    Return the row positions of the ``top_k`` movies most similar to one movie.

    Similarity is the cosine similarity between the selected movie's embedding
    and every row of ``embeddings``. The selected movie itself is never part
    of the result.

    Parameters
    ----------
    movie_index : int
        Row position of the selected movie in ``embeddings``. Negative values
        count from the end, as with normal sequence indexing.
    embeddings : array-like, 2-D
        One embedding vector per row.
    top_k : int or None, default 5
        Maximum number of results. ``None`` returns every other movie.

    Returns
    -------
    list
        Row positions ordered from most to least similar; ties keep the lower
        position first. Contains fewer than ``top_k`` items when there are not
        enough other movies.

    Raises
    ------
    IndexError
        If ``movie_index`` is outside the rows of ``embeddings``.
    ValueError
        If ``top_k`` is negative.
    """
    embeddings = np.asarray(embeddings)
    n_movies = embeddings.shape[0]

    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if not -n_movies <= movie_index < n_movies:
        raise IndexError(
            f"movie_index {movie_index} is out of range for {n_movies} movies"
        )

    # Map a negative index to its non-negative position. argsort only yields
    # non-negative positions, so without this the selected movie would never
    # be filtered out and would be returned as its own best match.
    movie_index = movie_index % n_movies

    # Retrieve the embedding vector of the selected movie
    selected_embedding = embeddings[movie_index].reshape(1, -1)

    # Compute cosine similarities between the selected movie and all movies in the dataset
    similarities = cosine_similarity(selected_embedding, embeddings)[0]

    # Rank positions by descending similarity; the stable sort makes the order
    # of exact ties deterministic
    ranked = np.argsort(-similarities, kind="stable")

    # Exclude the selected movie itself with a vectorised mask
    ranked = ranked[ranked != movie_index]

    # Return the top_k similar movie indices
    return list(ranked[:top_k])

In [14]:
# Try the recommender on a single movie.
# Position 0 is the first movie in the DataFrame; change it to query another one.
selected_movie_index = 0
top_similar_indices = get_similar_movies(
    movie_index=selected_movie_index,
    embeddings=embeddings,
    top_k=5,
)

In [15]:
# Print the title of the selected movie and the similar movies found for it
selected_title = df.iloc[selected_movie_index]['title']
print(f"\nSelected movie: {selected_title}")
# Report the number of results actually returned instead of a fixed "5"
print(f"Top {len(top_similar_indices)} similar movies based on description:")

# cosine_similarity needs at least one row to compare against
if len(top_similar_indices) > 0:
    # Compute all display similarities in one call rather than once per result
    query_embedding = embeddings[selected_movie_index].reshape(1, -1)
    sim_values = cosine_similarity(query_embedding, embeddings[top_similar_indices])[0]

    # Titles are looked up by row position, matching how embeddings are addressed
    similar_titles = df.iloc[top_similar_indices]['title']
    for similar_title, sim_value in zip(similar_titles, sim_values):
        print(f"- {similar_title} (Similarity: {sim_value:.4f})")


Selected movie: Dilwale Dulhania Le Jayenge
Top 5 similar movies based on description:
- Kabhi Khushi Kabhie Gham (Similarity: 0.5648)
- Chennai Express (Similarity: 0.5008)
- Pather Panchali (Similarity: 0.4547)
- The Lion King II: Simba's Pride (Similarity: 0.4529)
- Kuch Kuch Hota Hai (Similarity: 0.4502)
