In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import csv
import torch
import time

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Instantiate the pre-trained SBERT checkpoint, then place it on the chosen device
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model = model.to(device)

# Movie dataset: one dict per CSV row with its title, overview (plot) and release date
movies = []
with open('combined_movies_dataset.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        title = row['title']
        # csv.DictReader fills fields that are missing from a short row with None;
        # keep the plot a str so the encoder never receives None
        overview = row['overview'] or ''
        # .get() covers a missing column; the explicit None check covers a short row,
        # where calling .strip() on None would raise AttributeError
        release_date = row.get('release_date')
        if release_date is None or release_date == 'None' or not release_date.strip():
            release_date = 'N/A'
        movies.append({"title": title, "overview": overview, "release_date": release_date})

# Extract the movie plots, in the same order as `movies`, so that an embedding's
# position can be used to index back into `movies`
movie_plots = [movie['overview'] for movie in movies]

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Encode the plots using SBERT and time the run.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()
plot_embeddings = model.encode(movie_plots, convert_to_tensor=True).to(device)
if plot_embeddings.is_cuda:
    # CUDA work is queued asynchronously; wait for it to finish before reading the clock
    torch.cuda.synchronize()
encoding_time = time.perf_counter() - start_time

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [3]:
# Report how long encoding the full set of plots took
print(
    f"\nTime taken to encode plots: {encoding_time:.4f} seconds"
)


Time taken to encode plots: 303.0012 seconds


In [13]:
# User's input movie plot for which we want a recommendation
user_plot = "indian guy stuck in the sea with animals"

# Encode the user's input plot and time it.
# perf_counter() is used because the interval is only a few milliseconds and
# time.time() is neither monotonic nor guaranteed to have sufficient resolution.
start_time = time.perf_counter()
user_embedding = model.encode(user_plot, convert_to_tensor=True).to(device)
if user_embedding.is_cuda:
    # CUDA work is queued asynchronously; wait for it to finish before reading the clock
    torch.cuda.synchronize()
user_encoding_time = time.perf_counter() - start_time

In [14]:
# Report how long encoding the single query plot took
print(
    f"Time taken to encode user plot: {user_encoding_time:.4f} seconds"
)

Time taken to encode user plot: 0.0146 seconds


In [15]:
# Compute cosine similarity between the user's input plot and all movie plots.
# The result is indexed with [0] to take the single row belonging to the one query.
start_time = time.perf_counter()
cosine_scores = util.pytorch_cos_sim(user_embedding, plot_embeddings)[0]
if cosine_scores.is_cuda:
    # CUDA kernels are launched asynchronously; without this the timer would only
    # capture the launch overhead rather than the actual computation
    torch.cuda.synchronize()
similarity_time = time.perf_counter() - start_time

# Convert the cosine scores to a numpy array (moved to host memory first)
cosine_scores_np = cosine_scores.cpu().numpy()

In [16]:
# Get the top-k most similar movies
top_k = 10
# argsort is ascending: take the last top_k indices and reverse them for best-first order
top_results = np.argsort(cosine_scores_np)[-top_k:][::-1]

# Display the top-k recommended movies
print("\nTop movie recommendations based on the plot:")
for idx in top_results:
    title = movies[idx]['title']
    release_date = movies[idx].get('release_date', 'N/A')  # Use 'N/A' if release_date is missing
    similarity_score = cosine_scores_np[idx]
    print(f"Title: {title}, Release Date: {release_date}, Similarity Score: {similarity_score:.4f}")

# Report the duration measured when the similarity scores were computed
print(f"Time taken to calculate cosine similarity: {similarity_time:.4f} seconds")


Top movie recommendations based on the plot:
Title: All Is Lost, Release Date: 2013-08-23, Similarity Score: 0.6443
Title: A Moment Passing, Release Date: 1997-07-25, Similarity Score: 0.5920
Title: Life of Pi, Release Date: 2012-11-20, Similarity Score: 0.5757
Title: Fish, Release Date: 2017-03-30, Similarity Score: 0.5722
Title: Bird of Paradise, Release Date: N/A, Similarity Score: 0.5667
Title: Wet City, Release Date: 2019-03-11, Similarity Score: 0.5601
Title: Norsk Folksang, Release Date: 2011-04-25, Similarity Score: 0.5592
Title: Kodomo Challenge Petit 2021-07, Release Date: 2021-07-01, Similarity Score: 0.5580
Title: Gulp, Release Date: 2001-01-30, Similarity Score: 0.5571
Title: Furankenshutain no kaijû: Sanda tai Gaira, Release Date: N/A, Similarity Score: 0.5568
Time taken to calculate cosine similarity: 0.0048 seconds
