In [1]:
from pydantic import BaseModel
import pandas as pd
import torch
import json
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from typing import List

In [2]:
# Load the data and precompute the necessary values
df = pd.read_csv("movies_metadata (4).csv")
reviews = pd.read_csv("top_1000_movie_reviews.csv")
df = df.merge(reviews, on="imdb_id")
df_mystery = df[df["genres"].str.contains("Thriller", na=False)]
top_10_mystery_movies = df_mystery.sample(10, random_state=42)
user_history = top_10_mystery_movies[
    ["title", "overview", "imdb_id", "genres"]
].dropna()
model = SentenceTransformer("all-mpnet-base-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = model.to(device)
embedding_size = model.get_sentence_embedding_dimension()
zero_tensor = torch.zeros((embedding_size,), device=device)


# Define utility functions
def calculate_embeddings(texts):
    if not texts:
        return zero_tensor
    embeddings = model.encode(
        texts, convert_to_tensor=True, device=device, show_progress_bar=False
    )
    return embeddings


def calculate_mood_score(
    plot,
    reviews,
    genres_json,
    popularity,
    plot_weight=0.1,
    review_weight=0.7,
    genre_weight=0.2,
):
    plot_embeddings = calculate_embeddings([plot]) if plot else zero_tensor
    review_embeddings = calculate_embeddings(reviews) if reviews else zero_tensor
    avg_review_embeddings = (
        review_embeddings.mean(dim=0) if len(review_embeddings) > 0 else zero_tensor
    )
    genres = json.loads(genres_json.replace("'", '"'))
    genre_names = " ".join([genre["name"] for genre in genres])
    genre_embeddings = (
        calculate_embeddings([genre_names]) if genre_names else zero_tensor
    )
    mood_score = (
        plot_weight * plot_embeddings
        + review_weight * avg_review_embeddings
        + genre_weight * genre_embeddings
    ) / (plot_weight + review_weight + genre_weight)
    return mood_score.cpu()


def recommend_movies(user_history, sample_movies):
    user_history_titles = set(user_history["title"].tolist())
    recommended_movies = sample_movies[
        ~sample_movies["title"].isin(user_history_titles)
    ]
    top_10_movies = recommended_movies.sort_values(
        by="similarity", ascending=False
    ).head(10)
    return top_10_movies


def search(sample_movies):
    top_10_movies = sample_movies.sort_values(
        by="search_similarity", ascending=False
    ).head(10)
    return top_10_movies


# Define data preprocessing and calculations
sample_movies = df.dropna(subset=["overview", "genres", "popularity"])
scaler = StandardScaler()
popularity = sample_movies["popularity"].values.reshape(-1, 1)
sample_movies["normalized_popularity"] = scaler.fit_transform(popularity)

movie_scores = []
for _, movie in sample_movies.iterrows():
    movie_scores.append(
        calculate_mood_score(
            movie["overview"],
            movie["reviews"],
            movie["genres"],
            movie["normalized_popularity"],
        )
    )

movie_scores_tensor = torch.stack(movie_scores)

user_history_embeddings = calculate_embeddings(user_history["overview"].tolist())
user_history_mood_score = user_history_embeddings.mean(dim=0).cpu()
movie_scores_tensor = movie_scores_tensor.squeeze()
sample_movies["similarity"] = [
    cosine_similarity(user_history_mood_score.unsqueeze(0), movie_score.unsqueeze(0))
    for movie_score in movie_scores_tensor
]

print("Data preprocessing and calculations complete!")

  df = pd.read_csv("movies_metadata (4).csv")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using device: cuda
Data preprocessing and calculations complete!


In [4]:
import pickle

# Save necessary objects
with open("movie_recommender_data.pkl", "wb") as f:
    pickle.dump({
        "sample_movies": sample_movies,
        "movie_scores_tensor": movie_scores_tensor,
        "user_history_mood_score": user_history_mood_score
    }, f)


In [3]:
recommended_movies = recommend_movies(user_history, sample_movies)
display(recommended_movies[['title', 'overview', 'genres', 'similarity']])

Unnamed: 0,title,overview,genres,similarity
858,Jason Bourne,The most dangerous former operative of the CIA...,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",[[0.7394514]]
287,Spy Game,Veteran spy Nathan Muir is on the verge of ret...,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",[[0.73233455]]
776,Child 44,"Set in Stalin-era Soviet Union, a disgraced MG...","[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",[[0.729125]]
870,Mechanic: Resurrection,Arthur Bishop thought he had put his murderous...,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",[[0.72847706]]
475,Taken,"While vacationing with a friend in Paris, an A...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",[[0.7280477]]
706,No Good Deed,"Terri is a devoted wife and mother of two, liv...","[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",[[0.7249564]]
329,Identity,Complete strangers stranded at a remote desert...,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",[[0.72094107]]
522,Derailed,NATO operative Jacques Kristoff (Jean-Claude V...,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",[[0.7202923]]
214,Three Days of the Condor,A bookish CIA researcher finds all his co-work...,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",[[0.7200519]]
75,Die Hard,"NYPD cop, John McClane's plan to reconcile wit...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",[[0.71869314]]
