# 1 Setup

In [1]:
# Install Required Libraries
!pip install fuzzywuzzy[speedup]
!pip install -q sentence-transformers

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Imports
import json
from pathlib import Path

import numpy as np
import pandas as pd
from fuzzywuzzy import process
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 2 Data Preparation

In [3]:
# Load Dataset
# Read the movie metadata CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/tmdb_5000_movies.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
# List the columns available in the loaded dataset.
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
# Retrieving the first row
# Used below as a sample record to inspect how individual fields are stored.
x = df.iloc[0]
x

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

In [6]:
# Raw 'genres' value of the sample row: a JSON-encoded string holding a list of
# {"id", "name"} objects, so it must be parsed with json.loads before use.
x['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [7]:
# Raw 'keywords' value of the sample row: same JSON-encoded list-of-objects layout as 'genres'.
x['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [8]:
# Preprocess Genres + Keywords into string
def combine_genres_keywords(row):
    """Flatten a row's JSON-encoded genres and keywords into one text string.

    Each genre/keyword name has its internal whitespace removed so that a
    multi-word name (e.g. "Science Fiction") becomes a single token
    ("ScienceFiction"). Tokens are separated by single spaces, genres first.

    Args:
        row: Mapping-like record with 'genres' and 'keywords' entries, each a
            JSON string encoding a list of objects that have a 'name' key.

    Returns:
        str: "<genre tokens> <keyword tokens>".
    """
    def squash_names(raw_json):
        # One whitespace-free token per entry, in the original order.
        tokens = []
        for entry in json.loads(raw_json):
            tokens.append(''.join(entry['name'].split()))
        return ' '.join(tokens)

    genre_part = squash_names(row['genres'])
    keyword_part = squash_names(row['keywords'])
    return genre_part + ' ' + keyword_part

# Build the TF-IDF input column by applying combine_genres_keywords to every row.
df['tfidf_text'] = df.apply(combine_genres_keywords, axis=1)

In [9]:
# Combine genres + keywords + overview for semantic embedding
def build_embedding_text(row):
    """Build the text that is fed to the sentence-embedding model for one movie.

    Genre and keyword names are whitespace-squashed into single tokens (as in
    the TF-IDF text) and followed by the free-text overview.

    Args:
        row: Record with 'genres' and 'keywords' entries (JSON strings encoding
            lists of objects with a 'name' key) and an optional 'overview'.

    Returns:
        str: "<genre tokens> <keyword tokens> <overview>". A missing overview
        contributes an empty string.
    """
    def squash_names(raw_json):
        # One whitespace-free token per entry, joined by single spaces.
        return ' '.join(''.join(item['name'].split()) for item in json.loads(raw_json))

    genres = squash_names(row['genres'])
    keywords = squash_names(row['keywords'])

    overview = row.get('overview', '')
    # A blank CSV cell is loaded as NaN, and the .get default does not apply because
    # the key exists; without this check the literal text "nan" would be embedded.
    if pd.isna(overview):
        overview = ''

    return f"{genres} {keywords} {overview}"

# Build the sentence-embedding input column by applying build_embedding_text to every row.
df['embedding_text'] = df.apply(build_embedding_text, axis=1)

# 3 Text Vectorization

In [10]:
# TF-IDF + SVD
# Fit TF-IDF on the genre/keyword tokens, capping the vocabulary at 2000 terms.
# The fitted vectorizer is kept because hybrid_search reuses it to transform queries.
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['tfidf_text'])

In [11]:
# Reduce the TF-IDF matrix to 50 dense components.
# random_state is fixed because TruncatedSVD's default solver is randomized; without a
# seed the components (and therefore the similarity scores) differ between runs.
svd = TruncatedSVD(n_components=50, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

In [12]:
# Sentence Embeddings
# Load the pretrained 'all-MiniLM-L6-v2' sentence-transformer and encode every movie's
# embedding_text. This is the slowest cell in the notebook, hence the progress bar.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_matrix = embedding_model.encode(df['embedding_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

In [13]:
# Save embedding matrix to file
embedding_path = Path("../artifacts/embedding_matrix.npy")
# np.save does not create missing directories; make sure the target folder exists so
# the cell also works on a fresh checkout.
embedding_path.parent.mkdir(parents=True, exist_ok=True)
np.save(embedding_path, embedding_matrix)

# 4 Matching & Indexing

In [14]:
# Movie Title → Index Mapping
title_to_index = pd.Series(df.index, index=df['title'])
# Titles are not guaranteed to be unique. With a duplicated title, title_to_index[title]
# returns a Series instead of a single index, which breaks the vector lookup in the
# recommenders. Keep only the first row for each title.
title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]

In [15]:
# Title Matching Utility
def match_title(input_title, titles, threshold=80):
    """Fuzzy-match a user-supplied title against the known titles.

    Args:
        input_title: Text typed by the user.
        titles: Collection of candidate titles passed to fuzzywuzzy.
        threshold: Minimum fuzzy score for a match to be accepted.

    Returns:
        The best-matching title if its score is at least `threshold`,
        otherwise None.
    """
    best = process.extractOne(input_title, titles)
    # extractOne yields None when there are no candidates to score.
    if best is None:
        return None

    # Index instead of unpacking: the result is (match, score) for list-like choices
    # but (match, score, key) for dict-like ones.
    match, score = best[0], best[1]
    return match if score >= threshold else None

# 5 Recommendation Engines

In [16]:
# TF-IDF Recommender
def recommend_tfidf(input_title, top_n=10):
    """Recommend movies similar to a given title using the SVD-reduced TF-IDF vectors.

    Args:
        input_title: Title text; it is fuzzy-matched against the known titles.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of (title, cosine_similarity) tuples ordered from most to least
        similar, excluding the matched movie itself. If no title matches, a
        "not found" message string is returned instead.
    """
    matched_title = match_title(input_title, title_to_index.index)
    if not matched_title:
        return f"Movie '{input_title}' not found."

    # A title that occurs more than once makes the lookup return several indices;
    # use the first so the query is always a single vector.
    idx = int(np.atleast_1d(title_to_index[matched_title])[0])
    query_vector = tfidf_reduced[idx].reshape(1, -1)
    similarity_scores = cosine_similarity(query_vector, tfidf_reduced).flatten()

    # Drop the query movie by index rather than by rank: with tied scores or an
    # all-zero vector it is not guaranteed to be sorted first.
    ranked = similarity_scores.argsort()[::-1]
    top_indices = ranked[ranked != idx][:top_n]

    return [(df.iloc[i]['title'], similarity_scores[i]) for i in top_indices]

In [17]:
# Embedding Recommender
def recommend_embedding(input_title, top_n=10):
    """Recommend movies similar to a given title using the sentence embeddings.

    Args:
        input_title: Title text; it is fuzzy-matched against the known titles.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of (title, cosine_similarity) tuples ordered from most to least
        similar, excluding the matched movie itself. If no title matches, a
        "not found" message string is returned instead.
    """
    matched_title = match_title(input_title, title_to_index.index)
    if not matched_title:
        return f"Movie '{input_title}' not found."

    # A title that occurs more than once makes the lookup return several indices;
    # use the first so the query is always a single vector.
    idx = int(np.atleast_1d(title_to_index[matched_title])[0])
    query_vector = embedding_matrix[idx].reshape(1, -1)
    similarity_scores = cosine_similarity(query_vector, embedding_matrix).flatten()

    # Drop the query movie by index rather than by rank: with tied scores it is
    # not guaranteed to be sorted first.
    ranked = similarity_scores.argsort()[::-1]
    top_indices = ranked[ranked != idx][:top_n]

    return [(df.iloc[i]['title'], similarity_scores[i]) for i in top_indices]

In [18]:
# Semantic Search from Free Text
def semantic_search(query, top_n=10):
    """Rank movies against a free-text query using sentence embeddings.

    Args:
        query: Free-text description of what the user wants to watch.
        top_n: Number of results to return.

    Returns:
        A list of (title, cosine_similarity) tuples, best match first.
    """
    encoded_query = embedding_model.encode([query])
    scores = cosine_similarity(encoded_query, embedding_matrix).flatten()

    # argsort is ascending, so reverse it before taking the leading top_n positions.
    best_first = scores.argsort()[::-1]

    results = []
    for position in best_first[:top_n]:
        results.append((df.iloc[position]['title'], scores[position]))
    return results

In [19]:
# Hybrid Search (TF-IDF + Embedding)
def hybrid_search(query, alpha=0.8, top_n=10):
    """Rank movies against a free-text query by blending embedding and TF-IDF similarity.

    Args:
        query: Free-text description of what the user wants to watch.
        alpha: Weight of the embedding similarity; the TF-IDF similarity gets
            the remaining (1 - alpha).
        top_n: Number of results to return.

    Returns:
        A list of (title, blended_score) tuples, best match first.
    """
    # Similarity of the query to every movie in the dense embedding space.
    dense_scores = cosine_similarity(
        embedding_model.encode([query]), embedding_matrix
    ).flatten()
    # Similarity in the full (unreduced) TF-IDF space, using the fitted vectorizer.
    sparse_scores = cosine_similarity(
        tfidf_vectorizer.transform([query]), tfidf_matrix
    ).flatten()

    blended = alpha * dense_scores + (1 - alpha) * sparse_scores
    best_first = blended.argsort()[::-1][:top_n]

    ranked = []
    for position in best_first:
        ranked.append((df.iloc[position]['title'], blended[position]))
    return ranked

# 6 Testing & Evaluation

In [20]:
# Test & Compare All Methods
test_input = "I want to watch Marvel movies"
top_n = 10


def _print_results(results):
    """Print (title, score) rows, or the message a recommender returned on no match."""
    # recommend_tfidf / recommend_embedding return a plain string when the input cannot
    # be matched to a title; iterating that string would fail on tuple unpacking.
    if isinstance(results, str):
        print(results)
        return
    for title, score in results:
        print(f"{title:50} | Score: {score:.4f}")


print(f"\n🎬 Input Query: '{test_input}'\n")

# TF-IDF Results
# NOTE: the two title-based recommenders fuzzy-match the whole query to a movie title,
# so for a free-text query their results reflect whichever title happened to match.
print("📘 TF-IDF Recommendations:")
_print_results(recommend_tfidf(test_input, top_n))

# Embedding Results
print("\n🤖 Embedding Recommendations:")
_print_results(recommend_embedding(test_input, top_n))

# Hybrid Results
print("\n🔀 Hybrid Recommendations (alpha=0.8):")
_print_results(hybrid_search(test_input, alpha=0.8, top_n=top_n))


🎬 Input Query: 'I want to watch Marvel movies'

📘 TF-IDF Recommendations:
World War Z                                        | Score: 0.9511
The Book of Eli                                    | Score: 0.9312
Resident Evil: Extinction                          | Score: 0.9216
Battle for the Planet of the Apes                  | Score: 0.9014
Priest                                             | Score: 0.8917
Mad Max                                            | Score: 0.8898
Terminator 2: Judgment Day                         | Score: 0.8829
Mad Max: Fury Road                                 | Score: 0.8785
They Live                                          | Score: 0.8749
eXistenZ                                           | Score: 0.8749

🤖 Embedding Recommendations:
28 Weeks Later                                     | Score: 0.5742
28 Days Later                                      | Score: 0.5582
Contagion                                          | Score: 0.5519
World War Z             