In [1]:
# %% [markdown]
# # Project: Movie Plot Recommendation Engine
# 
# **Goal:** To recommend 5 similar movies based on plot overview, using TF-IDF and Cosine Similarity.

# %% [markdown]
# ---
# ### Step 1: Import Libraries & Load Data
# ---

# %%
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# %%
# Load the dataset.
# If the CSV is missing, raise an error that carries the download hint instead
# of calling exit(): exit() is not a reliable way to stop a notebook (under
# IPython/Jupyter it asks the kernel to shut down rather than ending the cell
# with a readable error), whereas an exception halts "Run All" right here.
try:
    df = pd.read_csv('tmdb_5000_movies.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "'tmdb_5000_movies.csv' not found. "
        "Please download it from: https://www.kaggle.com/datasets/tmdb/tmdb-5000-movie-dataset"
    ) from err

print(f"Loaded {len(df)} movies.")
df.head(2)

# %%
# We only care about the title and the plot summary (overview).
# .copy() gives an independent frame, so later column edits never touch `df`.
df_small = df[['id', 'title', 'overview']].copy()

# %% [markdown]
# ---
# ### Step 2: Data Cleaning & TF-IDF Vectorization
# ---

# %%
# Report how many plot summaries are missing before cleaning.
print(f"Missing overviews before: {df_small['overview'].isnull().sum()}")

# Replace missing overviews with an empty string so every row handed to the
# vectorizer is text. Only the 'overview' column is filled.
df_small = df_small.fillna({'overview': ''})

# Report again after cleaning.
print(f"Missing overviews after: {df_small['overview'].isnull().sum()}")

# %%
# Build the TF-IDF vectorizer; stop_words='english' drops very common English
# words such as 'the', 'is', 'in'.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and turn each one into a TF-IDF row.
tfidf_matrix = tfidf_vectorizer.fit_transform(df_small['overview'])

# Shape is (number_of_movies, number_of_unique_words).
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

# %% [markdown]
# ---
# ### Step 3: Calculate Cosine Similarity
# ---
# This is the core of our engine.
# We will compare the TF-IDF matrix with itself.

# %%
# Score every overview against every other overview. Called with a single
# matrix, cosine_similarity compares that matrix with itself.
print("Calculating Cosine Similarity Matrix...")
cosine_sim = cosine_similarity(tfidf_matrix)

print(f"Similarity Matrix Shape: {cosine_sim.shape}")

# %%
# Peek at the top-left 5x5 corner.
# The matrix is square and symmetric; the diagonal (a movie against itself) is
# 1, except presumably for rows whose TF-IDF vector is all zeros (e.g. an empty
# overview), which would score 0 - verify if that case matters.
print(cosine_sim[:5, :5])

# %% [markdown]
# ---
# ### Step 4: Build the Recommendation Function
# ---
# We need a function that:
# 1. Takes a movie title.
# 2. Finds its index.
# 3. Gets its similarity scores against all other movies.
# 4. Sorts them and returns the top 5.

# %%
# Map each movie title to its row number in df_small (0, 1, 2, ...):
# the Series index is the title and the value is the DataFrame index.
indices = pd.Series(df_small.index, index=df_small['title'])

# Titles are not guaranteed to be unique, so keep only the first row per title.
# Series.drop_duplicates() cannot do this: it de-duplicates on the *values*
# (the row numbers, which are all distinct) and leaves repeated titles in the
# index, so indices[title] would return a Series instead of a single number.
indices = indices[~indices.index.duplicated(keep='first')]

print("Title-to-Index map created.")
print(indices.head())

# %%
def get_recommendations(title, cosine_sim=cosine_sim, data=df_small, indices=indices, top_n=5):
    """
    Find the movies whose plot overviews are most similar to that of `title`.

    Args:
        title: Exact movie title to look up in `indices`.
        cosine_sim: Square similarity matrix; row i holds the scores of movie i
            against every movie.
        data: DataFrame with a 'title' column, in the same row order as
            `cosine_sim`.
        indices: Mapping from movie title to row position.
        top_n: How many recommendations to return (default 5). Values below
            zero are treated as zero.

    Returns:
        A Series of up to `top_n` titles ordered from most to least similar,
        or an error message string if `title` is not found in `indices`.
    """
    try:
        # 1. Get the row position of the movie that matches the title
        idx = indices[title]
    except KeyError:
        return f"Error: Movie '{title}' not found in the dataset."

    # If `indices` contains a repeated title, the lookup returns several
    # positions as a Series; use the first one so `idx` is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    top_n = max(int(top_n), 0)

    # 2. Pair every *other* movie with its similarity score.
    # The queried movie is excluded by position rather than by assuming it
    # sorts first: its self-score can be 0 (all-zero TF-IDF row) or tied with
    # another movie, in which case "skip the first entry" drops the wrong row.
    sim_scores = [
        (other, score)
        for other, score in enumerate(cosine_sim[idx])
        if other != idx
    ]

    # 3. Highest score first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # 4. Keep the row positions of the top_n best matches
    movie_indices = [other for other, _ in sim_scores[:top_n]]

    # 5. Return the matching titles
    return data['title'].iloc[movie_indices]

# %% [markdown]
# ---
# ### Step 5: Test the Recommender!
# ---

# %%
# Spot-check the recommender on a few titles from different genres.
# One loop replaces four copy-pasted cells, so trying another movie is a
# one-line change to this list.
sample_titles = [
    'The Dark Knight Rises',  # a big, popular one
    'Inception',
    'The Matrix',             # a sci-fi classic
    'Toy Story',              # an animation
]

for position, movie_title in enumerate(sample_titles):
    if position > 0:
        print()  # blank line between consecutive result blocks
    print(f"--- Recommendations for '{movie_title}' ---")
    # print() is used because the result is either a Series or an error string.
    print(get_recommendations(movie_title))

Loaded 4803 movies.
Missing overviews before: 3
Missing overviews after: 0
TF-IDF Matrix Shape: (4803, 20978)
Calculating Cosine Similarity Matrix...
Similarity Matrix Shape: (4803, 4803)
[[1.         0.         0.         0.02499512 0.        ]
 [0.         1.         0.         0.         0.03336868]
 [0.         0.         1.         0.         0.        ]
 [0.02499512 0.         0.         1.         0.0104334 ]
 [0.         0.03336868 0.         0.0104334  1.        ]]
Title-to-Index map created.
title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64
--- Recommendations for 'The Dark Knight Rises' ---
65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
