# MovieSphere Movie Recommender

## Data Cleaning

In [1]:
import pandas as pd

movies_metadata = pd.read_csv('../input/movies/movies_metadata.csv', low_memory=False)

columns_to_drop = [
    'adult', 
    'belongs_to_collection', 
    'budget', 
    'homepage', 
    'original_language', 
    'poster_path', 
    'production_countries', 
    'revenue', 
    'status', 
    'video'
]

movies_metadata_cleaned = movies_metadata.drop(columns=columns_to_drop)

print(movies_metadata_cleaned.shape)
movies_metadata_cleaned.head()

(45466, 14)


Unnamed: 0,genres,id,imdb_id,original_title,overview,popularity,production_companies,release_date,runtime,spoken_languages,tagline,title,vote_average,vote_count
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]",1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...",1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


### Removing invalid rows

In [2]:
df = movies_metadata_cleaned;
df['id_numeric'] = pd.to_numeric(df['id'], errors='coerce')

invalid_rows = df[df['id_numeric'].isna()]

print("Corrupted rows:")
print(invalid_rows[['id', 'title']])

df_cleaned = df[df['id_numeric'].notna()].copy()

df_cleaned.drop(columns=['id_numeric'], inplace=True)

df_cleaned.reset_index(drop=True, inplace=True)

df_cleaned.head()

Corrupted rows:
               id title
19730  1997-08-20   NaN
29503  2012-09-29   NaN
35587  2014-01-01   NaN


Unnamed: 0,genres,id,imdb_id,original_title,overview,popularity,production_companies,release_date,runtime,spoken_languages,tagline,title,vote_average,vote_count
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]",1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...",1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


### Parsing Columns

In [3]:
import ast

def extract_names(col):
    try:
        values = ast.literal_eval(col)
        if isinstance(values, list):
            return [val['name'] for val in values if 'name' in val]
    except:
        return []
    return []

In [4]:
#'genres' column
df_cleaned['genres'] = df_cleaned['genres'].apply(extract_names)

In [5]:
# 'production_companies' column
df_cleaned['production_companies'] = df_cleaned['production_companies'].apply(extract_names)

In [6]:
df_cleaned.head()

Unnamed: 0,genres,id,imdb_id,original_title,overview,popularity,production_companies,release_date,runtime,spoken_languages,tagline,title,vote_average,vote_count
0,"[Animation, Comedy, Family]",862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0
1,"[Adventure, Fantasy, Family]",8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"[Romance, Comedy]",15602,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,"[Comedy, Drama, Romance]",31357,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,[Comedy],11862,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


### Movie Credits Dataset

In [7]:
credits = pd.read_csv('/kaggle/input/movies/credits.csv')

In [8]:
credits['cast'] = credits['cast'].apply(ast.literal_eval)
credits['crew'] = credits['crew'].apply(ast.literal_eval)

In [9]:
# Extracting Top 3 cast members for better results
credits['top_cast'] = credits['cast'].apply(lambda x: [member['name'] for member in x[:3]])

In [10]:
# Extracting List of Directors
def get_directors(crew_list):
    return [member['name'] for member in crew_list if member.get('job') == 'Director']

credits['directors'] = credits['crew'].apply(get_directors)

In [11]:
credits_cleaned = credits[['id', 'top_cast', 'directors']].copy()

In [12]:
credits_cleaned.head()

Unnamed: 0,id,top_cast,directors
0,862,"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston]
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]


### Merging both Datasets

In [13]:
# Ensuring 'id' are of same type
df_cleaned['id'] = pd.to_numeric(df_cleaned['id'], errors='coerce')
credits_cleaned['id'] = pd.to_numeric(credits_cleaned['id'], errors='coerce')

In [14]:
# Dropping rows with null IDs
df_cleaned = df_cleaned.dropna(subset=['id'])
credits_cleaned = credits_cleaned.dropna(subset=['id'])

In [15]:
# Performing merge on 'id'
final_df = df_cleaned.merge(credits_cleaned, on='id')

In [16]:
final_df = final_df[['id', 'genres', 'overview', 'production_companies', 'title', 'top_cast', 'directors']].copy()

In [17]:
final_df.head()

Unnamed: 0,id,genres,overview,production_companies,title,top_cast,directors
0,862,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[Pixar Animation Studios],Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[TriStar Pictures, Teitler Film, Interscope Co...",Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston]
2,15602,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,"[Warner Bros., Lancaster Gate]",Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",[Twentieth Century Fox Film Corporation],Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,[Comedy],Just when George Banks has recovered from his ...,"[Sandollar Productions, Touchstone Pictures]",Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]


### Handling Null Values

In [18]:
final_df.isnull().sum()

id                        0
genres                    0
overview                954
production_companies      0
title                     3
top_cast                  0
directors                 0
dtype: int64

In [19]:
final_df.dropna(inplace=True)

### Handling Duplicate Values

In [20]:
final_df.duplicated(subset='id').sum()

106

In [21]:
# Dropping duplicates based on 'id'
final_df = final_df.drop_duplicates(subset='id', keep='first')

In [22]:
final_df.reset_index(drop=True, inplace=True)

In [23]:
final_df.head()

Unnamed: 0,id,genres,overview,production_companies,title,top_cast,directors
0,862,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[Pixar Animation Studios],Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[TriStar Pictures, Teitler Film, Interscope Co...",Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston]
2,15602,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,"[Warner Bros., Lancaster Gate]",Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",[Twentieth Century Fox Film Corporation],Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,[Comedy],Just when George Banks has recovered from his ...,"[Sandollar Productions, Touchstone Pictures]",Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]


### Saving the cleaned Dataset

In [24]:
# Save the cleaned DataFrame to a new CSV file
final_df.to_csv('movies_final.csv', index=False)

## ***Content-Based Recommender System*** 

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [26]:
# Load Dataset
df = pd.read_csv("/kaggle/input/movies/movies_final.csv")

In [27]:
df.head()

Unnamed: 0,id,genres,overview,production_companies,title,top_cast,directors
0,862,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",['Pixar Animation Studios'],Toy Story,"['Tom Hanks', 'Tim Allen', 'Don Rickles']",['John Lasseter']
1,8844,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"['TriStar Pictures', 'Teitler Film', 'Intersco...",Jumanji,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",['Joe Johnston']
2,15602,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,"['Warner Bros.', 'Lancaster Gate']",Grumpier Old Men,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret']",['Howard Deutch']
3,31357,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",['Twentieth Century Fox Film Corporation'],Waiting to Exhale,"['Whitney Houston', 'Angela Bassett', 'Loretta...",['Forest Whitaker']
4,11862,['Comedy'],Just when George Banks has recovered from his ...,"['Sandollar Productions', 'Touchstone Pictures']",Father of the Bride Part II,"['Steve Martin', 'Diane Keaton', 'Martin Short']",['Charles Shyer']


### Parsing Columns

In [28]:
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x) if pd.notnull(x) else []
    except:
        return []

df['genres'] = df['genres'].apply(safe_literal_eval)
df['production_companies'] = df['production_companies'].apply(safe_literal_eval)
df['top_cast'] = df['top_cast'].apply(safe_literal_eval)
df['directors'] = df['directors'].apply(safe_literal_eval)

## Feature Engineering

In [29]:
def combine_features(row):
    return (
        row['overview'] + ' ' +
        ' '.join(row['genres']) + ' ' +
        ' '.join(row['top_cast']) + ' ' +
        row['title'] + ' ' +
        ' '.join(row['directors']) + ' ' +
        ' '.join(row['production_companies'])
    )

df['combined_text'] = df.apply(combine_features, axis=1)

In [30]:
df.head()

Unnamed: 0,id,genres,overview,production_companies,title,top_cast,directors,combined_text
0,862,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[Pixar Animation Studios],Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],"Led by Woody, Andy's toys live happily in his ..."
1,8844,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[TriStar Pictures, Teitler Film, Interscope Co...",Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],When siblings Judy and Peter discover an encha...
2,15602,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,"[Warner Bros., Lancaster Gate]",Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],A family wedding reignites the ancient feud be...
3,31357,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",[Twentieth Century Fox Film Corporation],Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],"Cheated on, mistreated and stepped on, the wom..."
4,11862,[Comedy],Just when George Banks has recovered from his ...,"[Sandollar Productions, Touchstone Pictures]",Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],Just when George Banks has recovered from his ...


In [31]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

In [32]:
# Nearest Neighbors
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

## Recommendation Function

#### Fuzzy Searches

In [33]:
!pip install rapidfuzz



In [34]:
from rapidfuzz import process

def fuzzy_match_title(query, title_series, limit=5, threshold=70):
    matches = process.extract(query, title_series, limit=limit, score_cutoff=threshold)
    return matches

In [35]:
def recommend(title, n=5):
    title = title.lower().strip()
    
    matches = fuzzy_match_title(title, df['title'].str.lower())
    
    if not matches:
        return { "error": f"No similar title found for '{title}'" }

    best_title, score, idx = matches[0]
    
    vec = tfidf_matrix.getrow(idx)
    distances, indices = nn.kneighbors(vec, n_neighbors=n+1)

    recommended_indices = indices[0]
    return df.iloc[recommended_indices][['title']]

In [36]:
# Example
print(recommend("Captan Amrica", n=10))

                                                   title
8970                                     Captain America
30493          Iron Man & Captain America: Heroes United
17340                 Captain America: The First Avenger
22845                Captain America: The Winter Soldier
31787                                    Captain America
18602                                   Captain Midnight
20812                 Captain America II: Death Too Soon
35945  Captain Nissen Going Through Whirlpool Rapids,...
10790                                  Ultimate Avengers
22041                         Star Trek: Of Gods And Men
40363                                          Team Thor


### Saving All the Components

In [37]:
import joblib
import scipy.sparse

joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(nn, "nearest_neighbors_model.pkl")
joblib.dump(df, "movies_data.pkl")
scipy.sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)