In [None]:
# import required libraries
import pandas as pd 
import numpy as np

In [None]:
# Load datasets
movies = pd.read_csv("tmdb_5000_movies.csv") 
credits = pd.read_csv("tmdb_5000_credits.csv") 

In [183]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [184]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Merge datasets on title
movies = movies.merge(credits, on="title")

In [None]:
# Select relevant features for recommendation
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [187]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Check for missing values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [None]:
# Remove row swith missing overview
movies.dropna(inplace = True)

In [None]:
# check for duplicate values
movies.duplicated().sum()

0

In [None]:
# Inspect JSON format of genres column
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [None]:
# Parse JSON strings and extract 'name' field
import ast 
def convert(obj):
    genres_list = []
    for i in ast.literal_eval(obj):
        genres_list.append(i['name'])

    return genres_list

In [193]:
movies['genres'] = movies['genres'].apply(convert)   

In [194]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [195]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
# Extract top 3 cast members to reduce dimensionality
import ast 
def convert3(obj):
    genres_list = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            genres_list.append(i['name'])
            counter += 1
        else:
            break

    return genres_list

In [197]:
movies['cast'] = movies['cast'].apply(convert3)

In [None]:
# Extract director from crew
import ast 
def fetch_director(obj):
    directors = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            directors.append(i['name'])
            break

    return directors

In [199]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
# Tokenize overview into words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [None]:
# Remove spaces from multi-word terms (e.g., "Science Fiction" -> "ScienceFict
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [None]:
# Combine all features into single 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# Create final dataset with essential columns
new_df = movies[['movie_id','title','tags']]

In [None]:
# Convert tags list to string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [205]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [None]:
# Initialize Porter Stemmer for word normalization
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [None]:
# Apply stemming to reduce words to root form
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [208]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [209]:
new_df['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [None]:
# Convert to lowercase for consistency
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [None]:
# Create Bag-of-Words vectors (5000 features, remove stop words)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Transform text tags into numerical vectors
vectors = cv.fit_transform(new_df['tags']).toarray()

In [213]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [253]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [257]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compute pairwise cosine similarity matrix
similarity = cosine_similarity(vectors)

In [None]:
# Get top 5 similar movies to first movie
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6]

[(1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [None]:
# Recommendation function: returns top 5 similar movies
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [325]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


In [319]:
new_df.iloc[1214].title

'Aliens vs Predator: Requiem'

In [327]:
import pickle

In [None]:
# Save processed movie data
pickle.dump(new_df, open("movies.pkl", "wb"))

In [333]:
new_df['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [None]:
# Save as dictionary format
pickle.dump(new_df.to_dict(), open("movies_dict.pkl", "wb"))

In [None]:
# Save similarity matrix for fast inference
pickle.dump(similarity, open("similarity.pkl", "wb"))

In [339]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.
