# Content Based Movie Recommender

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import pickle
import math

nltk.download('wordnet')
nltk.download("stopwords")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tiongshankai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tiongshankai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tiongshankai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Read in dataset

In [2]:
metadata_path = r"./movie_lens_dataset/movies_metadata.csv"

In [3]:
df = pd.read_csv(metadata_path, low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Preprocessing

In [4]:
print("Number of movies:", len(df))

Number of movies: 45466


In [5]:
print("Number of null overviews:", df['overview'].isnull().sum())

Number of null overviews: 954


### Removing rows with null or empty overviews

In [6]:
df = df[df['overview'].notna()]
print("Number of non-null overviews", len(df))

Number of non-null overviews 44512


### Removing punctuations

In [7]:
df['overview'] = df['overview'].str.replace(r'[^\w\s]+', '')
df.head()

  df['overview'] = df['overview'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,Led by Woody Andys toys live happily in his ro...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,Cheated on mistreated and stepped on the women...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Tokenizing words
Don't run this if using sklearn lib

In [8]:
# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# df['overview'] = df['overview'].apply(lambda x: w_tokenizer.tokenize(x))
# df.head()

### Making all words lower case

In [9]:
# df['overview'] = df['overview'].apply(lambda x: [word.lower() for word in x])
df['overview'] = df['overview'].apply(lambda x: " ".join([word.lower() for word in x.split()]))
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led by woody andys toys live happily in his ro...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,when siblings judy and peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,a family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated on mistreated and stepped on the women...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,just when george banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Removing stopwords from overview

In [11]:
stop = stopwords.words ('english')
df['overview'] = df['overview'].apply(lambda x: " ".join([word for word in x.split() if word not in (stop)]))
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led woody andys toys live happily room andys b...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,siblings judy peter discover enchanted board g...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated mistreated stepped women holding breat...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,george banks recovered daughters wedding recei...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Lemmatization


In [12]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(w) for w in text.split()])

df['overview'] = df['overview'].apply(lemmatize_text)

df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led woody andys toy live happily room andys bi...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,sibling judy peter discover enchanted board ga...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated mistreated stepped woman holding breat...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,george bank recovered daughter wedding receive...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


Removing empty overviews

In [13]:
df = df[df['overview'] != ""]

Dropping duplicates

In [14]:
df = df.drop_duplicates("title")

In [15]:
df = df[df['overview'].notna()]
print("Number of non-null overviews", len(df))

Number of non-null overviews 41368


In [16]:
print("Number of null overviews:", df['overview'].isnull().sum())

Number of null overviews: 0


Resetting index

In [17]:
df = df.reset_index()

Saving the dataframe to a preprocessed csv

In [22]:
df.to_csv('./movie_lens_dataset/movies_metadata_processed.csv')
print("Processed csv saved!")

Processed csv saved!


# Generating TFIDF from dataset and serializing using pickle
Serializing allows us to save time for re-generation of the index.

Using sklearn library which runs way faster

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix = tfidf_matrix.astype(np.float32)

with open('./pickle/tfidf_matrix.pickle', 'wb') as handle:
    pickle.dump(tfidf_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Index saved!")

Index saved!


Loading matrix

In [23]:
with open('./pickle/tfidf_matrix.pickle', 'rb') as handle:
    tfidf_matrix = pickle.load(handle)
    print("Index loaded!")

Index loaded!


In [24]:
tfidf_matrix.shape

(41368, 80925)

# Creating cosine similarity model

The cosine similarity model is a (41368, 41368) matrix consisting of cosine similarity scores of all movies against each other.

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim.shape

(41368, 41368)

In [26]:
with open('./pickle/cosine_sim_.pickle', 'wb') as handle:
    pickle.dump(cosine_sim, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Cosine Similarity model saved!")

Cosine Similarity model saved!


# Getting recomendations

In [3]:
cosine_sim_file = open('./pickle/cosine_sim_.pickle', 'rb')
cos_sim = pickle.load(cosine_sim_file)
print("Consine Similiarity Model loaded")

Consine Similiarity Model loaded


In [28]:
test_movie = "The Matrix"

Create movie title to index series and save it to pickle

In [4]:
indices = pd.Series(df.index, index=df['title'])
with open('./pickle/movie_indices.pickle', 'wb') as handle:
    pickle.dump(indices, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Indices saved!")

NameError: name 'df' is not defined

Load indices

In [5]:
with open('./pickle/movie_indices.pickle', 'rb') as handle:
    indices = pickle.load(handle)
    print("Indices loaded!")

Indices loaded!


Load processed df

In [10]:
df = pd.read_csv('./movie_lens_dataset/movies_metadata_processed.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [13]:
import pickle
import pandas as pd
def getTfIdfRecommendations(name: str, cosine_sim, movie_indices, df):
    print("Getting recommendations for:", name)

    # Get index of query movie
    movie_index = movie_indices[name]
    
    similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=['score'])
    
    # Top 10
    top_indices = similarity_scores.sort_values(by="score", ascending=False)[1:11].index
    
    # Get movie title, release date and poster path
    output = df[['title', 'release_date', 'poster_path']].iloc[top_indices]

    output['score'] = similarity_scores.sort_values(by="score", ascending=False)['score'][1:11]

    output.to_csv('./output/tfidf/{}.csv'.format(name))
    
    return output

# Evaluation Set
For the evaluation we will use 20 random movies and calculate the MAP score of the results returned.

In [14]:
getTfIdfRecommendations('The Dark Knight Rises', cos_sim, indices, df)

Getting recommendations for: The Dark Knight Rises


Unnamed: 0,title,release_date,poster_path,score
12039,The Dark Knight,2008-07-16,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,0.317351
1311,Batman Returns,1992-06-19,/jX5THE1yW3zTdeD9dupcIyQvKiG.jpg,0.257601
149,Batman Forever,1995-06-16,/eTMrHEhlFPHNxpqGwpGGTdAa0xV.jpg,0.250565
583,Batman,1989-06-23,/kBf3g9crrADGMc2AMAMlLBgSm2h.jpg,0.228145
14855,Batman: Under the Red Hood,2010-07-27,/78kjgspmLLOm2Glgpzqo9cS4GpI.jpg,0.220578
20038,Batman Unmasked: The Psychology of the Dark Kn...,2008-07-15,/jjHu128XLARc2k4cJrblAvZe0HE.jpg,0.208895
37445,LEGO DC Comics Super Heroes: Batman: Be-Leaguered,2014-10-27,/qD3xyKbCsaaYYDhksWFxxsE1DWq.jpg,0.197668
19168,"Batman: The Dark Knight Returns, Part 2",2013-01-18,/wPeorCnD9MRR2S9Dzh4OpIgNLiv.jpg,0.185338
8964,Batman Beyond: Return of the Joker,2000-12-12,/vIRHN4AXaQM6mMVj8CozhtelYmF.jpg,0.182152
3042,Batman: Mask of the Phantasm,1993-12-25,/l4jaQjkgznu2Rz05X18f24UjPNW.jpg,0.178758


In [22]:
test_movies = ["Harry Potter and the Philosopher's Stone",
"The Matrix",
"The Dark Knight",
"Toy Story",
"The Avengers",
"The Bourne Identity",
"The Devil Wears Prada",
"Mean Girls",
"Sex and the City",
"Mission: Impossible - Ghost Protocol"]

In [25]:
for movie in test_movies:
    print(getTfIdfRecommendations(movie, cos_sim, indices, df))
    print("\n")

Getting recommendations for: Harry Potter and the Philosopher's Stone
                                              title release_date  \
3891                       Harry, He's Here To Help   2000-08-15   
22118                                           Luv   1967-07-26   
10223           Harry Potter and the Goblet of Fire   2005-11-05   
7543       Harry Potter and the Prisoner of Azkaban   2004-05-31   
5576        Harry Potter and the Chamber of Secrets   2002-11-13   
17696                          A Very Potter Sequel   2010-07-22   
11513     Harry Potter and the Order of the Phoenix   2007-06-28   
35781                             Bullet to Beijing   1995-12-20   
4303                                  The Dead Pool   1988-07-12   
16623  Harry Potter and the Deathly Hallows: Part 2   2011-07-07   

                            poster_path     score  
3891   /pU5fsyBZUIFq4cqUB2X0vQD7HZQ.jpg  0.194070  
22118  /tN3Puo62ozXUvmXKrxWYMusY45V.jpg  0.192886  
10223  /6sASqcdrEHXxUhA3n