# Content Based Movie Recommender

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import pickle
import math

nltk.download('wordnet')
nltk.download("stopwords")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jstjo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jstjo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jstjo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Read in dataset

In [3]:
metadata_path = r"./movie_lens_dataset/movies_metadata.csv"

In [4]:
df = pd.read_csv(metadata_path, low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Preprocessing

In [5]:
print("Number of movies:", len(df))

Number of movies: 45466


In [6]:
print("Number of null overviews:", df['overview'].isnull().sum())

Number of null overviews: 954


### Removing rows with null or empty overviews

In [7]:
df = df[df['overview'].notna()]
df = df[df['overview'] != ""]
print("Number of non-null overviews", len(df))

Number of non-null overviews 44512


### Removing punctuations

In [8]:
df['overview'] = df['overview'].str.replace(r'[^\w\s]+', '')
df.head()

  df['overview'] = df['overview'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,Led by Woody Andys toys live happily in his ro...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,Cheated on mistreated and stepped on the women...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Tokenizing words
Don't run this if using sklearn lib

In [73]:
# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# df['overview'] = df['overview'].apply(lambda x: w_tokenizer.tokenize(x))
# df.head()

### Making all words lower case

In [9]:
# df['overview'] = df['overview'].apply(lambda x: [word.lower() for word in x])
df['overview'] = df['overview'].apply(lambda x: " ".join([word.lower() for word in x.split()]))
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led by woody andys toys live happily in his ro...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,when siblings judy and peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,a family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated on mistreated and stepped on the women...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,just when george banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Removing stopwords from overview

In [10]:
stop = stopwords.words ('english')
df['overview'] = df['overview'].apply(lambda x: " ".join([word for word in x.split() if word not in (stop)]))
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led woody andys toys live happily room andys b...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,siblings judy peter discover enchanted board g...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated mistreated stepped women holding breat...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,george banks recovered daughters wedding recei...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Lemmatization


In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(w) for w in text.split()])

df['overview'] = df['overview'].apply(lemmatize_text)

df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led woody andys toy live happily room andys bi...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,sibling judy peter discover enchanted board ga...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated mistreated stepped woman holding breat...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,george bank recovered daughter wedding receive...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [12]:
df = df[df['overview'].notna()]
print("Number of non-null overviews", len(df))

Number of non-null overviews 44512


In [13]:
print("Number of null overviews:", df['overview'].isnull().sum())

Number of null overviews: 0


Saving the dataframe to a preprocessed csv

In [14]:
df.to_csv('./movie_lens_dataset/movies_metadata_processed.csv')
print("Processed csv saved!")

Processed csv saved!


# Class to create TF IDF matrix

In [47]:
from tqdm import tqdm

class TFIDF:
    def __init__(self, data: pd.DataFrame) -> None:
        self.total_docs = len(data)
        self.word_set = set()
        self.index_dict = {} # Dictionary to store index for each word
        self.word_count = {} # Dictionary to store the count of the number of documents containing the given word
        self.processing_data(data)
        
    def processing_data(self, data) -> None:
        
        # Creating the word_set
        for _, movie_row in data.iterrows():
            overview = movie_row["overview"]
            for word in overview:
                if word not in self.word_set:
                    self.word_set.add(word)
                    
        # Creating the index_dict
        i = 0
        for word in self.word_set:
            self.index_dict[word] = i
            i += 1
        
        # Creating the word_count
        for word in self.word_set:
            self.word_count[word] = 0
            for _, movie_row in data.iterrows():
                overview = overview = movie_row["overview"]
                if word in overview:
                    self.word_count[word] += 1
    
    def term_frequency(self, word: str, overview: str) -> float:
        return document.count(word) / len(overview)

    def inverse_document_frequency(self, word: str) -> float:
        return 1.0 + math.log(self.total_docs / word_count[word])

    def tf_idf(self, overview: str) -> np.array:
        tf_idf_vec = np.zeros((len(word_set),))
        for word in overview:
            # Add the tf_idf value to the 
            tf_idf_vec[self.index_dict[word]] = self.term_frequency(word, overview) * self.inverse_document_frequency(word)
        return tf_idf_vec

    def generate_tf_idf_matrix(self) -> list:
        tf_idf_matrix = []
                                                                                                                      
        for _, movie_row in tqdm(data.iterrows()):
            vec = self.tf_idf(movie_row['overview'])
            tf_idf_matrix.append(vec)
                                                                                                                    
        return tf_idf_matrix

# Generating TFIDF from dataset and serializing using pickle
Serializing allows us to save time for re-generation of the index.

In [48]:
# tfidf = TFIDF(df)
# tfidf_matrix = tfidf.generate_tf_idf_matrix()
# # Takes too long to run

# tfidf_matrix[:5]

# with open('./pickle/tfidf_matrix.pickle', 'wb') as handle:
#     pickle.dump(tfidf_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
#     print("Index saved!")

KeyboardInterrupt: 

Using sklearn library which runs way faster

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix = tfidf_matrix.astype(np.float32)

with open('./pickle/tfidf_matrix.pickle', 'wb') as handle:
    pickle.dump(tfidf_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Index saved!")

Index saved!


# Creating cosine similarity model

The cosine similarity model is a (44512, 44512) matrix consisting of cosine similarity scores of all movies against each other.

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim.shape

(44512, 44512)

In [36]:
with open('./pickle/cosine_sim_.pickle', 'wb') as handle:
    pickle.dump(cosine_sim, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Cosine Similarity model saved!")

Cosine Similarity model saved!


# Getting recomendations
Using test movie

In [20]:
cosine_sim_file = open('./pickle/cosine_sim_.pickle', 'rb')
cos_sim = pickle.load(cosine_sim_file)
print("Consine Similiarity Model loaded")

Consine Similiarity Model loaded


In [54]:
test_movie = "The Dark Knight"

Create movie title to index series and save it to pickle

In [40]:
indices = pd.Series(df.index, index=df['title'])
with open('./pickle/movie_indices.pickle', 'wb') as handle:
    pickle.dump(indices, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Indices saved!")

Indices saved!


Get the movie index

In [60]:
movie_index = indices[test_movie]
if len(movie_index) > 1:
    movie_index = indices[test_movie][0]

Getting top 10 scores of the movie against all other movies

In [61]:
similarity_scores = pd.DataFrame(cos_sim[movie_index], columns=['score'])

Obtaining the top 10 indices sorted by descending similarity scores

In [62]:
movie_indices = similarity_scores.sort_values(by="score", ascending=False)[:10].index

Getting the list of movie titles

In [63]:
df[['title','imdb_id']].iloc[movie_indices]

Unnamed: 0,title,imdb_id
12515,Leatherheads,tt0379865
566,Foreign Student,tt0109828
4375,Everybody's All-American,tt0095119
25113,Devil's Doorway,tt0042395
35664,See You in Montevideo,tt1801071
5092,All the Right Moves,tt0085154
11209,Invincible,tt0445990
36132,Saturday's Hero,tt0043994
7632,Twice Upon a Time,tt0086489
4388,Johnny Be Good,tt0095409


In [37]:
import pickle
import pandas as pd
def getTfIdfRecommendations(name: str):
    print("Getting recommendations for:", name)
    
    # Load processed dataset
    df = pd.read_csv('./movie_lens_dataset/movies_metadata_processed.csv')
    
    # Load cosine similarity matrix (Might want to load this in server init)
    consine_sim_file = open('./pickle/cosine_sim_.pickle','rb')
    cosine_sim = pickle.load(consine_sim_file)
    
    # Load movie indices
    movie_indices_file = open('./pickle/movie_indices.pickle','rb')
    movie_indices = pickle.load(movie_indices_file)
    
    # Get index of query movie
    movie_index = movie_indices[name]
    
    similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=['score'])
    
    # Top 10
    top_indices = similarity_scores.sort_values(by="score", ascending=False)[:10].index
    
    # Get movie title and imdb_id
    output = df[['title','imdb_id']].iloc[top_indices]
    
    return output.to_json()

In [38]:
getTfIdfRecommendations('The Dark Knight Rises')

Getting recommendations for: The Dark Knight Rises


'{"title":{"18252":"Elena","30763":"Deadly Daycare","19164":"The One Percent","33909":"Hum Saath Saath Hain","43989":"Once More","44165":"Nicostratos the Pelican","25524":"Drishyam","32542":"White Cannibal Queen","33825":"Maine Pyar Kiya","4113":"The Luzhin Defence"},"imdb_id":{"18252":"tt1925421","30763":"tt3455826","19164":"tt0819791","33909":"tt0216817","43989":"tt0095097","44165":"tt1891942","25524":"tt3417422","32542":"tt0078936","33825":"tt0100095","4113":"tt0211492"}}'

In [16]:
from scipy import sparse

class okapiBM25(object):
    def __init__(self, data, k1 = 1.5, b = 0.75):
        # smooth_idf = True --> add one to the idf to prevent division by zero
        self.vectorizer = TfidfVectorizer(smooth_idf=True)
        self.data = data
        self.k1 = k1
        self.b = b

    def fit(self, data):
        # Fit IDF to documents data
        self.vectorizer.fit(data)
        y = super(TfidfVectorizer, self.vectorizer).transform(data)
        self.avdl = y.sum(1).mean()

    def transform(self, q, data):
        # Calculate BM25 between query q and documents data
        b, k1, avdl = self.b, self.k1, self.avdl

        # Apply CountVectorizer
        data = super(TfidfVectorizer, self.vectorizer).transform(data)
        len_data = data.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # Convert to csc for better column slicing
        data = data.tocsc()[:, q.indices]
        denom = data + (k1 * (1 - b + b * len_data / avdl))[:, None]
        # In sklearn, idf(t) = log[ n / df(t) ] + 1 when smooth_idf = False
        # idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1 when smooth_idf = True
        # Either way we need to subtract 1 from idf to convert idf(t) = log [ n / (df(t) + 1) ]) when smooth_idf = False
        # and to get idf(t) = log [ (1 + n) / (1 + df(t)) ] when smooth_idf = True
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = data.multiply(np.broadcast_to(idf, data.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

# to test bm25, uncomment the following lines
# texts = 'movies_metadata.csv'
# bm25 = BM25()
# bm25.fit(texts[1:])
# print(bm25.transform(texts[0], texts))