# Content-based Recommender System

### based on similarity of movie plots

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel # for dot product

In [2]:
# Show up to 25 columns whenever a DataFrame is rendered
pd.set_option('display.max_columns', 25)

# Read the raw metadata, forcing 'overview' and 'popularity' to be parsed as str
df = pd.read_csv(
    'movies_metadata.csv',
    dtype={'overview': str, 'popularity': str},
)

# Keep only the columns this notebook works with
movies = df[[
    'title', 'genres', 'release_date', 'runtime',
    'vote_average', 'vote_count', 'overview',
]]

In [3]:
# Preview the first five rows of the selected columns
movies.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,overview
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,Just when George Banks has recovered from his ...


In [4]:
# Work on a reproducible 70% random sample to avoid memory issues later on.
# reset_index() renumbers the rows 0..n-1 and keeps the original row labels
# in a new 'index' column.
movies = (
    movies
    .sample(frac=0.7, random_state=123)
    .reset_index()
)

### Cleaning (as before)

In [5]:
# Parse release dates; values that cannot be parsed become NaT (errors='coerce')
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

# Extract the year with the datetime accessor; missing dates yield NaN.
# The previous guard `x != np.nan` was always True (NaN never compares equal
# to anything), so NaT rows leaked through as the string 'NaT' and were only
# rescued by the later integer conversion.
movies['year'] = movies['release_date'].dt.year

# function to convert 'year' into integers and NaT to 0
def convert_int(x):
    """Convert ``x`` to an ``int``, falling back to 0 when it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a year as a string or float).

    Returns
    -------
    int
        ``int(x)`` on success; 0 if ``int`` raises ``TypeError`` (e.g. ``None``),
        ``ValueError`` (e.g. ``'NaT'`` or a float NaN) or ``OverflowError``
        (e.g. infinity).
    """
    try:
        return int(x)
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return 0
    
# Turn every 'year' entry into an int via convert_int, then discard the
# release_date column
movies['year'] = movies['year'].map(convert_int)
movies = movies.drop(columns='release_date')

In [6]:
# Preview the frame after the year conversion and release_date drop
movies.head()

Unnamed: 0,index,title,genres,runtime,vote_average,vote_count,overview,year
0,25149,Elvis Has Left the Building,"[{'id': 35, 'name': 'Comedy'}]",90.0,5.8,11.0,Harmony had an encounter early in life with th...,2004
1,12025,Face to Face,"[{'id': 37, 'name': 'Western'}]",112.0,7.1,12.0,History Professor Brad Fletcher heads west for...,1967
2,22462,Modern Boy,"[{'id': 18, 'name': 'Drama'}]",121.0,0.0,0.0,LEE Hae-myung is proud to be the most modern b...,2008
3,26045,The Tattooist,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",92.0,5.1,20.0,American tattoo artist Jake Sawyer wanders the...,2007
4,13467,Marius,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",130.0,6.6,9.0,"César runs a bar along Marseilles' port, assis...",1931


In [7]:
# Turn the stringified list of genre dicts into a plain list of genre names
from ast import literal_eval


def _genre_names(parsed):
    """Return the 'name' of every entry if ``parsed`` is a list, else ``[]``."""
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]


# Missing genres are treated as an empty list literal
movies['genres'] = movies['genres'].fillna('[]')
# Evaluate each string literal into the Python object it spells
movies['genres'] = movies['genres'].apply(literal_eval)
# Keep only the genre names
movies['genres'] = movies['genres'].apply(_genre_names)

In [8]:
# Preview the frame after 'genres' was reduced to lists of names
movies.head()

Unnamed: 0,index,title,genres,runtime,vote_average,vote_count,overview,year
0,25149,Elvis Has Left the Building,[Comedy],90.0,5.8,11.0,Harmony had an encounter early in life with th...,2004
1,12025,Face to Face,[Western],112.0,7.1,12.0,History Professor Brad Fletcher heads west for...,1967
2,22462,Modern Boy,[Drama],121.0,0.0,0.0,LEE Hae-myung is proud to be the most modern b...,2008
3,26045,The Tattooist,"[Thriller, Horror]",92.0,5.1,20.0,American tattoo artist Jake Sawyer wanders the...,2007
4,13467,Marius,"[Comedy, Drama]",130.0,6.6,9.0,"César runs a bar along Marseilles' port, assis...",1931


### Vectorizing

In [9]:
# Missing overviews become empty strings so every row can be vectorized
movies['overview'] = movies['overview'].fillna('')

# TF-IDF vectorizer that lowercases text and removes English stop words
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')

# Learn the vocabulary from the overviews and build the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Display the matrix shape
tfidf_matrix.shape

(31826, 63574)

In [10]:
# Pairwise similarity between every pair of overviews.
# linear_kernel(X, Y) is the dot product X . Y^T; it is used here as the
# cosine similarity, which assumes the TF-IDF rows are L2-normalised
# (TfidfVectorizer's documented default).
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

### Building recommender

In [11]:
# Reverse mapping: movie title -> row index in `movies`, for easy retrieval.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# all unique), so it never removed repeated titles. De-duplicate on the title
# index instead, keeping the first occurrence, so that indices[title] always
# returns a single scalar rather than a Series.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [12]:
# Recommender: return the titles whose plot overviews are most similar to a given movie
def movie_recommender(title, cosine_sim=cosine_sim, movies=movies, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the query movie; must be a label of ``indices``.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    movies : pandas.DataFrame
        Frame with a 'title' column, positionally aligned with ``cosine_sim``.
    indices : pandas.Series
        Mapping from title to row position in ``movies`` / ``cosine_sim``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` highest-scoring movies, best first, excluding
        the query movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in indices.index:
        raise KeyError(f"Title not found: {title!r}")

    # A title that occurs several times makes the lookup return a Series
    # instead of a scalar; fall back to its first occurrence.
    ind = int(np.asarray(indices[title]).ravel()[0])

    # Similarity of the query movie against every movie
    scores = np.asarray(cosine_sim[ind]).ravel()

    # Descending order; a stable sort keeps ties in their original row order
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie by position rather than assuming it sorts first:
    # an empty overview scores 0 against itself, and an identical overview
    # elsewhere can tie with it.
    order = order[order != ind][:top_n]

    # Return the movie titles
    return movies['title'].iloc[order]

In [13]:
# Example query: titles with the plot overviews most similar to 'Se7en'
movie_recommender('Se7en')

23109                Kalifornia
4872                   The Cell
23436     While the City Sleeps
9404              Mad Detective
22635        Traffic Department
23435                The Cell 2
17771    The Poughkeepsie Tapes
6125                   Ricochet
17421          Seven and a Half
7383         The Bone Collector
Name: title, dtype: object

In [14]:
# Example query: titles with the plot overviews most similar to 'Jurassic Park III'
movie_recommender('Jurassic Park III')

7102                                Perfect Strangers
29688                                           Gonin
15736                                  The Wrong Girl
29353    Before The Dinosaurs - Walking With Monsters
20888                                   The 3rd Voice
16288                           Dysfunctional Friends
23106                                   The Discovery
17542                                           Wacko
30378                  We're Back! A Dinosaur's Story
28117                                Scattered Clouds
Name: title, dtype: object

In [15]:
# Example query: titles with the plot overviews most similar to 'Captain America: Civil War'
movie_recommender('Captain America: Civil War')

8321     Iron Man & Captain America: Heroes United
22011                                    Team Thor
31035          Captain America: The Winter Soldier
2724                                    Ironmaster
13575                      Avengers: Age of Ultron
30852          Marvel: 75 Years, From Pulp to Pop!
8715                             Buckskin Frontier
17351                          P-51 Dragon Fighter
7514                                         U-571
29920                              The Borderlands
Name: title, dtype: object