[Guide](https://www.datacamp.com/community/tutorials/recommender-systems-python)

In [1]:
import pandas as pd

# when using google colab
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
metadata = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/data/movies_metadata.csv', low_memory=False)
# metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


### Simple Recommender

In [3]:
# calculate mean of vote average column
C = metadata['vote_average'].mean()
print(C)

5.618207215133889


In [4]:
# calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.9)
print(m)

160.0


In [5]:
# filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

(4555, 24)

In [6]:
metadata.shape

(45466, 24)

In [7]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # calculation based on the IMDB formula
    return (v/(v+m)*R) + (m/(m+v)*C)

In [8]:
# define a new feature 'score' and calculate its value with 'weighted_rating()'
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [9]:
# sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

# print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


### Content-Based Recommender

In [3]:
# print plot overviews of the first 5 movies
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [4]:
# import TFIDFVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

# contruct the required TF-IDF matrix by fitting and transforming the data
# using just first 10000 records due to computation efficiency
tfidf_matrix = tfidf.fit_transform(metadata['overview'][:30000])

# output the shape of tfidf_matrix
tfidf_matrix.shape

(30000, 58562)

In [5]:
# array mapping from feature integer indices to feature names
tfidf.get_feature_names()[5000:5010]

['beartooth',
 'beary',
 'beasley',
 'beast',
 'beastie',
 'beasties',
 'beastly',
 'beastmen',
 'beasts',
 'beat']

In [6]:
# import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
cosine_sim.shape

(30000, 30000)

In [8]:
cosine_sim[1]

array([0.01561351, 1.        , 0.0486754 , ..., 0.        , 0.        ,
       0.02282596])

In [9]:
# construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title'])

In [10]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [21]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
  # get the index of the movie that matches the title
  idx = indices[title]

  # get the pairwise similarity scores of all movies with that movie
  sim_scores = list(enumerate(cosine_sim[idx]))

  # sort the movies based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # get the scores of the 10 most similar movies
  sim_scores = sim_scores[1:11]

  # get the movie indices
  movie_indices = [i[0] for i in sim_scores]

  # return the top 10 most similar movies
  return metadata['title'].iloc[movie_indices]

NameError: ignored

In [12]:
get_recommendations('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [13]:
get_recommendations('The Godfather')

1178      The Godfather: Part II
1914     The Godfather: Part III
23126                 Blood Ties
11297           Household Saints
10821                   Election
17729          Short Sharp Shock
26293         Beck 28 - Familjen
8653                Violent City
13177               I Am the Law
6711                    Mobsters
Name: title, dtype: object

### Credits, Genres, and Keywords Based Recommender

In [3]:
# load keywords and credits
credits = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/data/credits.csv')
keywords = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/data/keywords.csv')

# remove rows with bad IDs
metadata = metadata.drop([19730, 29503, 35587])

# convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# merge keywords and credits into main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [4]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [5]:
# parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
  metadata[feature] = metadata[feature].apply(literal_eval)

In [6]:
# import numpy
import numpy as np

In [7]:
def get_director(x):
  for i in x:
    if i['job'] == 'Director':
      return i['name']
  return np.nan

In [8]:
def get_list(x):
  if isinstance(x, list):
    names = [i['name'] for i in x]
    # check if more than 3 elements exist.
    # if yes, return only first three
    # if no, return entire list
    if len(names) > 3:
      names = names[:3]
    return names
  # return empty list in case of missing/malformedd data
  return []

In [9]:
# define new director, cast, genres and keywords features that are in a suitable form
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
  metadata[feature] = metadata[feature].apply(get_list)

In [10]:
# print the new fewatures of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [11]:
# function to convert all strings to lowercase and strip names of spaces
def clean_data(x):
  if isinstance(x, list):
    return [str.lower(i.replace(' ', '')) for i in x]
  else:
    # check if director exists
    # if not, return empty string
    if isinstance(x, str):
      return str.lower(x.replace(' ', ''))
    else:
      return ''

In [12]:
# apply clean_data function to features
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
  metadata[feature] = metadata[feature].apply(clean_data)

In [13]:
def create_soup(x):
  return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['director']) + ' ' + ' '.join(x['genres'])

In [14]:
# create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [15]:
metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [16]:
# import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'][:25000])

In [17]:
count_matrix.shape

(25000, 35747)

In [18]:
# compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [19]:
# reset index of main DataFrame and construct reverse mapping as before
metadat = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [22]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim2):
  # get the index of the movie that matches the title
  idx = indices[title]

  # get the pairwise similarity scores of all movies with that movie
  sim_scores = list(enumerate(cosine_sim[idx]))

  # sort the movies based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # get the scores of the 10 most similar movies
  sim_scores = sim_scores[1:11]

  # get the movie indices
  movie_indices = [i[0] for i in sim_scores]

  # return the top 10 most similar movies
  return metadata['title'].iloc[movie_indices]

In [24]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

12589      The Dark Knight
10210        Batman Begins
9311                Shiner
9874       Amongst Friends
7772              Mitchell
516      Romeo Is Bleeding
24090            Quicksand
4520       An Innocent Man
5048        State Property
6095              Lockdown
Name: title, dtype: object

In [25]:
get_recommendations('The Godfather', cosine_sim2)

1934            The Godfather: Part III
8001     The Night of the Following Day
18261                 The Son of No One
7772                           Mitchell
18940                         Last Exit
5309                        The Gambler
11733          The Consequences of Love
16782                     The Organizer
5                                  Heat
426                       Carlito's Way
Name: title, dtype: object