In [2]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

In [234]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [7]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader,Dataset,SVD,evaluate

# Simple Recommender

This is a very basic model based on movies popularity and critical acclaimation, this model does not give personalised recommendation

In [310]:
df = pd.read_csv('movies_metadata.csv')
df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [311]:
df['genres']  = df['genres'].fillna('[]').apply(ast.literal_eval)

In [312]:
df['genres'] = df['genres'].apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])

In [313]:
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [314]:
df = df.drop(['homepage','video'],axis=1)

#####  I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

##### Weighted Rating (WR) =  ((v/(v+m)).R)+((m/(v+m)).C)
 

##### where,

##### v is the number of votes for the movie
##### m is the minimum votes required to be listed in the chart
##### R is the average rating of the movie
##### C is the mean vote across the whole report

##### To decide value of m, we will use 90th percentile as our cutoff.

In [315]:
vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')

In [316]:
C = vote_averages.mean()
C

5.244896612406511

In [317]:
m = vote_counts.quantile(0.9)
m

160.0

In [318]:
df['release_date'][0]

'1995-10-30'

In [319]:
df['year'] = pd.to_datetime(df['release_date'],errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [320]:
qualified_df = df[(df['vote_count']>m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())]

In [321]:
qualified_df = qualified_df[['title','year','vote_count','vote_average','popularity','genres']]
qualified_df['vote_count'] = qualified_df['vote_count'].astype('int')
qualified_df.head(2)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7.7,21.9469,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6.9,17.0155,"[Adventure, Fantasy, Family]"


In [322]:
qualified_df.shape

(4538, 6)

### there are total 4538 movies that are qualified to be considered for the recommender charts.

##### Lets calculated weighted rating according to mentioned IMDB's formulae

In [323]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    
    return (v/(v+m))*R + (m/(v+m)*C)

In [324]:
qualified_df['WR'] = qualified_df.apply(weighted_rating,axis=1)

In [325]:
qualified_df.head(2)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,WR
0,Toy Story,1995,5415,7.7,21.9469,"[Animation, Comedy, Family]",7.62954
1,Jumanji,1995,2413,6.9,17.0155,"[Adventure, Fantasy, Family]",6.797079


In [326]:
imdb_top_250 = qualified_df.sort_values('WR',ascending=False).head(250)
imdb_top_250.head(20)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,WR
314,The Shawshank Redemption,1994,8358,8.5,51.6454,"[Drama, Crime]",8.438857
834,The Godfather,1972,6024,8.5,41.1093,"[Drama, Crime]",8.41578
10309,Dilwale Dulhania Le Jayenge,1995,661,9.1,34.457,"[Comedy, Drama, Romance]",8.348701
12481,The Dark Knight,2008,12269,8.3,123.167,"[Drama, Action, Crime, Thriller]",8.260671
2843,Fight Club,1999,9678,8.3,63.8696,[Drama],8.250313
292,Pulp Fiction,1994,8670,8.3,140.95,"[Thriller, Crime]",8.244641
522,Schindler's List,1993,4436,8.3,41.7251,"[Drama, History, War]",8.193643
23673,Whiplash,2014,4376,8.3,64.3,[Drama],8.192236
5481,Spirited Away,2001,3968,8.3,41.0489,"[Fantasy, Adventure, Animation, Family]",8.181585
2211,Life Is Beautiful,1997,3643,8.3,39.395,"[Comedy, Drama]",8.171466


## Recommendation by genre

In [327]:
def recommendation_by_genre(genre,percentile=0.90):
    
    m = vote_counts.quantile(percentile)
    
    qualified_seg = df[(df['vote_count'] > m) & df['vote_count'].notnull() & df['vote_average'].notnull()] 
    ## This will select the movies with number of votes required, it mostly depend on 'm' as we increase
    ## value of percentile the selected dataframes shrinks
    
    gen_df = pd.DataFrame(columns=qualified_seg.columns)
    # Empty dataset to which we will append the films which belong to given genre
    
    
    for i in range(0,qualified_seg.shape[0]):
        if genre in qualified_seg.iloc[i]['genres']:
            gen_df = pd.concat([gen_df,pd.DataFrame(qualified_seg.iloc[i:i+1])],axis=0)
            
    gen_df = gen_df[['title','year','vote_count','vote_average','popularity','genres','original_language']]
    gen_df['vote_count'] = gen_df['vote_count'].astype('int')
    
    gen_df['WR'] = gen_df.apply(lambda x:(x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                + (m/(m+x['vote_count']) * C), axis=1)
    ### It will calculate the rating according to the IMDB's Formulae
    
    
    gen_df = gen_df.sort_values('WR',ascending=False).head(250)
    ### sort the movies according to Weighted Average score(WR)
    
    
    return gen_df
            
    
    
    

In [328]:
recommendation_by_genre('Mystery',percentile=0.995)   

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,original_language,WR
15480,Inception,2010,14075,8.1,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",en,7.539743
46,Se7en,1995,5915,8.1,18.4574,"[Crime, Mystery, Thriller]",en,7.050855
23675,Gone Girl,2014,6023,7.9,154.801,"[Mystery, Thriller, Drama]",en,6.935487
14825,Shutter Island,2010,6559,7.8,15.8136,"[Drama, Thriller, Mystery]",en,6.921589
4099,Memento,2000,4168,8.1,15.4508,"[Mystery, Thriller]",en,6.809824
11354,The Prestige,2006,4510,8.0,16.9456,"[Drama, Mystery, Thriller]",en,6.808596
11927,Harry Potter and the Order of the Phoenix,2007,5633,7.4,21.3643,"[Adventure, Fantasy, Family, Mystery]",en,6.583455
28131,The Hateful Eight,2015,4405,7.6,20.3288,"[Crime, Drama, Mystery, Western]",en,6.567933
4748,Donnie Darko,2001,3574,7.7,18.3031,"[Fantasy, Drama, Mystery]",en,6.496573
40598,Arrival,2016,5729,7.2,30.83786,"[Thriller, Drama, Science Fiction, Mystery]",en,6.466992


In [195]:
recommendation_by_genre('Romance',percentile=0.995)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,original_language,WR
351,Forrest Gump,1994,8147,8.2,48.3072,"[Comedy, Drama, Romance]",en,7.323352
1639,Titanic,1997,7770,7.5,26.8891,"[Drama, Romance, Thriller]",en,6.808505
40882,La La Land,2016,4745,7.9,19.681686,"[Comedy, Drama, Music, Romance]",en,6.78482
22168,Her,2013,4215,7.9,13.8295,"[Romance, Science Fiction, Drama]",en,6.707571
7208,Eternal Sunshine of the Spotless Mind,2004,3758,7.9,12.9063,"[Science Fiction, Drama, Romance]",en,6.631825
23512,The Fault in Our Stars,2014,3868,7.6,16.2747,"[Romance, Drama]",en,6.492056
2178,Edward Scissorhands,1990,3731,7.5,17.6122,"[Fantasy, Drama, Romance]",en,6.418822
20910,The Great Gatsby,2013,3885,7.3,17.5989,"[Drama, Romance]",en,6.335434
581,Aladdin,1992,3495,7.4,16.3574,"[Animation, Family, Comedy, Adventure, Fantasy...",en,6.331585
19731,Silver Linings Playbook,2012,4840,7.0,14.4881,"[Drama, Comedy, Romance]",en,6.271294


### The above recommender is not personalised and it will only generate recommendation by genre, irrespective of persons own interest

# Content Based Recommender

To personalise our recommendations more, I am going to build an engine that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked. Since we will be using movie metadata (or content) to build this engine, this also known as Content Based Filtering.

I will build 2 content based Recommender based on:

1.Movie Overview and tagline

2.Movie cast,director, genre,keywords

In [329]:
links_small = pd.read_csv('links_small.csv')

In [330]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [331]:
links_small.shape

(9112,)

In [332]:
df =df.drop([19730, 29503, 35587])    # These movies release date is wrongly added as id, so it is wise to remove them

In [333]:
df['id'] = df['id'].astype('int')

In [334]:
smd = df[df['id'].isin(links_small)]

In [335]:
smd.shape

(9099, 23)

So we will work on these 9099 movies

In [336]:
smd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995


In [337]:
smd = smd.drop('status',axis=1)

### Movie description based recommender 
#### Based on overview and tagline

In [338]:
smd['tagline'] = smd['tagline'].fillna('')
smd['overview'] = smd['overview'].fillna('')

In [339]:
smd['description']  = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [340]:
smd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,year,description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...


In [230]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0.0001,stop_words=['english','french'])
tfmatrix = tf.fit_transform(smd['description'])

In [232]:
tfmatrix.shape

(9099, 285340)

### Cosine Similarity

We Will use cosine similarity to calculate numeric quantity that denote similarity between 2 movies.
More info about cosine similarities . https://scikit-learn.org/stable/modules/metrics.html

In [235]:
cosine_sim = cosine_similarity(tfmatrix,tfmatrix)

In [237]:
cosine_sim.shape

(9099, 9099)

In [238]:
cosine_sim[0]

array([1.        , 0.01180905, 0.00434356, ..., 0.00634184, 0.01075257,
       0.00269438])

In [239]:
smd = smd.reset_index()

In [241]:
titles = smd['title']
indices = pd.Series(smd.index,index=smd['title'])

In [260]:
titles.head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [244]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [296]:
def get_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key= lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movies_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[movies_indices]
    
    

In [427]:
get_recommendation('The Dark Knight')

7931                      The Dark Knight Rises
524                                      Batman
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
6148                           Land of the Dead
5511                            To End All Wars
Name: title, dtype: object

In [297]:
get_recommendation('Jason Bourne')

3985                           Hopscotch
7983    Ghost Rider: Spirit of Vengeance
7897                           Abduction
2833                        Marathon Man
5808                        Police Story
1524                            Repo Man
7974                          Safe House
3810                            Spy Game
5288                          Nightbreed
8734               Men, Women & Children
Name: title, dtype: object

In [298]:
get_recommendation('Nixon')

2207                                Dick
7097                         Frost/Nixon
5574              The Motorcycle Diaries
1276                       Air Force One
3333        The Greatest Story Ever Told
8634    Nixon by Nixon: In His Own Words
7267                 Aliens in the Attic
7811                     Too Big to Fail
2969                            Sunshine
58                    Mr. Holland's Opus
Name: title, dtype: object

# Metadata Based Recommender

It will be based on cast, crew, genre, keywords

In [346]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [347]:
keywords.head(2)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [348]:
credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [349]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] =credits['id'].astype('int')
df['id'] = df['id'].astype('int')

In [350]:
df = df.merge(credits,on='id')
df = df.merge(keywords,on='id')

In [351]:
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [352]:
df.shape

(46628, 26)

In [353]:
smd =df[df['id'].isin(links_small)]

In [354]:
smd.shape

(9219, 26)

Now that we have cast, crew, genre, keywords in one dataset, we can proceed further

from crew we will only choose director, from cast we will choose only top 4 actors

In [355]:
smd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [357]:
smd['cast'] = smd['cast'].apply(ast.literal_eval)
smd['crew'] = smd['crew'].apply(ast.literal_eval)
smd['keywords'] = smd['keywords'].apply(ast.literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [360]:
smd['cast_size'] = smd['cast'].apply(lambda x:len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [361]:
smd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,tagline,title,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,,Toy Story,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",13,106
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",26,16


We will choose only top 4 cast members

In [364]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [366]:
smd['cast'] = smd['cast'].apply(lambda x: x[:4] if len(x)>3 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Now we will get director

In [368]:
smd['crew'][0]

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [369]:
def get_director(crew):
    
    for i in crew:
        
        if i['job'] =='Director':
            return i['name']
        
    else:
        return np.nan

In [371]:
smd['director'] = smd['crew'].apply(get_director)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [372]:
smd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,title,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,Toy Story,7.7,5415.0,1995,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",13,106,John Lasseter
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,Jumanji,6.9,2413.0,1995,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",26,16,Joe Johnston


In [373]:
smd = smd.drop('crew',axis=1)

Now, Keywords

In [374]:
smd['keywords'][0]

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [375]:
smd['keywords'] = smd['keywords'].apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])

In [376]:
smd['keywords'][0]

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

#### Now in cast column there actors such as brad pitt and brad renfro, after we create a soup the computer will consider both brad same  so we need to join first name and last name together

#### We will also give director more weightage, by adding his name 4 times

In [381]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(' ','')) for i in x])

In [386]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(' ','')))

In [388]:
smd['director'] = smd['director'].apply(lambda x: [x,x,x])

### Now using Snowball stemmer we will stem keywords 

In [391]:
stemmer = SnowballStemmer('english')

In [393]:
stemmer.stem('Availability')

'avail'

In [399]:
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

In [409]:
smd['keywords'] = smd['keywords'].apply(lambda x:[str.lower(i.replace(' ','')) for i in x])

In [415]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']

In [416]:
smd['soup'][1]

['boardgam',
 'disappear',
 "basedonchildren'sbook",
 'newhom',
 'reclus',
 'giantinsect',
 'robinwilliams',
 'jonathanhyde',
 'kirstendunst',
 'bradleypierce',
 'joejohnston',
 'joejohnston',
 'joejohnston',
 'Adventure',
 'Fantasy',
 'Family']

In [417]:
smd['soup'] = smd['soup'].apply(lambda x:' '.join(x))

In [425]:
smd['soup'][3]

'basedonnovel interracialrelationship singlemoth divorc chickflick whitneyhouston angelabassett lorettadevine lelarochon forestwhitaker forestwhitaker forestwhitaker Comedy Drama Romance'

In [419]:
count = CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')

In [420]:
count_matrix = count.fit_transform(smd['soup'])

In [421]:
count_matrix.shape

(9219, 131996)

In [428]:
cosine_sim = cosine_similarity(count_matrix,count_matrix)

In [431]:
smd =  smd.reset_index()
titles = smd['title']
indices = pd.Series(titles.index, index = smd['title'])

As Our Cosine function is changed, lets see what recommendation we get now

In [438]:
get_recommendation('The Dark Knight')

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
5943                      Thursday
Name: title, dtype: object

In [439]:
get_recommendation('Avatar')

974                             Aliens
1011                    The Terminator
522         Terminator 2: Judgment Day
922                          The Abyss
4347    Piranha Part Two: The Spawning
344                          True Lies
8401           Star Trek Into Darkness
1376                           Titanic
8724                 Jupiter Ascending
3216                Dungeons & Dragons
Name: title, dtype: object

# Collaborative Filtering

The above recommendation model has one limitation, It is not user centric, It will only recommend the movies which are close to the given movies.
I will use Surprise library for more user centric recommendations

In [444]:
reader = Reader()

In [441]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [442]:
ratings.shape

(100004, 4)

In [463]:
ratings['movieId'].nunique()

9066

In [445]:
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

In [447]:
data.split(n_folds=5)

In [448]:
svd = SVD()
evaluate(algo=svd,data=data,measures=['RMSE','MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8926
MAE:  0.6878
------------
Fold 2
RMSE: 0.9044
MAE:  0.6941
------------
Fold 3
RMSE: 0.8958
MAE:  0.6894
------------
Fold 4
RMSE: 0.9019
MAE:  0.6963
------------
Fold 5
RMSE: 0.8923
MAE:  0.6890
------------
------------
Mean RMSE: 0.8974
Mean MAE : 0.6913
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8925679605835931,
                             0.9044180003651262,
                             0.8958234629490379,
                             0.9018605831012412,
                             0.8922886169484281],
                            'mae': [0.6877515662859927,
                             0.6941240844812996,
                             0.6894035721154385,
                             0.6962834064535034,
                             0.6890322806612752]})

In [451]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a3dd9f550>

In [452]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [511]:
svd.predict(uid=1,iid=33)

Prediction(uid=1, iid=33, r_ui=None, est=2.6637477868105863, details={'was_impossible': False})

The model predict rating given by userid 1 to movie id 33 is 2.66. This is purely on how other user have given rating to the movie

# Hybrid Recommender

We will collaborate the content based and collaborative filtering recommenders

In [458]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
    

In [480]:
id_map = pd.read_csv('links_small.csv')
id_map.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [481]:
id_map = id_map[['movieId','tmdbId']]

In [482]:
id_map['movieId'].nunique()

9125

In [483]:
id_map.shape

(9125, 2)

In [484]:
id_map.columns= ['movieId','id']

In [485]:
id_map = id_map.merge(smd[['title','id']],on='id').set_index('title')

In [486]:
id_map.head()

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0


In [488]:
indices_map = id_map.set_index('id')

In [489]:
indices_map.head()

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5


In [491]:
indices.head(3)

title
Toy Story           0
Jumanji             1
Grumpier Old Men    2
dtype: int64

In [492]:
titles.head(3)

0           Toy Story
1             Jumanji
2    Grumpier Old Men
Name: title, dtype: object

In [514]:
def hybrid_recommender(userId,title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [x[0] for x in sim_scores]
    movies = smd.loc[movie_indices][['title','vote_count','vote_average','year','id']]
    movies['est'] = movies['id'].apply(lambda x : svd.predict(userId,indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est',ascending=False)
    
    return movies.head(10)

In [523]:
hybrid_recommender(1,'Casino')

Unnamed: 0,title,vote_count,vote_average,year,id,est
994,The Godfather: Part II,3418.0,8.3,1974,240,3.701382
1001,Raging Bull,968.0,7.7,1980,1578,3.585628
4480,City of God,1852.0,8.2,2002,598,3.445118
8544,The Wolf of Wall Street,6768.0,7.9,2013,106646,3.432624
6599,The Departed,4455.0,7.9,2006,1422,3.344965
101,Taxi Driver,2632.0,8.1,1976,103,3.328656
986,GoodFellas,3211.0,8.2,1990,769,3.295456
1000,Once Upon a Time in America,1104.0,8.3,1984,311,3.249647
2441,Drugstore Cowboy,117.0,7.0,1989,476,3.066155
4465,The King of Comedy,306.0,7.6,1982,262,3.018057


In [536]:
hybrid_recommender(4,'The Martian')

Unnamed: 0,title,vote_count,vote_average,year,id,est
2876,Gladiator,5566.0,7.9,2000,98,4.882683
7284,Moon,1831.0,7.6,2009,17431,4.877088
8712,Guardians of the Galaxy,10014.0,7.9,2014,118340,4.776594
8613,Interstellar,11187.0,8.1,2014,157336,4.731971
987,Alien,4564.0,7.9,1979,348,4.688327
8868,Avengers: Age of Ultron,6908.0,7.3,2015,99861,4.620817
485,Blade Runner,3833.0,7.9,1982,78,4.533355
7208,Replicant,93.0,5.0,2001,10596,4.509652
4812,Matchstick Men,515.0,6.9,2003,7270,4.496293
2741,Thelma & Louise,766.0,7.2,1991,1541,4.474612


In [529]:
smd.sort_values('vote_count',ascending=False)['title']

7648                                            Inception
6981                                      The Dark Knight
7488                                               Avatar
7969                                         The Avengers
8871                                             Deadpool
8613                                         Interstellar
8310                                     Django Unchained
8712                              Guardians of the Galaxy
2390                                           Fight Club
8029                                     The Hunger Games
8864                                   Mad Max: Fury Road
8031                                The Dark Knight Rises
2079                                           The Matrix
7009                                             Iron Man
8392                                           Iron Man 3
3899    The Lord of the Rings: The Fellowship of the Ring
8819                                       Jurassic World
266           

In [537]:
import surprise