In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [3]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader,Dataset,SVD,evaluate

In [14]:
from IPython.display import HTML,Image
pd.set_option('display.max_colwidth', 100)

## Content
1. __Simple Recommender__ :<br>
`1.1 IMDB Top 250`: Top 250 Movies based on calculated IMDB ratings<br> 
`1.2 Recommendation by Genre`: Top Movies for every genre in database
<br>
***
2. __Content Based Recommender__:<br>
`2.1 Movie Description Based:` Recommend movies based on overview and tagline<br>
`2.2 Metadata based`: Recommend movies based on Cast, Director and keywords of movie
***
3. __Collaborative Filtering using Surprise__:<br>
The personalised recommender based on users Past history of ratings and similarity between his and other users rating history
***
4. __Hybrid Recommender__:<br>
Recommeder system based on collaberation of Contend based recommender and collaberative filtering, It leverages are features of both recommender for better recommendations

#### Loadind the data

In [5]:
df = pd.read_csv('movies_metadata.csv')
df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [7]:
df = df.drop(['homepage','video'],axis=1)

In [11]:
base_poster_url = 'http://image.tmdb.org/t/p/w185/'
df['poster_path'] = "<img src='" + base_poster_url + df['poster_path'] + "' style='height:100px;'>"

In [8]:
df['genres']  = df['genres'].fillna('[]').apply(ast.literal_eval)
df['genres'] = df['genres'].apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])

# 1. Simple Recommender

This is a very basic model based on movies popularity and critical acclaimation, this model does not give personalised recommendation

#####  I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

##### Weighted Rating (WR) =  ((v/(v+m)).R)+((m/(v+m)).C)
 

##### where,

##### v is the number of votes for the movie
##### m is the minimum votes required to be listed in the chart
##### R is the average rating of the movie
##### C is the mean vote across the whole report

In [9]:
vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.9)

In [10]:
df['year'] = pd.to_datetime(df['release_date'],errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [15]:
qualified_df = df[(df['vote_count']>m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())]
qualified_df = qualified_df[['poster_path','title','year','vote_count','vote_average','popularity','genres']]
qualified_df['vote_count'] = qualified_df['vote_count'].astype('int')
qualified_df.head(2)
HTML(qualified_df.head(2).to_html(escape=False))

Unnamed: 0,poster_path,title,year,vote_count,vote_average,popularity,genres
0,,Toy Story,1995,5415,7.7,21.9469,"[Animation, Comedy, Family]"
1,,Jumanji,1995,2413,6.9,17.0155,"[Adventure, Fantasy, Family]"


In [16]:
qualified_df.shape

(4538, 7)

### there are total 4538 movies that are qualified to be considered for the recommender charts.

##### Lets calculated weighted rating according to mentioned IMDB's formulae

In [17]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    
    return (v/(v+m))*R + (m/(v+m)*C)

In [19]:
qualified_df['WR'] = qualified_df.apply(weighted_rating,axis=1)
qualified_df.head(2)
HTML(qualified_df.head(2).to_html(escape=False))

Unnamed: 0,poster_path,title,year,vote_count,vote_average,popularity,genres,WR
0,,Toy Story,1995,5415,7.7,21.9469,"[Animation, Comedy, Family]",7.62954
1,,Jumanji,1995,2413,6.9,17.0155,"[Adventure, Fantasy, Family]",6.797079


## 1.1 IMDB TOP 250

In [20]:
imdb_top_250 = qualified_df.sort_values('WR',ascending=False).head(250)
imdb_top_250.head(20)
HTML(imdb_top_250.head(15).to_html(escape=False))

Unnamed: 0,poster_path,title,year,vote_count,vote_average,popularity,genres,WR
314,,The Shawshank Redemption,1994,8358,8.5,51.6454,"[Drama, Crime]",8.438857
834,,The Godfather,1972,6024,8.5,41.1093,"[Drama, Crime]",8.41578
10309,,Dilwale Dulhania Le Jayenge,1995,661,9.1,34.457,"[Comedy, Drama, Romance]",8.348701
12481,,The Dark Knight,2008,12269,8.3,123.167,"[Drama, Action, Crime, Thriller]",8.260671
2843,,Fight Club,1999,9678,8.3,63.8696,[Drama],8.250313
292,,Pulp Fiction,1994,8670,8.3,140.95,"[Thriller, Crime]",8.244641
522,,Schindler's List,1993,4436,8.3,41.7251,"[Drama, History, War]",8.193643
23673,,Whiplash,2014,4376,8.3,64.3,[Drama],8.192236
5481,,Spirited Away,2001,3968,8.3,41.0489,"[Fantasy, Adventure, Animation, Family]",8.181585
2211,,Life Is Beautiful,1997,3643,8.3,39.395,"[Comedy, Drama]",8.171466


## 1.2 Recommendation by genre

In [21]:
def recommendation_by_genre(genre,percentile=0.90):
    
    m = vote_counts.quantile(percentile)
    
    qualified_seg = df[(df['vote_count'] > m) & df['vote_count'].notnull() & df['vote_average'].notnull()] 
    ## This will select the movies with number of votes required, it mostly depend on 'm' as we increase
    ## value of percentile the selected dataframes shrinks
    
    gen_df = pd.DataFrame(columns=qualified_seg.columns)
    # Empty dataset to which we will append the films which belong to given genre
    
    
    for i in range(0,qualified_seg.shape[0]):
        if genre in qualified_seg.iloc[i]['genres']:
            gen_df = pd.concat([gen_df,pd.DataFrame(qualified_seg.iloc[i:i+1])],axis=0)
            
    gen_df = gen_df[['poster_path','title','year','vote_count','vote_average','popularity','genres','original_language']]
    gen_df['vote_count'] = gen_df['vote_count'].astype('int')
    
    gen_df['WR'] = gen_df.apply(lambda x:(x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                + (m/(m+x['vote_count']) * C), axis=1)
    ### It will calculate the rating according to the IMDB's Formulae
    
    
    gen_df = gen_df.sort_values('WR',ascending=False).head(250)
    ### sort the movies according to Weighted Average score(WR)
    
    
    return gen_df

In [23]:
recommendation_by_genre('Mystery',percentile=0.95)   
HTML(recommendation_by_genre('Mystery',percentile=0.95).head(10).to_html(escape=False)   )

Unnamed: 0,poster_path,title,year,vote_count,vote_average,popularity,genres,original_language,WR
15480,,Inception,2010,14075,8.1,29.1081,"[Action, Thriller, Science Fiction, Mystery, Adventure]",en,8.014597
46,,Se7en,1995,5915,8.1,18.4574,"[Crime, Mystery, Thriller]",en,7.904833
4099,,Memento,2000,4168,8.1,15.4508,"[Mystery, Thriller]",en,7.830744
11354,,The Prestige,2006,4510,8.0,16.9456,"[Drama, Mystery, Thriller]",en,7.758148
23675,,Gone Girl,2014,6023,7.9,154.801,"[Mystery, Thriller, Drama]",en,7.72154
14825,,Shutter Island,2010,6559,7.8,15.8136,"[Drama, Thriller, Mystery]",en,7.641425
897,,2001: A Space Odyssey,1968,3075,7.9,22.4946,"[Science Fiction, Mystery, Adventure]",en,7.571612
877,,Rear Window,1954,1531,8.2,17.9113,"[Drama, Mystery, Thriller]",en,7.547321
9430,,Oldboy,2003,2000,8.0,10.6169,"[Drama, Thriller, Mystery, Action]",ko,7.508745
4748,,Donnie Darko,2001,3574,7.7,18.3031,"[Fantasy, Drama, Mystery]",en,7.434153


#### As we move Percentile value, the movies to qualify for recomendation get selected or dropped, Higher the percentile, higher number of votes required to qualify

In [24]:
recommendation_by_genre('Romance',percentile=0.995)
HTML(recommendation_by_genre('Romance',percentile=0.995).head(10).to_html(escape=False))

Unnamed: 0,poster_path,title,year,vote_count,vote_average,popularity,genres,original_language,WR
351,,Forrest Gump,1994,8147,8.2,48.3072,"[Comedy, Drama, Romance]",en,7.323352
1639,,Titanic,1997,7770,7.5,26.8891,"[Drama, Romance, Thriller]",en,6.808505
40882,,La La Land,2016,4745,7.9,19.681686,"[Comedy, Drama, Music, Romance]",en,6.78482
22168,,Her,2013,4215,7.9,13.8295,"[Romance, Science Fiction, Drama]",en,6.707571
7208,,Eternal Sunshine of the Spotless Mind,2004,3758,7.9,12.9063,"[Science Fiction, Drama, Romance]",en,6.631825
23512,,The Fault in Our Stars,2014,3868,7.6,16.2747,"[Romance, Drama]",en,6.492056
2178,,Edward Scissorhands,1990,3731,7.5,17.6122,"[Fantasy, Drama, Romance]",en,6.418822
20910,,The Great Gatsby,2013,3885,7.3,17.5989,"[Drama, Romance]",en,6.335434
581,,Aladdin,1992,3495,7.4,16.3574,"[Animation, Family, Comedy, Adventure, Fantasy, Romance]",en,6.331585
19731,,Silver Linings Playbook,2012,4840,7.0,14.4881,"[Drama, Comedy, Romance]",en,6.271294


#### Well these are quite good romantic movies, but Before Sunset is still my favourite Romantic movie, whatever this list says

### The above recommender is not personalised and it will only generate recommendation by genre, irrespective of persons own interest

# 2. Content Based Recommender

To personalise our recommendations more, I am going to build an engine that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked. Since we will be using movie metadata (or content) to build this engine, this also known as Content Based Filtering.

Even though content based recommender is still not personalised recommender, we gonna use it letter for hybrid recommender 

I will build 2 content based Recommender based on:

1.Movie Overview and tagline

2.Movie cast,director, genre,keywords

For upcoming recommenders, we gonna need sparse matrix formed using Cosine similarities/linear similarities, so for computation conveninece we will subset of dataset

In [26]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
df =df.drop([19730, 29503, 35587])    # These movies release date is wrongly added as id, so it is wise to remove them
df['id'] = df['id'].astype('int')
smd = df[df['id'].isin(links_small)]
smd.shape

(9099, 23)

So we will work with 9099 movies

## 2.1 Movie description based recommender 
#### Based on overview and tagline

In [27]:
smd['tagline'] = smd['tagline'].fillna('')
smd['overview'] = smd['overview'].fillna('')
smd['description']  = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')
smd.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year,description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...",21.9469,...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...,17.0155,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...


In [28]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0.0001,stop_words=['english','french'])
tfmatrix = tf.fit_transform(smd['description'])

In [29]:
tfmatrix.shape

(9099, 285340)

### Cosine Similarity

We Will use cosine similarity to calculate numeric quantity that denote similarity between 2 movies.
More info about cosine similarities . https://scikit-learn.org/stable/modules/metrics.html

In [31]:
cosine_sim = cosine_similarity(tfmatrix,tfmatrix)
cosine_sim.shape

(9099, 9099)

In [32]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index,index=smd['title'])

In [33]:
titles.head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [34]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [39]:
def get_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key= lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movies_indices = [i[0] for i in sim_scores]
    
    return smd.iloc[movies_indices][['poster_path','title','year','description']]

In [41]:
get_recommendation('The Dark Knight')
HTML(get_recommendation('The Dark Knight').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
7931,,The Dark Knight Rises,2012,"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c..."
524,,Batman,1989,The Dark Knight of Gotham City begins his war on crime with his first major enemy being the clow...
132,,Batman Forever,1995,The Dark Knight of Gotham City confronts a dastardly duo: Two-Face and the Riddler. Formerly Dis...
1113,,Batman Returns,1992,"Having defeated the Joker, Batman now faces the Penguin - a warped and deformed individual who i..."
8227,,"Batman: The Dark Knight Returns, Part 2",2013,Batman has stopped the reign of terror that The Mutants had cast upon his city. Now an old foe ...
7565,,Batman: Under the Red Hood,2010,Batman faces his ultimate challenge as the mysterious Red Hood takes Gotham City by firestorm. O...
7901,,Batman: Year One,2011,Two men come to Gotham City: Bruce Wayne after years abroad feeding his lifelong obsession for j...
2579,,Batman: Mask of the Phantasm,1993,"An old flame of Bruce Wayne's strolls into town, re-heating up the romance between the two. At t..."
6148,,Land of the Dead,2005,The world is full of zombies and the survivors have barricaded themselves inside a walled city t...
5511,,To End All Wars,2001,"Based on a real-life story, this drama focuses on a small group of Allied soldiers in Burma who ..."


As we can see the recommeder is recommending the movie whose plotline is similar to 'The Dark Knight', which is no surprise that it is showing most of the batman movies

In [43]:
get_recommendation('Before Sunset')
HTML(get_recommendation('Before Sunset').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
8218,,Judas Kiss,2011,A quirk in time and space gives a failed filmmaker the chance to reshape his destiny when he vis...
7530,,Letters to Juliet,2010,"An American girl on vacation in Italy finds an unanswered ""letter to Juliet"" -- one of thousands..."
3220,,The Family Man,2000,"Jack's lavish, fast-paced lifestyle changes one Christmas night when he stumbles into a grocery ..."
3764,,Serendipity,2001,"Although strangers Sara and Jonathan are both already in relationships, they realize they have g..."
8255,,Before Midnight,2013,We meet Jesse and Celine nine years on in Greece. Almost two decades have passed since their fir...
3187,,Bounce,2000,A man switches plane tickets with another man who dies in that plane in a crash. The man falls i...
8577,,22 Jump Street,2014,"After making their way through high school (twice), big changes are in store for officers Schmid..."
6089,,Look at Me,2004,"This is the story of human beings who know exactly what they'd do if they were somebody else, bu..."
2824,,Passion of Mind,2000,"When Marie, a widow in Provence with two daughters, locks her bedroom door and goes to sleep, sh..."
6876,,"Definitely, Maybe",2008,"When Will decides to tell his daughter the story of how he met her mother, he discovers that a s..."


In [47]:
get_recommendation('Avatar')
HTML(get_recommendation('Avatar').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
2059,,The Matrix,1999,"Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of un..."
4506,,Tears of the Sun,2003,Navy SEAL Lieutenant A.K. Waters and his elite squadron of tactical specialists are forced to ch...
975,,A Grand Day Out,1990,Wallace and Gromit have run out of cheese and this provides an excellent excuse for the animated...
538,,Hellraiser: Bloodline,1996,"In the 22nd century, a scientist attempts to right the wrong his ancestor created: the puzzle bo..."
2910,,Pandora and the Flying Dutchman,1951,Albert Lewin's interpretation of the legend of the Flying Dutchman. In a little spanish seaport ...
3360,,The Dish,2000,The true story of how the Parkes Radio Telescope was used to relay the live television of man's ...
5865,,The Defender,1994,A corrupt businessman commits a murder and the only witness is the girlfriend of another busines...
7460,,Green Zone,2010,"During the U.S.-led occupation of Baghdad in 2003, Chief Warrant Officer Roy Miller and his team..."
7587,,The American,2010,"Dispatched to a small Italian town to await further orders, assassin Jack embarks on a double li..."
6105,,A Trip to the Moon,1902,A Trip to The Moon is a science fiction film from the French film pioneer Georges Méliès from th...


In [49]:
get_recommendation('Nixon')
HTML(get_recommendation('Nixon').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
2207,,Dick,1999,Comedy about two high school girls who wander off during a class trip to the White House and mee...
7097,,Frost/Nixon,2008,"For three years after being forced from office, Nixon remained silent. But in summer 1977, the s..."
5574,,The Motorcycle Diaries,2004,"""The Motorcycle Diaries"" is based on the journals of Che Guevara, leader of the Cuban Revolution..."
1276,,Air Force One,1997,Russian terrorists conspire to hijack the aircraft with the president and his family on board. T...
3333,,The Greatest Story Ever Told,1965,All-star epic retelling of Christ's life.
8634,,Nixon by Nixon: In His Own Words,2014,"From 1971 to 1973, Richard Nixon secretly recorded his private conversations in the White House...."
7267,,Aliens in the Attic,2009,"It's summer vacation, but the Pearson family kids are stuck at a boring lake house with their ne..."
7811,,Too Big to Fail,2011,"Based on the bestselling book by Andrew Ross Sorkin, 'Too Big to Fail' offers an intimate look a..."
2969,,Sunshine,1999,The fate of a Hungarian Jewish family throughout the 20th century.
58,,Mr. Holland's Opus,1995,"In 1965, passionate musician Glenn Holland takes a day job as a high school music teacher, convi..."


## 2.2 Metadata Based Recommender

It will be based on cast, crew, genre, keywords

In [50]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [51]:
keywords.head(2)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam..."


In [52]:
credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender'...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, '...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a36847f80a7c73', 'gender': ...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', 'gender': 2, 'id': 511, '...",8844


In [53]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] =credits['id'].astype('int')
df['id'] = df['id'].astype('int')
df = df.merge(credits,on='id')
df = df.merge(keywords,on='id')

In [93]:
smd =df[df['id'].isin(links_small)]

In [94]:
smd.shape

(9219, 26)

In [95]:
smd.head(2)         # Check the cast and crew column

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',...",30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...",21.9469,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender'...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...,17.0155,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a36847f80a7c73', 'gender': ...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', 'gender': 2, 'id': 511, '...","[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam..."


In [96]:
smd['cast'] = smd['cast'].apply(ast.literal_eval)
smd['crew'] = smd['crew'].apply(ast.literal_eval)
smd['keywords'] = smd['keywords'].apply(ast.literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In Cast column, there are total 10-15 actors are present for each movie, but we need only top 3-4 main actors who have worked in movie

In [97]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x])
smd['cast'] = smd['cast'].apply(lambda x: x[:4] if len(x)>3 else x) # only top 4 actors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Now we will get director

In [98]:
def get_director(crew):
    
    for i in crew:
        
        if i['job'] =='Director':
            return i['name']
        
    else:
        return np.nan

In [99]:
smd['director'] = smd['crew'].apply(get_director)
smd = smd.drop('crew',axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [100]:
smd['keywords'] = smd['keywords'].apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])

#### Now in cast column there actors such as brad pitt and brad renfro, after we create a soup the computer will consider both brad same  so we need to join first name and last name together

In [101]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(' ','')) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(' ','')))
smd['director'] = smd['director'].apply(lambda x: [x,x])  # We are adding directors name 3 times to give more weightage

### Now using Snowball stemmer we will stem keywords 

In [102]:
stemmer = SnowballStemmer('english')

In [103]:
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

In [104]:
smd['keywords'] = smd['keywords'].apply(lambda x:[str.lower(i.replace(' ','')) for i in x])

In [105]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']

In [106]:
smd['soup'] = smd['soup'].apply(lambda x:' '.join(x))

In [107]:
count = CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [108]:
count_matrix.shape

(9219, 132041)

In [109]:
cosine_sim = cosine_similarity(count_matrix,count_matrix)

In [110]:
smd =  smd.reset_index()
titles = smd['title']
indices = pd.Series(titles.index, index = smd['title'])

In [111]:
def get_recommendation_2(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key= lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movies_indices = [i[0] for i in sim_scores]
    j= smd.iloc[movies_indices][['poster_path','title','year','cast','director']]
    j['director'] = j['director'].apply(lambda x:x[0])
    return j

As Our Cosine function is changed, lets see what recommendation we get now

In [112]:
get_recommendation_2('The Dark Knight')
HTML(get_recommendation_2('The Dark Knight').to_html(escape=False))

Unnamed: 0,poster_path,title,year,cast,director
8031,,The Dark Knight Rises,2012,"[christianbale, michaelcaine, garyoldman, annehathaway]",christophernolan
6218,,Batman Begins,2005,"[christianbale, michaelcaine, liamneeson, katieholmes]",christophernolan
6623,,The Prestige,2006,"[hughjackman, christianbale, michaelcaine, scarlettjohansson]",christophernolan
7659,,Batman: Under the Red Hood,2010,"[brucegreenwood, jensenackles, neilpatrickharris, jasonisaacs]",brandonvietti
5943,,Thursday,1998,"[thomasjane, paulamarshall, aaroneckhart, jameslegros]",skipwoods
8927,,Kidnapping Mr. Heineken,2015,"[anthonyhopkins, jimsturgess, samworthington, ryankwanten]",danielalfredson
1134,,Batman Returns,1992,"[michaelkeaton, dannydevito, michellepfeiffer, christopherwalken]",timburton
1260,,Batman & Robin,1997,"[georgeclooney, chriso'donnell, arnoldschwarzenegger, umathurman]",joelschumacher
2085,,Following,1998,"[jeremytheobald, alexhaw, lucyrussell, johnnolan]",christophernolan
9162,,London Has Fallen,2016,"[gerardbutler, aaroneckhart, morganfreeman, angelabassett]",babaknajafi


Now Most of the movies recommended by recommender  for dark knight are dominated by christian bale, and christopher nolan

In [113]:
get_recommendation_2('Avatar')
HTML(get_recommendation_2('Avatar').to_html(escape=False))

Unnamed: 0,poster_path,title,year,cast,director
8401,,Star Trek Into Darkness,2013,"[chrispine, zacharyquinto, zoesaldana, karlurban]",j.j.abrams
974,,Aliens,1986,"[sigourneyweaver, michaelbiehn, jamesremar, paulreiser]",jamescameron
8724,,Jupiter Ascending,2015,"[milakunis, channingtatum, seanbean, eddieredmayne]",lillywachowski
3216,,Dungeons & Dragons,2000,"[justinwhalin, jeremyirons, thorabirch, brucepayne]",courtneysolomon
1011,,The Terminator,1984,"[arnoldschwarzenegger, michaelbiehn, lindahamilton, paulwinfield]",jamescameron
3060,,Sinbad and the Eye of the Tiger,1977,"[patrickwayne, tarynpower, margaretwhiting, janeseymour]",samwanamaker
4966,,Hercules in New York,1969,"[arnoldschwarzenegger, deborahloomis, tainaelg, jameskaren]",arthurallanseidelman
7265,,Dragonball Evolution,2009,"[chowyun-fat, justinchatwin, joonpark, jamiechung]",jameswong
1668,,Return from Witch Mountain,1978,"[bettedavis, christopherlee, kimrichards, ikeeisenmann]",johnhough
4017,,Hawk the Slayer,1980,"[jackpalance, johnterry, bernardbresslaw, raycharleson]",terrymarcel


In [114]:
get_recommendation_2('Before Sunset')
HTML(get_recommendation_2('Before Sunset').to_html(escape=False))

Unnamed: 0,poster_path,title,year,cast,director
189,,Before Sunrise,1995,"[ethanhawke, juliedelpy, andreaeckert, hannopöschl]",richardlinklater
8362,,Before Midnight,2013,"[ethanhawke, juliedelpy, seamusdavey-fitzpatrick, jenniferprior]",richardlinklater
3825,,Waking Life,2001,"[wileywiggins, ethanhawke, juliedelpy, kenwebster]",richardlinklater
1193,,SubUrbia,1996,"[jaycebartok, amiecarey, nickykatt, ajaynaidu]",richardlinklater
1422,,The Newton Boys,1998,"[matthewmcconaughey, skeetulrich, ethanhawke, vincentd'onofrio]",richardlinklater
8686,,Boyhood,2014,"[ellarcoltrane, patriciaarquette, ethanhawke, elijahsmith]",richardlinklater
7227,,Two Lovers,2008,"[joaquinphoenix, gwynethpaltrow, isabellarossellini, vinessashaw]",jamesgray
3255,,An Officer and a Gentleman,1982,"[richardgere, debrawinger, louisgossett,jr., davidkeith]",taylorhackford
4354,,Frida,2002,"[salmahayek, alfredmolina, míamaestro, patriciareyesspíndola]",julietaymor
3887,,Iris,2001,"[judidench, jimbroadbent, katewinslet, hughbonneville]",richardeyre


# 3. Collaborative Filtering

The above recommendation models has one limitation, It is not user centric, It will only recommend the movies which are close to the given movies.
I will use Surprise library for more user centric recommendations

In [115]:
reader = Reader()

In [116]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [117]:
ratings['movieId'].nunique()

9066

In [118]:
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)
data.split(n_folds=5)
svd = SVD()
evaluate(algo=svd,data=data,measures=['RMSE','MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8976
MAE:  0.6897
------------
Fold 2
RMSE: 0.8915
MAE:  0.6879
------------
Fold 3
RMSE: 0.8993
MAE:  0.6939
------------
Fold 4
RMSE: 0.9022
MAE:  0.6942
------------
Fold 5
RMSE: 0.8959
MAE:  0.6900
------------
------------
Mean RMSE: 0.8973
Mean MAE : 0.6911
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8976340079189299,
                             0.8914695394582584,
                             0.8993006431621312,
                             0.9021528138098897,
                             0.895931173341792],
                            'mae': [0.689713853205912,
                             0.6878668373981629,
                             0.6939102476430006,
                             0.6942286979321564,
                             0.6900248249026474]})

In [119]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a19bdad68>

In [122]:
ratings[ratings['userId']==10][:10]      # These are first 10 ratings given by user with userid10

Unnamed: 0,userId,movieId,rating,timestamp
744,10,50,5.0,942766420
745,10,152,4.0,942766793
746,10,318,4.0,942766515
747,10,344,3.0,942766603
748,10,345,4.0,942766603
749,10,592,3.0,942767328
750,10,735,4.0,942766974
751,10,1036,3.0,942767258
752,10,1089,3.0,942766420
753,10,1101,2.0,942767328


In [124]:
svd.predict(uid=10,iid=33)

Prediction(uid=10, iid=33, r_ui=None, est=3.565545318581256, details={'was_impossible': False})

The model predict rating given by userid 10 to movie id 33 is 3.56. This is purely on how other user have given rating to the movie and how their ratings are similar to userid10

# 4. Hybrid Recommender

We will collaborate the content based(Metadata based) and collaborative filtering recommenders

In [125]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [126]:
id_map = pd.read_csv('links_small.csv')
id_map.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [127]:
id_map = id_map[['movieId','tmdbId']]
id_map['movieId'].nunique()

9125

In [128]:
id_map.columns= ['movieId','id']
id_map = id_map.merge(smd[['title','id']],on='id').set_index('title')
id_map.head()

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0


In [129]:
indices_map = id_map.set_index('id')
indices_map.head()

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5


In [130]:
indices.head(3)

title
Toy Story           0
Jumanji             1
Grumpier Old Men    2
dtype: int64

In [131]:
titles.head(3)

0           Toy Story
1             Jumanji
2    Grumpier Old Men
Name: title, dtype: object

In [132]:
def hybrid_recommender(userId,title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [x[0] for x in sim_scores]
    movies = smd.loc[movie_indices][['poster_path','title','vote_count','vote_average','year','id']]
    movies['est'] = movies['id'].apply(lambda x : svd.predict(userId,indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est',ascending=False)
    
    return movies.head(10)

In [133]:
hybrid_recommender(1,'Casino')
HTML(hybrid_recommender(1,'Casino').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,id,est
1001,,Raging Bull,968.0,7.7,1980,1578,3.460889
4480,,City of God,1852.0,8.2,2002,598,3.444383
994,,The Godfather: Part II,3418.0,8.3,1974,240,3.283102
6599,,The Departed,4455.0,7.9,2006,1422,3.265151
2441,,Drugstore Cowboy,117.0,7.0,1989,476,3.162636
8544,,The Wolf of Wall Street,6768.0,7.9,2013,106646,3.153815
101,,Taxi Driver,2632.0,8.1,1976,103,3.152231
5376,,Mean Streets,359.0,7.2,1973,203,3.145608
3344,,Pixote,24.0,8.4,1981,42148,3.105475
154,,Kids,280.0,6.8,1995,9344,3.086253


In [137]:
hybrid_recommender(120,'Casino')
HTML(hybrid_recommender(120,'Casino').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,id,est
6599,,The Departed,4455.0,7.9,2006,1422,4.22446
1001,,Raging Bull,968.0,7.7,1980,1578,4.133197
4480,,City of God,1852.0,8.2,2002,598,4.082392
101,,Taxi Driver,2632.0,8.1,1976,103,4.06216
8544,,The Wolf of Wall Street,6768.0,7.9,2013,106646,4.033398
994,,The Godfather: Part II,3418.0,8.3,1974,240,3.982127
154,,Kids,280.0,6.8,1995,9344,3.916602
1000,,Once Upon a Time in America,1104.0,8.3,1984,311,3.839149
4439,,Gangs of New York,1964.0,7.1,2002,3131,3.800701
4465,,The King of Comedy,306.0,7.6,1982,262,3.7533


We can clearly see that Recommendation for userId =1 & userID=120 are changed, based on users ratings history(indirectly say his liking)

In [134]:
hybrid_recommender(4,'Mad Max: Fury Road')
HTML(hybrid_recommender(4,'Mad Max: Fury Road').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,id,est
522,,Terminator 2: Judgment Day,4274.0,7.7,1991,280,4.815166
2973,,Mad Max 2: The Road Warrior,981.0,7.3,1981,8810,4.710628
31,,Twelve Monkeys,2470.0,7.4,1995,63,4.645206
2451,,The Omega Man,147.0,6.0,1971,11234,4.543042
2972,,Mad Max,1235.0,6.6,1979,9659,4.432499
1816,,Six-String Samurai,36.0,5.8,1998,24746,4.335143
3289,,Cherry 2000,67.0,6.0,1987,15785,4.321127
9005,,Independence Day: Resurgence,2550.0,4.9,2016,47933,4.316181
7396,,Pandorum,783.0,6.5,2009,19898,4.296386
8701,,Dawn of the Planet of the Apes,4511.0,7.3,2014,119450,4.272018


In [138]:
hybrid_recommender(40,'Mad Max: Fury Road')
HTML(hybrid_recommender(40,'Mad Max: Fury Road').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,id,est
522,,Terminator 2: Judgment Day,4274.0,7.7,1991,280,4.60342
31,,Twelve Monkeys,2470.0,7.4,1995,63,4.446545
2972,,Mad Max,1235.0,6.6,1979,9659,4.324959
7396,,Pandorum,783.0,6.5,2009,19898,4.316067
7502,,The Book of Eli,2207.0,6.6,2010,20504,4.241109
2973,,Mad Max 2: The Road Warrior,981.0,7.3,1981,8810,4.231877
6576,,The Covenant,295.0,5.2,2006,9954,4.210226
9005,,Independence Day: Resurgence,2550.0,4.9,2016,47933,4.188934
6173,,Sexmission,49.0,7.4,1984,19673,4.150092
7208,,Replicant,93.0,5.0,2001,10596,4.130289
