In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import evaluate

import warnings; warnings.simplefilter('ignore')

In [8]:
md = pd.read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [10]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [11]:
m = vote_counts.quantile(0.95)
m

434.0

In [12]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [13]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

Weighted Rating (WR) =  (vv+m.R)+(mv+m.C) 
where,

v is the number of votes for the movie
m is the minimum votes required to be listed in the chart
R is the average rating of the movie
C is the mean vote across the whole report


In [14]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [15]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [16]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [17]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [20]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
s

0        Animation
0           Comedy
0           Family
1        Adventure
1          Fantasy
           ...    
45461       Family
45462        Drama
45463       Action
45463        Drama
45463     Thriller
Name: genre, Length: 91106, dtype: object

Now selecting 85 percentile and building chart for each genre

In [21]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [23]:
build_chart('Thriller').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.95646
12481,The Dark Knight,2008,12269,8,123.167,7.950165
292,Pulp Fiction,1994,8670,8,140.95,7.929996
46,Se7en,1995,5915,8,18.4574,7.898573
24860,The Imitation Game,2014,5895,8,31.5959,7.898242
586,The Silence of the Lambs,1991,4549,8,4.30722,7.869538
11354,The Prestige,2006,4510,8,16.9456,7.868463
289,Leon: The Professional,1994,4293,8,20.4773,7.862142
4099,Memento,2000,4168,8,15.4508,7.858217
1213,The Shining,1980,3890,8,19.6116,7.848633


Until now we created a popularity based recommendor system. It'll give same recommendation to everyone and behaves more like a filter that will give you top popular movies in each genre. Still good to have feature but this doesn't offer a personalization. For ex if one likes Inception, Dark Knight and memento that by inference it can be understood that the person likes thriller and christopher Nolan as these are all directed by him, This will help in suggesting his other movies as well like Tenet & dunkirk which are more recent ones which user might not be aware of. 
Content based recommendor is one such solution for that personalization. It checks the content of dataset, here movies i.e its features like genre, director, cast, cinematography, music etc. It checks similarities based on features mentioned before between movies that are like by user and other similar candidate movies, rank them score them and suggest top movies from resulting set. Since it used metadata of movies its called content based filtering.

In [5]:
pip install evaluate


Collecting evaluate
  Downloading https://files.pythonhosted.org/packages/90/50/0cc73b299fd941cb12d7ed39e0ccf8e18fe78dd6c16b951abe5477b3cd82/evaluate-0.0.3.tar.gz
Building wheels for collected packages: evaluate
  Building wheel for evaluate (setup.py) ... [?25l[?25hdone
  Created wheel for evaluate: filename=evaluate-0.0.3-cp36-none-any.whl size=6862 sha256=40092c687a0bb9ed9fbfa1ed3ac5fee77537d237a3919756a3f25bf90ebeafeb
  Stored in directory: /root/.cache/pip/wheels/de/51/a5/ebdce3e18b99539f31d3624ed21ca88ab3841617eb82628b05
Successfully built evaluate
Installing collected packages: evaluate
Successfully installed evaluate-0.0.3


Using movie overviews and taglines first to get recommendations.

In [25]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [26]:
md = md.drop([19730, 29503, 35587])

In [27]:
md['id'] = md['id'].astype('int')

In [28]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [29]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [30]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [32]:
tfidf_matrix.shape

(9099, 268124)

Now to find out similarities between movies we can use cosine similarities. Instead of using cosine formula we'll use directly the linear_kernel model.This will get me pairwise cosine similarities.

In [33]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [34]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [42]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
Shin Godzilla                                         9094
The Beatles: Eight Days a Week - The Touring Years    9095
Pokémon: Spell of the Unknown                         9096
Pokémon 4Ever: Celebi - Voice of the Forest           9097
Force Majeure                                         9098
Length: 9099, dtype: int64

In [43]:
# selecting 30 similar movies for a given movie
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [45]:
get_recommendations('Inception').head(10)

5239                              Cypher
141                                Crumb
6398                         Renaissance
653                            Lone Star
1703                               House
4739                    The Pink Panther
319                                 Cobb
2828    What Ever Happened to Baby Jane?
8867                     Pitch Perfect 2
979          Once Upon a Time in America
Name: title, dtype: object

In [48]:
get_recommendations('Batman Forever').head(10)

7931                      The Dark Knight Rises
2579               Batman: Mask of the Phantasm
6900                            The Dark Knight
6144                              Batman Begins
8165    Batman: The Dark Knight Returns, Part 1
524                                      Batman
1240                             Batman & Robin
1113                             Batman Returns
7565                 Batman: Under the Red Hood
7901                           Batman: Year One
Name: title, dtype: object

When 'batman' is selected as user liked movie, suggestion are more reasonable and likely to be watched by user but for inception this is not the case. So now we'll use other features from metadata like cast, so we'll another dataset as well like credits and keywords.

In [50]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')


In [51]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [52]:
md.shape

(45463, 25)

In [53]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [54]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

Now from cast and crew, we'll look directly at only director and top 3 actors


In [55]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [56]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [57]:
smd['director'] = smd['crew'].apply(get_director)

In [58]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [59]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

Now we create a df with genre,director, mainactor and keywords. We'll change the case of all words to lowercase & remove whitespaces for simplicity of model and to director presence in a movie more weight we'll give it 3 times.

In [60]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [61]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])
smd['director']

0               [johnlasseter, johnlasseter, johnlasseter]
1                  [joejohnston, joejohnston, joejohnston]
2               [howarddeutch, howarddeutch, howarddeutch]
3         [forestwhitaker, forestwhitaker, forestwhitaker]
4               [charlesshyer, charlesshyer, charlesshyer]
                               ...                        
40952        [greggchampion, greggchampion, greggchampion]
41172    [tinusureshdesai, tinusureshdesai, tinusureshd...
41225    [ashutoshgowariker, ashutoshgowariker, ashutos...
41391              [hideakianno, hideakianno, hideakianno]
41669                    [ronhoward, ronhoward, ronhoward]
Name: director, Length: 9219, dtype: object

In [62]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [63]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [64]:
s = s[s > 1]

Stemming words to avoid confusion and redundancy

In [65]:
stemmer = SnowballStemmer('english')

In [66]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words


In [67]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [68]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [69]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [70]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [71]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [74]:
get_recommendations('Inception').head(10)

6623                             The Prestige
3381                                  Memento
4145                                 Insomnia
2085                                Following
8031                    The Dark Knight Rises
8613                             Interstellar
6981                          The Dark Knight
6218                            Batman Begins
5638    Sky Captain and the World of Tomorrow
8500                                  Don Jon
Name: title, dtype: object

In [75]:
get_recommendations('Just Like Heaven').head(10)

3319               Head Over Heels
7332    Ghosts of Girlfriends Past
1329              The House of Yes
5207                    Mean Girls
7905         Mr. Popper's Penguins
6959     The Spiderwick Chronicles
4763                 Freaky Friday
5742                        Topper
5202                13 Going on 30
1677                        Splash
Name: title, dtype: object

Now the results personalized to what user likes. When we selected 'inception' as user like movie it gave movies of chrisopher nolan and of thriller genre & some cast similarities as well. But it still recommends movies which doesn't have good ratings but matches other criterias. So to improve that we'll use popularity based recommendor systems logic based on ratings and calculate weighted rating as we did in very first approach. Much like an hybrid approach of above two approaches.

In [76]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [78]:
improved_recommendations('Inception')

Unnamed: 0,title,vote_count,vote_average,year,wr
6981,The Dark Knight,12269,8,2008,7.905871
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
4173,Minority Report,2663,7,2002,6.754048
8207,Looper,4777,6,2012,5.937111
7286,X-Men Origins: Wolverine,4086,6,2009,5.927497
7903,Green Lantern,2551,5,2011,5.035606


Now the better rated movies will be at top and other with low ratings are at bottom. No surprise we get 'Green Lantern' at the bottom of that table!!! 

This content base recommender have some shortcomings like it can only suggest movies similar to what user has already liked and it doesn't give recommendation of movies which are highly rated but are different from liked movie's genre. Also here by content we meant metadata of movies we didn't consider user's personalization or history or trailers he watched.For now it'll suggest uniformly to all users.

Above limitation can be resolved with Collaborative filtering based recommendor system. This will give recommendations based on tastes of other user who have similar taste with current user in movies. Movies which are highly rated, reviewed and more watched by other across genre & and with different cast & crew can be appealing to current user as well. To implement that part we'll use surprise library. Surprise library uses SVD algorithm (Singular Value Decomposition) which minimizes RMSE(Root mean square error) .

In [79]:
reader = Reader()

In [80]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [84]:
from surprise.model_selection import KFold
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf = KFold(n_splits=5)
kf.split(data)

<generator object KFold.split at 0x7fb08fc20990>

In [95]:
svd = SVD()
from surprise.model_selection import cross_validate
cross_validate(svd, data)

{'fit_time': (4.882619142532349,
  4.894801139831543,
  4.909840822219849,
  4.88627815246582,
  4.952606439590454),
 'test_mae': array([0.69080412, 0.68466344, 0.69377581, 0.69259092, 0.69316062]),
 'test_rmse': array([0.89304028, 0.89003054, 0.89884104, 0.90286929, 0.90204209]),
 'test_time': (0.14941930770874023,
  0.2917051315307617,
  0.14209413528442383,
  0.14158034324645996,
  0.28275227546691895)}

In [100]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb090215908>

In [101]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [102]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.727370596698594, details={'was_impossible': False})

This gives us a floating number on the question whether the user ID 1 will like movie ID 302 and that is equal to 2.7. Based on these values we can select movies with highest values to suggest to user 1.

Now We'll try a hybrid approach again by combining both content based and collaborative filtering systems. For a given user ID and movie we'll suggest movies which are similar to movie selected and movies which are rated highly by other users across genre.

In [103]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [105]:
id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [107]:
indices_map = id_map.set_index('id')

In [108]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [109]:
hybrid(1, 'Batman Begins')

Unnamed: 0,title,vote_count,vote_average,year,id,est
7648,Inception,14075.0,8.1,2010,27205,3.252438
6981,The Dark Knight,12269.0,8.3,2008,155,3.220544
6623,The Prestige,4510.0,8.0,2006,1124,3.111318
8613,Interstellar,11187.0,8.1,2014,157336,3.099074
3381,Memento,4168.0,8.1,2000,77,3.093813
3344,Pixote,24.0,8.4,1981,42148,3.047739
4145,Insomnia,1181.0,6.8,2002,320,2.781018
7513,Ninja,87.0,5.4,2009,25602,2.772983
6645,Harsh Times,198.0,6.3,2005,7873,2.725729
8031,The Dark Knight Rises,9263.0,7.6,2012,49026,2.709368


In [110]:
hybrid(1, 'Green Lantern')

Unnamed: 0,title,vote_count,vote_average,year,id,est
2972,Mad Max,1235.0,6.6,1979,9659,3.047857
6638,Casino Royale,3930.0,7.3,2006,36557,2.956174
2834,Predator,2129.0,7.3,1987,106,2.925925
922,The Abyss,822.0,7.1,1989,2756,2.826077
6799,Transformers,4113.0,6.6,2007,1858,2.757227
4635,X2,3572.0,6.8,2003,36658,2.7329
1585,The Mask of Zorro,1211.0,6.3,1998,9342,2.695505
7828,I Am Number Four,1606.0,5.9,2011,46529,2.688933
9,GoldenEye,1194.0,6.6,1995,710,2.673744
7523,Edge of Darkness,496.0,6.2,2010,12201,2.654879


In [3]:
pip install surprise


Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.2MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670953 sha256=ef927df2178e583321ce3c73c2e3e03808e59b626dd3b8f30735b1cbcb81f3fd
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
