In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd. read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

## Content Based Recommender
 Content Based Recommenders based on:
* Movie Overviews and Taglines
* Movie Cast, Crew, Keywords and Genre


In [17]:
links_small = pd.read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [5]:
md = md.drop([19730, 29503, 35587])

In [6]:
md['id'] = md['id'].astype('int')

In [7]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 24)

### Description based recommendation

In [8]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [9]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [10]:

tfidf_matrix.shape

(9099, 268124)

In [11]:
#Cosine similarity calculation off all the movies pairwise
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [13]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [14]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [16]:
get_recommendations('Toy Story').head(10)

2502               Toy Story 2
7535               Toy Story 3
6193    The 40 Year Old Virgin
2547           Man on the Moon
6627              Factory Girl
4702    What's Up, Tiger Lily?
889      Rebel Without a Cause
6554    For Your Consideration
4988          Rivers and Tides
1599                 Condorman
Name: title, dtype: object

### Metadata Based Recommender


In [18]:
credits = pd.read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/credits.csv')
keywords = pd.read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/keywords.csv')

In [19]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [20]:
md.shape

(45463, 24)

In [21]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [22]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 27)

In [23]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [24]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [25]:
smd['director'] = smd['crew'].apply(get_director)

In [26]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [27]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [28]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [29]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x])

In [30]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [31]:
s = s.value_counts()


In [32]:
s = s[s > 1]

In [33]:
stemmer = SnowballStemmer('english')


In [34]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [35]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [36]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [37]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [38]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [39]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [40]:
get_recommendations('The Dark Knight').head(10)

8031                 The Dark Knight Rises
6218                         Batman Begins
7659            Batman: Under the Red Hood
6623                          The Prestige
1134                        Batman Returns
8927               Kidnapping Mr. Heineken
5943                              Thursday
1260                        Batman & Robin
2085                             Following
9024    Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [41]:
get_recommendations('Toy Story').head(10)

2522                                    Toy Story 2
6386                                       Luxo Jr.
8519                           Toy Story of Terror!
7914                                         Cars 2
6496                                           Cars
1883                                   A Bug's Life
2751                              Creature Comforts
7629                                    Toy Story 3
1432                               Meet the Deedles
4341    The Looney, Looney, Looney Bugs Bunny Movie
Name: title, dtype: object

In [42]:
get_recommendations('Pulp Fiction').head(10)

1381         Jackie Brown
8905    The Hateful Eight
5200    Kill Bill: Vol. 2
4595                Basic
4764             S.W.A.T.
898        Reservoir Dogs
6939              Cleaner
4903    Kill Bill: Vol. 1
231         Kiss of Death
4306       The 51st State
Name: title, dtype: object

#### Popularity and Ratings

Include Popularity and ratings to improve basic recommender

In [53]:
vote_counts = smd[smd['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = smd[smd['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.95)
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)


In [54]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [51]:
smd['year'] = pd.to_datetime(smd['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [55]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
6623,The Prestige,4510,8,2006,7.344034
8031,The Dark Knight Rises,9263,7,2012,6.801517
6218,Batman Begins,7511,7,2005,6.765201
7659,Batman: Under the Red Hood,459,7,2010,6.108968
2085,Following,363,7,1998,6.073726
1134,Batman Returns,1706,6,1992,5.951183
7561,Harry Brown,351,6,2009,5.923794
8026,Bullet to the Head,490,5,2013,5.736101
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.203189
1260,Batman & Robin,1447,4,1997,5.123362


In [57]:
improved_recommendations('Toy Story')

Unnamed: 0,title,vote_count,vote_average,year,wr
3833,"Monsters, Inc.",6150,7,2001,6.726297
7629,Toy Story 3,4710,7,2010,6.668116
2522,Toy Story 2,3914,7,1999,6.623925
8595,The Lego Movie,3127,7,2014,6.566911
6496,Cars,3991,6,2006,5.969604
1883,A Bug's Life,2379,6,1998,5.958577
7404,Cloudy with a Chance of Meatballs,1799,6,2009,5.952358
3016,Chicken Run,1190,6,2000,5.943442
6534,Monster House,912,6,2006,5.938159
7914,Cars 2,2088,5,2011,5.452808


### Collaborative Filtering
Using SVD

In [58]:
reader = Reader()

In [60]:
ratings = pd.read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [61]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [62]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8892
MAE:  0.6841
------------
Fold 2
RMSE: 0.8928
MAE:  0.6865
------------
Fold 3
RMSE: 0.8956
MAE:  0.6892
------------
Fold 4
RMSE: 0.8969
MAE:  0.6913
------------
Fold 5
RMSE: 0.9100
MAE:  0.7002
------------
------------
Mean RMSE: 0.8969
Mean MAE : 0.6903
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8891512163483044,
                             0.8927853722546478,
                             0.8956313597952894,
                             0.8968880468716262,
                             0.9100344554731602],
                            'mae': [0.6841332893813702,
                             0.6865419919462105,
                             0.6892186748757612,
                             0.6912702396694723,
                             0.7001882071103928]})

In [63]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x233f32c2390>

Let us pick user 5000 and check the ratings s/he has given.

In [64]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [65]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.7428131631756987, details={'was_impossible': False})

For movie with ID 302, we get an estimated prediction of **2.686**. One startling feature of this recommender system is that it doesn't care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have predicted the movie.

###  Hybrid Recommender


In [66]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [67]:
id_map = pd.read_csv('C:/Users/varsut/Desktop/Project/Datasets/Input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [68]:
indices_map = id_map.set_index('id')

In [69]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [71]:
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.29623
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.082308
1011,The Terminator,4208.0,7.4,1984,218,3.069422
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,2.99708
2014,Fantastic Planet,140.0,7.6,1973,16306,2.980192
7705,Alice in Wonderland,8.0,5.4,1933,25694,2.855524
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.843255
3060,Sinbad and the Eye of the Tiger,39.0,6.3,1977,11940,2.838353
974,Aliens,3282.0,7.7,1986,679,2.805011
8865,Star Wars: The Force Awakens,7993.0,7.5,2015,140607,2.793212


In [67]:
hybrid(500, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
2014,Fantastic Planet,140.0,7.6,1973,16306,3.367714
974,Aliens,3282.0,7.7,1986,679,3.202698
1011,The Terminator,4208.0,7.4,1984,218,3.186286
7705,Alice in Wonderland,8.0,5.4,1933,25694,3.171674
2834,Predator,2129.0,7.3,1987,106,3.127977
8724,Jupiter Ascending,2816.0,5.2,2015,76757,3.116097
922,The Abyss,822.0,7.1,1989,2756,3.050969
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.034974
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.007242
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.888528
