# Recommender System Example #1

[Movie Recommender Systems on Kaggle](https://www.kaggle.com/rounakbanik/movie-recommender-systems)


In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

# depreciated
# from surprise import evaluate


In [2]:
# reading csv from movie.ipynb output for a cleaned csv based on movies_metadata.csv
md = pd.read_csv('../input/movies_cleaned.csv')
md.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,,en,Toy Story,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,False,,65000000.0,"['Adventure', 'Fantasy', 'Family']",,8844,,en,Jumanji,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"['Romance', 'Comedy']",,15602,,en,Grumpier Old Men,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,False,,16000000.0,"['Comedy', 'Drama', 'Romance']",,31357,,en,Waiting to Exhale,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,['Comedy'],,11862,,en,Father of the Bride Part II,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## A Simple Top Movies Listing based on different genres
From the previous study, we are able to summarize all movies into 32 different genres. By feeding build_top_chart function for different genres, we are able to pull out movies with top vote_average. Then, we filter out those that are more trust worthy ones (where movies that have vote counts in the upper 0.05% (or above 99.95%) quantile and show it to end_user

In [3]:
import re

def build_top_chart(genre, quantile=0.995):
    qualified_df = md[md['vote_count'] > md['vote_count'].quantile(quantile)].sort_values('vote_average', ascending=False)
    genre_filtered_df = qualified_df['genres'].str.contains(genre, flags=re.IGNORECASE, regex=True)
    if genre != '':
        return qualified_df[genre_filtered_df]
    else:
        return qualified_df

In [4]:
qf_df = build_top_chart('')
qf_df[['title', 'release_date', 'vote_average', 'vote_count' ]].sort_values('vote_average', ascending=False).head(10)

Unnamed: 0,title,release_date,vote_average,vote_count
828,The Godfather,1972-03-14,8.5,6024.0
313,The Shawshank Redemption,1994-09-23,8.5,8358.0
521,Schindler's List,1993-11-29,8.3,4436.0
12421,The Dark Knight,2008-07-16,8.3,12269.0
2198,Life Is Beautiful,1997-12-20,8.3,3643.0
2828,Fight Club,1999-10-15,8.3,9678.0
23496,Whiplash,2014-10-10,8.3,4376.0
291,Pulp Fiction,1994-09-10,8.3,8670.0
5453,Spirited Away,2001-07-20,8.3,3968.0
288,Leon: The Professional,1994-09-14,8.2,4293.0


In [5]:
qf_df = build_top_chart('Romance')
qf_df[['title', 'release_date', 'vote_average', 'vote_count' ]].sort_values('vote_average', ascending=False).head(10)

Unnamed: 0,title,release_date,vote_average,vote_count
350,Forrest Gump,1994-07-06,8.2,8147.0
40458,La La Land,2016-11-29,7.9,4745.0
7168,Eternal Sunshine of the Spotless Mind,2004-03-19,7.9,3758.0
22003,Her,2013-12-18,7.9,4215.0
23337,The Fault in Our Stars,2014-05-16,7.6,3868.0
2165,Edward Scissorhands,1990-12-05,7.5,3731.0
1628,Titanic,1997-11-18,7.5,7770.0
580,Aladdin,1992-11-25,7.4,3495.0
20762,The Great Gatsby,2013-05-10,7.3,3885.0
19598,Silver Linings Playbook,2012-09-08,7.0,4840.0


## Content Based Recommender System
The previous method can only show top rated movies by all voters. However, we want something that is tailored to an individual user. We will try different ways to recommend movies to our end users. First, we will try to find movies that are similiar.

## Recommender System based on Movie Descriptor
We will try to suggest movies based on keywords from the descriptive text provided in the database, and we will use this info to find the best match.

In [6]:
md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')
md['keywords'] = md['tagline'] + ' ' + md['overview']

In [7]:
# We build a Term Frequency (TF)-Inverse Data Frequency (IDF) summary of keywords using scikit-learn library
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
X = vectorizer.fit_transform(md['keywords'])

# Show TF-IDF Vectorizer properties
# print('Shape: ')
# print(X.shape)
# print('Feature Names: ')
# print(vectorizer.get_feature_names())

# Next We build a lookup table that share similiarty score for all movies title in the database
cosine_similarity = linear_kernel(X,X)
# showing cosine_similiarty characteristics
# cosine_similarity.shape

In [8]:

def get_recommended_movie_id(title):
    #a = md.index[md['Title'] == 'The Godfather']
    a = md[md['title'] == title]
    if a.empty:
        #print('Is Empty')
        return []
    else:
        # print('Found Title')
        # print(a.index)
        
        b = a.index.astype('int')
        # print(b)
        
        # slice the cosine_similiarity matrix for this specific title
        c = cosine_similarity[b]
        d = c.tolist()
        e = list(*d)
        f = list(enumerate(e))
        g = sorted(f, key=lambda x:x[1], reverse=True)
        g = g[1:31]
        movies_id = [x[0] for x in g]
        return movies_id

In [12]:
movie_lst = get_recommended_movie_id('Family Business')
rec_df = md[md.index.isin(movie_lst)]
rec_df

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
306,307,False,,0.0,"['Horror', 'Drama']",,92769,,en,Relative Fear,...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Trusting your children can be deadly.,Relative Fear,False,4.7,3.0,Trusting your children can be deadly. Linda an...
781,787,False,,23000000.0,['Adventure'],,36344,,en,Alaska,...,0.0,109.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A missing father. A desperate search. An unfor...,Alaska,False,5.2,32.0,A missing father. A desperate search. An unfor...
828,834,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000.0,"['Drama', 'Crime']",http://www.thegodfather.com/,238,,en,The Godfather,...,245066411.0,175.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,An offer you can't refuse. Spanning the years ...
3984,4004,False,,0.0,"['Comedy', 'Fantasy']",,2608,,en,Maid to Order,...,9868521.0,93.0,"[{'iso_639_1': 'es', 'name': 'Español'}, {'iso...",Released,She was raised in a Beverly Hills mansion. Now...,Maid to Order,False,5.2,17.0,She was raised in a Beverly Hills mansion. Now...
4025,4045,False,,0.0,['Documentary'],http://www.irisfilms.org/films/long-nights-jou...,95682,,en,Long Night's Journey Into Day,...,0.0,94.0,"[{'iso_639_1': 'af', 'name': 'Afrikaans'}, {'i...",Released,Looking into a dark past for a bright future.,Long Night's Journey Into Day,False,2.5,2.0,Looking into a dark past for a bright future. ...
4458,4482,False,,20000000.0,"['Crime', 'Drama']",,505,,en,Johnny Handsome,...,7237794.0,94.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"They changed his looks, his life and his futur...",Johnny Handsome,False,6.1,49.0,"They changed his looks, his life and his futur..."
6911,6951,False,,0.0,['Comedy'],,51036,,en,MadHouse,...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The bad news is you have houseguests. There is...,MadHouse,False,5.7,18.0,The bad news is you have houseguests. There is...
8930,8978,False,"{'id': 124951, 'name': 'Mannequin Collection',...",13000000.0,"['Fantasy', 'Comedy', 'Science Fiction', 'Roma...",,34376,,en,Mannequin Two: On the Move,...,3752426.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A lively comedy about a living doll!,Mannequin Two: On the Move,False,4.6,28.0,A lively comedy about a living doll! Although ...
9484,9533,False,,0.0,"['Comedy', 'Drama']",,5183,,en,Travels with My Aunt,...,0.0,109.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Travels with My Aunt,False,6.3,8.0,A stodgy young man gets caught up in his free...
9543,9592,False,,0.0,"['Action', 'Crime', 'Drama']",,18927,,en,Stander,...,31651.0,116.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Good cop. Great criminal.,Stander,False,6.1,27.0,Good cop. Great criminal. The life and career ...


In [13]:
movie_lst = get_recommended_movie_id('Batman Forever')
rec_df = md[md.index.isin(movie_lst)]
rec_df

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
584,585,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",35000000.0,"['Fantasy', 'Action']",,268,,en,Batman,...,411348900.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Have you ever danced with the devil in the pal...,Batman,False,7.0,2145.0,Have you ever danced with the devil in the pal...
811,817,False,,0.0,"['Drama', 'Horror', 'Thriller']",,31417,,fr,Les Yeux sans visage,...,0.0,90.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Eyes Without a Face,False,7.5,130.0,Professor Genessier is guilt-stricken after h...
1321,1328,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",80000000.0,"['Action', 'Fantasy']",,364,,en,Batman Returns,...,280000000.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The Bat, the Cat, the Penguin.",Batman Returns,False,6.6,1706.0,"The Bat, the Cat, the Penguin. Having defeated..."
1482,1491,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",125000000.0,"['Action', 'Crime', 'Fantasy']",,415,,en,Batman & Robin,...,238207100.0,125.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Strength. Courage. Honor. And loyalty.,Batman & Robin,False,4.2,1447.0,Strength. Courage. Honor. And loyalty. Along w...
2466,2480,False,,2900000.0,"['Drama', 'Thriller']",,1902,,es,Abre los ojos,...,368234.0,117.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,,Open Your Eyes,False,7.4,211.0,A very handsome man finds the love of his lif...
3079,3095,False,"{'id': 421904, 'name': 'Batman (DC Universe An...",6000000.0,"['Action', 'Adventure', 'Animation', 'Family']",,14919,,en,Batman: Mask of the Phantasm,...,5617391.0,76.0,"[{'iso_639_1': 'cs', 'name': 'Český'}, {'iso_6...",Released,The Dark Knight fights to save Gotham city fro...,Batman: Mask of the Phantasm,False,7.4,218.0,The Dark Knight fights to save Gotham city fro...
7868,7912,False,,0.0,"['Drama', 'Mystery', 'Thriller']",,10742,,en,The Clearing,...,12520800.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Clearing,False,5.6,57.0,"When affluent executive, Wayne Hayes is kidna..."
9181,9230,False,"{'id': 379475, 'name': 'Batman Beyond Collecti...",0.0,"['Animation', 'Family']",,16234,,en,Batman Beyond: Return of the Joker,...,0.0,74.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"This Time, The Joker Is Wild",Batman Beyond: Return of the Joker,False,7.5,152.0,"This Time, The Joker Is Wild The Joker is back..."
10070,10122,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",150000000.0,"['Action', 'Crime', 'Drama']",http://www2.warnerbros.com/batmanbegins/index....,272,,en,Batman Begins,...,374218700.0,140.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Evil fears the knight.,Batman Begins,False,7.5,7511.0,"Evil fears the knight. Driven by tragedy, bill..."
10286,10338,False,,1000000.0,"['Horror', 'Thriller']",,10092,,en,Cry_Wolf,...,10047670.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Nobody believes a liar...even when they're tel...,Cry_Wolf,False,5.6,127.0,Nobody believes a liar...even when they're tel...


# UP TO HERE

### Metadata Based Recommender

In [29]:
credits = pd.read_csv('../input/credits.csv')
keywords = pd.read_csv('../input/keywords.csv')

In [30]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [31]:
md.shape


(45463, 25)

In [32]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [33]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [34]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [35]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [36]:
smd['director'] = smd['crew'].apply(get_director)


In [37]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [38]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [39]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


In [40]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

### Keywords

In [41]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [42]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [43]:
s = s[s > 1]


In [44]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [45]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [46]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [47]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [48]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [49]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [50]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [51]:
get_recommendations('The Dark Knight').head(10)


8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [52]:
get_recommendations('Mean Girls').head(10)


3319               Head Over Heels
4763                 Freaky Friday
1329              The House of Yes
6277              Just Like Heaven
7905         Mr. Popper's Penguins
7332    Ghosts of Girlfriends Past
6959     The Spiderwick Chronicles
8883                      The DUFF
6698         It's a Boy Girl Thing
7377       I Love You, Beth Cooper
Name: title, dtype: object

### Popularity and Ratings

In [53]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [54]:
improved_recommendations('The Dark Knight')


Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
1134,Batman Returns,1706,6,1992,5.846862
132,Batman Forever,1529,5,1995,5.054144
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013943
1260,Batman & Robin,1447,4,1997,4.287233


In [55]:
improved_recommendations('Mean Girls')


Unnamed: 0,title,vote_count,vote_average,year,wr
1547,The Breakfast Club,2189,7,1985,6.709602
390,Dazed and Confused,588,7,1993,6.254682
8883,The DUFF,1372,6,2015,5.818541
3712,The Princess Diaries,1063,6,2001,5.781086
4763,Freaky Friday,919,6,2003,5.757786
6277,Just Like Heaven,595,6,2005,5.681521
6959,The Spiderwick Chronicles,593,6,2008,5.680901
7494,American Pie Presents: The Book of Love,454,5,2009,5.11969
7332,Ghosts of Girlfriends Past,716,5,2009,5.092422
7905,Mr. Popper's Penguins,775,5,2011,5.087912


### Collaborative Filtering

In [56]:
reader = Reader()


In [57]:
ratings = pd.read_csv('../input/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [58]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# data.split(n_folds=5)

In [59]:
# svd = SVD()
# evaluate(svd, data, measures=['RMSE', 'MAE'])

In [60]:
# trainset = data.build_full_trainset()
# svd.train(trainset)

In [61]:
# ratings[ratings['userId'] == 1]


In [62]:
# svd.predict(1, 302, 3)


# Hybrid Recommender (not working)

In [63]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [64]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [65]:
indices_map = id_map.set_index('id')


In [66]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [67]:
# hybrid(1, 'Avatar')


In [68]:
# hybrid(500, 'Avatar')

In [69]:
print('hello world')

hello world
