# Recommender System Example #1

[Movie Recommender Systems on Kaggle](https://www.kaggle.com/rounakbanik/movie-recommender-systems)


In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

# depreciated
# from surprise import evaluate


In [2]:
# reading csv from movie.ipynb output for a cleaned csv based on movies_metadata.csv
md = pd.read_csv('../input/movies_cleaned.csv')
md.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,,en,Toy Story,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,False,,65000000.0,"['Adventure', 'Fantasy', 'Family']",,8844,,en,Jumanji,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"['Romance', 'Comedy']",,15602,,en,Grumpier Old Men,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,False,,16000000.0,"['Comedy', 'Drama', 'Romance']",,31357,,en,Waiting to Exhale,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,['Comedy'],,11862,,en,Father of the Bride Part II,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## A Simple Top Movies Listing based on different genres
From the previous study, we are able to summarize all movies into 32 different genres. By feeding build_top_chart function for different genres, we are able to pull out movies with top vote_average. Then, we filter out those that are more trust worthy ones (where movies that have vote counts in the upper 0.05% (or above 99.95%) quantile and show it to end_user

In [3]:
import re

def get_top_chart_by_genre(genre, quantile=0.995):
    qualified_df = md[md['vote_count'] > md['vote_count'].quantile(quantile)].sort_values('vote_average', ascending=False)
    genre_filtered_df = qualified_df['genres'].str.contains(genre, flags=re.IGNORECASE, regex=True)
    if genre != '':
        # return qualified_df[genre_filtered_df]
        return qualified_df[genre_filtered_df].index
    else:
        # return qualified_df
        return qualified_df.index
        

In [4]:
idx = get_top_chart_by_genre('')
# idx.shape
qf_df = md[md.index.isin(idx)]
qf_df[['title', 'release_date', 'vote_average', 'vote_count' ]].sort_values('vote_average', ascending=False).head(10)

Unnamed: 0,title,release_date,vote_average,vote_count
828,The Godfather,1972-03-14,8.5,6024.0
313,The Shawshank Redemption,1994-09-23,8.5,8358.0
521,Schindler's List,1993-11-29,8.3,4436.0
12421,The Dark Knight,2008-07-16,8.3,12269.0
2198,Life Is Beautiful,1997-12-20,8.3,3643.0
2828,Fight Club,1999-10-15,8.3,9678.0
23496,Whiplash,2014-10-10,8.3,4376.0
291,Pulp Fiction,1994-09-10,8.3,8670.0
5453,Spirited Away,2001-07-20,8.3,3968.0
350,Forrest Gump,1994-07-06,8.2,8147.0


In [5]:
idx = get_top_chart_by_genre('Romance')
# idx.shape
qf_df = md[md.index.isin(idx)]
qf_df[['title', 'release_date', 'vote_average', 'vote_count' ]].sort_values('vote_average', ascending=False).head(10)

Unnamed: 0,title,release_date,vote_average,vote_count
350,Forrest Gump,1994-07-06,8.2,8147.0
7168,Eternal Sunshine of the Spotless Mind,2004-03-19,7.9,3758.0
22003,Her,2013-12-18,7.9,4215.0
40458,La La Land,2016-11-29,7.9,4745.0
23337,The Fault in Our Stars,2014-05-16,7.6,3868.0
1628,Titanic,1997-11-18,7.5,7770.0
2165,Edward Scissorhands,1990-12-05,7.5,3731.0
580,Aladdin,1992-11-25,7.4,3495.0
20762,The Great Gatsby,2013-05-10,7.3,3885.0
19598,Silver Linings Playbook,2012-09-08,7.0,4840.0


## Content Based Recommender System
The previous method can only show top rated movies by all voters. However, we want something that is tailored to an individual user. We will try different ways to recommend movies to our end users. First, we will try to find movies that are similiar.

## Recommender System based on text mining of Movie Descriptor 
We will try to suggest movies based on keywords from the descriptive text provided in the database, and we will use this info to find the best match for end user.

In [6]:
md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')
md['keywords'] = md['tagline'] + ' ' + md['overview']

In [7]:
# We build a Term Frequency (TF)-Inverse Data Frequency (IDF) summary of keywords using scikit-learn library
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),min_df=0, stop_words='english')
X = vectorizer.fit_transform(md['keywords'])

# Show TF-IDF Vectorizer properties
# print('Shape: ')
# print(X.shape)
# print('Feature Names: ')
# print(vectorizer.get_feature_names())

# Next We build a lookup matrix that share similarity score for all movies title in the database
cosine_similarity = linear_kernel(X,X)
# showing cosine_similiarty characteristics
# cosine_similarity.shape

In [8]:

def get_recommended_movies_by_title(title):
    #a = md.index[md['Title'] == 'The Godfather']
    a = md[md['title'] == title]
    if a.empty:
        #print('Is Empty')
        return []
    else:
        # print('Found Title')
        # print(a.index)
        
        b = a.index.astype('int')
        # print(b)
        
        # slice the cosine_similiarity matrix for this specific title
        c = cosine_similarity[b]
        d = c.tolist()
        e = list(*d)
        f = list(enumerate(e))
        g = sorted(f, key=lambda x:x[1], reverse=True)
        g = g[1:31]
        movies_id = [x[0] for x in g]
        return movies_id

In [9]:
movie_lst = get_recommended_movies_by_title('Family Business')
rec_df = md[md.index.isin(movie_lst)]
rec_df

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
795,801,False,,0.0,"['Comedy', 'Foreign']",,9098,,de,Echte Kerle,...,0.0,100.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,Macho cop finds himself in a relationship with...,Regular Guys,False,5.2,9.0,Macho cop finds himself in a relationship with...
3909,3928,False,"{'id': 107469, 'name': 'Save The Last Dance Co...",13000000.0,"['Drama', 'Family', 'Romance', 'Music']",,9816,,en,Save the Last Dance,...,91038276.0,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Only Person You Need To Be Is Yourself.,Save the Last Dance,False,6.3,359.0,The Only Person You Need To Be Is Yourself. A ...
3984,4004,False,,0.0,"['Comedy', 'Fantasy']",,2608,,en,Maid to Order,...,9868521.0,93.0,"[{'iso_639_1': 'es', 'name': 'Español'}, {'iso...",Released,She was raised in a Beverly Hills mansion. Now...,Maid to Order,False,5.2,17.0,She was raised in a Beverly Hills mansion. Now...
4458,4482,False,,20000000.0,"['Crime', 'Drama']",,505,,en,Johnny Handsome,...,7237794.0,94.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"They changed his looks, his life and his futur...",Johnny Handsome,False,6.1,49.0,"They changed his looks, his life and his futur..."
7080,7120,False,,0.0,"['Comedy', 'Thriller']",,29493,,en,A Piece of the Action,...,0.0,135.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A delightfully delicious dilemma!,A Piece of the Action,False,6.5,4.0,A delightfully delicious dilemma! How does ret...
8930,8978,False,"{'id': 124951, 'name': 'Mannequin Collection',...",13000000.0,"['Fantasy', 'Comedy', 'Science Fiction', 'Roma...",,34376,,en,Mannequin Two: On the Move,...,3752426.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A lively comedy about a living doll!,Mannequin Two: On the Move,False,4.6,28.0,A lively comedy about a living doll! Although ...
11347,11406,False,,4000000.0,"['Comedy', 'Crime']",,9809,,en,Let's Go to Prison,...,4630045.0,84.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Welcome to the slammer,Let's Go to Prison,False,5.5,70.0,Welcome to the slammer When a career criminal'...
11451,11510,False,,0.0,"['Drama', 'Comedy', 'Romance']",,13771,,en,Venus,...,0.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Venus,False,6.5,48.0,Maurice is an aging veteran actor who becomes...
12380,12440,False,,0.0,"['Comedy', 'Family']",http://disneydvd.disney.go.com/college-road-tr...,13493,,en,College Road Trip,...,68397662.0,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,College Road Trip,False,5.0,91.0,When an overachieving high school student dec...
15581,15659,False,,0.0,"['Drama', 'Romance']",http://www.gretathemovie.com/,26035,,en,According to Greta,...,0.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,She's a trip. But she's no vacation.,According to Greta,False,6.0,67.0,She's a trip. But she's no vacation. Greta is ...


In [10]:
movie_lst = get_recommended_movies_by_title('Batman Forever')
rec_df = md[md.index.isin(movie_lst)]
rec_df

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
584,585,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",35000000.0,"['Fantasy', 'Action']",,268,,en,Batman,...,411348900.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Have you ever danced with the devil in the pal...,Batman,False,7.0,2145.0,Have you ever danced with the devil in the pal...
1321,1328,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",80000000.0,"['Action', 'Fantasy']",,364,,en,Batman Returns,...,280000000.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The Bat, the Cat, the Penguin.",Batman Returns,False,6.6,1706.0,"The Bat, the Cat, the Penguin. Having defeated..."
1482,1491,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",125000000.0,"['Action', 'Crime', 'Fantasy']",,415,,en,Batman & Robin,...,238207100.0,125.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Strength. Courage. Honor. And loyalty.,Batman & Robin,False,4.2,1447.0,Strength. Courage. Honor. And loyalty. Along w...
3079,3095,False,"{'id': 421904, 'name': 'Batman (DC Universe An...",6000000.0,"['Action', 'Adventure', 'Animation', 'Family']",,14919,,en,Batman: Mask of the Phantasm,...,5617391.0,76.0,"[{'iso_639_1': 'cs', 'name': 'Český'}, {'iso_6...",Released,The Dark Knight fights to save Gotham city fro...,Batman: Mask of the Phantasm,False,7.4,218.0,The Dark Knight fights to save Gotham city fro...
7868,7912,False,,0.0,"['Drama', 'Mystery', 'Thriller']",,10742,,en,The Clearing,...,12520800.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Clearing,False,5.6,57.0,"When affluent executive, Wayne Hayes is kidna..."
9181,9230,False,"{'id': 379475, 'name': 'Batman Beyond Collecti...",0.0,"['Animation', 'Family']",,16234,,en,Batman Beyond: Return of the Joker,...,0.0,74.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"This Time, The Joker Is Wild",Batman Beyond: Return of the Joker,False,7.5,152.0,"This Time, The Joker Is Wild The Joker is back..."
10070,10122,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",150000000.0,"['Action', 'Crime', 'Drama']",http://www2.warnerbros.com/batmanbegins/index....,272,,en,Batman Begins,...,374218700.0,140.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Evil fears the knight.,Batman Begins,False,7.5,7511.0,"Evil fears the knight. Driven by tragedy, bill..."
11693,11753,False,,15500000.0,"['Mystery', 'Crime', 'Drama', 'Thriller']",,18777,,en,Slow Burn,...,1237615.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The truth is just a trick of light.,Slow Burn,False,5.5,16.0,The truth is just a trick of light. A district...
12421,12481,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000.0,"['Drama', 'Action', 'Crime', 'Thriller']",http://thedarkknight.warnerbros.com/dvdsite/,155,,en,The Dark Knight,...,1004558000.0,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,Why So Serious? Batman raises the stakes in hi...
12794,12856,False,,3500000.0,"['Animation', 'Action', 'Adventure']",http://www.warnervideo.com/batmangothamknight/,13851,,en,Batman: Gotham Knight,...,0.0,75.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Get Ready... to Rage Against Evil.,Batman: Gotham Knight,False,6.7,171.0,Get Ready... to Rage Against Evil. Explore Bru...


# UP TO HERE

### Collaborative Filtering

In [56]:
reader = Reader()


In [57]:
ratings = pd.read_csv('../input/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [58]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# data.split(n_folds=5)

In [59]:
# svd = SVD()
# evaluate(svd, data, measures=['RMSE', 'MAE'])

In [60]:
# trainset = data.build_full_trainset()
# svd.train(trainset)

In [61]:
# ratings[ratings['userId'] == 1]


In [62]:
# svd.predict(1, 302, 3)


# Hybrid Recommender (not working)

In [63]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [64]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [65]:
indices_map = id_map.set_index('id')


In [66]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [67]:
# hybrid(1, 'Avatar')


In [68]:
# hybrid(500, 'Avatar')

In [69]:
print('hello world')

hello world
