In [1]:
import pandas as pd

# Recommender Systems by Group 8 in CSDA1040 Class Fall 2019

Work is based on [Movie Recommender Systems on Kaggle](https://www.kaggle.com/rounakbanik/movie-recommender-systems) with modification to codebase for fixes, clarifications and adaptation for DASH app.

In [2]:
# import pandas as pd
# Not Used
# %matplotlib inline
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from scipy import stats
# from ast import literal_eval

# NLP library including TF-IDF, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import wordnet

#Recommender System Library surprise
from surprise import Dataset, SVD
from surprise import Reader
from surprise.model_selection import cross_validate


from collections import defaultdict

# depreciated
# from surprise import evaluate


In [3]:
# reading csv from movie.ipynb output for a cleaned csv based on movies_metadata.csv
md = pd.read_csv('../input/movies_cleaned.csv')
md.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,,en,Toy Story,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,False,,65000000.0,"['Adventure', 'Fantasy', 'Family']",,8844,,en,Jumanji,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"['Romance', 'Comedy']",,15602,,en,Grumpier Old Men,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,False,,16000000.0,"['Comedy', 'Drama', 'Romance']",,31357,,en,Waiting to Exhale,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,['Comedy'],,11862,,en,Father of the Bride Part II,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
def show_movie_list_by_ids(ids, n=10):
    tmp_df = md[md.id.isin(ids)]
    print(tmp_df[['id', 'title', 'release_date']].head(n))

## A Simple Top Movies Listing based on different genres
From the previous study, we are able to summarize all movies into 32 different genres. By feeding get_top_chart_by_genre function for various genres, we are able to pull out movies with top vote_average. Then, we filter out those that are more trustworthy, the movies that have vote counts in the upper 0.05% (or above 99.95%) quantile and show it to end_user

In [5]:
import re

def get_top_chart_by_genre(genre, quantile=0.995):
    qualified_df = md[md['vote_count'] > md['vote_count'].quantile(quantile)].sort_values('vote_average', ascending=False)
    
    if genre != '':
        genre_filtered_df = qualified_df['genres'].str.contains(genre, flags=re.IGNORECASE, regex=True)
        # return qualified_df[genre_filtered_df]
        return qualified_df[genre_filtered_df].id
    else:
        # return qualified_df
        return qualified_df.id

In [6]:
ids = get_top_chart_by_genre('')
print('Top Movies Overall:')
show_movie_list_by_ids(ids)

Top Movies Overall:
       id                     title release_date
0     862                 Toy Story   1995-10-30
46    807                     Se7en   1995-09-22
255    11                 Star Wars   1977-05-25
288   101    Leon: The Professional   1994-09-14
291   680              Pulp Fiction   1994-09-10
313   278  The Shawshank Redemption   1994-09-23
350    13              Forrest Gump   1994-07-06
358  8587             The Lion King   1994-06-23
474   329             Jurassic Park   1993-06-11
521   424          Schindler's List   1993-11-29


In [7]:
ids = get_top_chart_by_genre('Romance')
print('Top Movies in Romance:')
show_movie_list_by_ids(ids)

Top Movies in Romance:
           id                                  title release_date
350        13                           Forrest Gump   1994-07-06
580       812                                Aladdin   1992-11-25
1628      597                                Titanic   1997-11-18
2165      162                    Edward Scissorhands   1990-12-05
7168       38  Eternal Sunshine of the Spotless Mind   2004-03-19
13071    8966                               Twilight   2008-11-20
19598   82693                Silver Linings Playbook   2012-09-08
20762   64682                       The Great Gatsby   2013-05-10
22003  152601                                    Her   2013-12-18
23262  102651                             Maleficent   2014-05-28


## Content Based Recommender System
The previous method can only show top rated movies by all voters. However, we want something that is more tailored to an individual user. The following sections of code will explore different ways to recommend movies to our end users based on user preferences. First, we will try to find movies that are similiar for a specific user selection.

## Recommender System based on text mining of Movie Descriptor 
We will try to suggest movies based on keywords from the descriptive text provided in the database, and we will use this info to find the best match for end user.

In [8]:
# We build a Term Frequency (TF)-Inverse Data Frequency (IDF) summary of keywords using scikit-learn library
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# by using scikit-learn library, we do not really need to clean up the text for tokenization. See example: https://medium.com/@vasista/preparing-the-text-data-with-scikit-learn-b31a3df567e

md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')
md['keywords'] = md['tagline'] + ' ' + md['overview']

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),min_df=0, stop_words='english')
X = vectorizer.fit_transform(md['keywords'])

# Show TF-IDF Vectorizer properties
# print('Shape: ')
# print(X.shape)
# print('Feature Names: ')
# print(vectorizer.get_feature_names())

# Next We build a lookup matrix that share similarity score for all movies title in the database
cosine_similarity = linear_kernel(X,X)
# showing cosine_similiarty characteristics
# cosine_similarity.shape

In [9]:
def get_recommended_movies_by_title(title, n=30):
    #a = md.index[md['Title'] == 'The Godfather']
    a = md[md['title'] == title]
    if a.empty:
        #print('Is Empty')
        return []
    else:
        # print('Found Title')
        # print(a.index)
        
        b = a.index.astype('int')
        
        # slice the cosine_similiarity matrix for this specific title
        # for cosine_similarity matrix it's "index by index"
        c = cosine_similarity[b]
        d = c.tolist()
        e = list(*d)
        f = list(enumerate(e))
        g = sorted(f, key=lambda x:x[1], reverse=True)
        g = g[1:n+1]
        movies_idx = [x[0] for x in g]
        # print('movies_idx', movies_idx)
        ids = []
        for idx in movies_idx:
            ids.append(md[md.index == idx]['id'].astype('int'))

        return ids

In [10]:
movie_title = 'The Godfather'
ids = get_recommended_movies_by_title(movie_title, n=10)
print('Recommended these movies if you like :', movie_title)
show_movie_list_by_ids(ids)

Recommended these movies if you like : The Godfather
           id                             title release_date
1171      240            The Godfather: Part II   1974-12-20
4300    15745                              Made   2001-07-13
10764   18747                          Election   2005-05-14
11239  119907                  Household Saints   1993-09-15
18206   48153                   The Outside Man   1972-12-21
21458  112205                        The Family   2013-09-13
22952  190955                        Blood Ties   2013-08-22
31693   95892                  Honor Thy Father   1973-03-01
37660  135335          A Mother Should Be Loved   1934-05-11
43540  364150  The Godfather Trilogy: 1972-1990   1992-10-17


In [11]:
movie_title = 'The Dark Knight Rises'
ids = get_recommended_movies_by_title(movie_title, n=10)
print('Recommended these movies if you like :', movie_title)
show_movie_list_by_ids(ids)

Recommended these movies if you like : The Dark Knight Rises
          id                                              title release_date
150      414                                     Batman Forever   1995-06-16
584      268                                             Batman   1989-06-23
1321     364                                     Batman Returns   1992-06-19
3079   14919                       Batman: Mask of the Phantasm   1993-12-25
9181   16234                 Batman Beyond: Return of the Joker   2000-12-12
12421    155                                    The Dark Knight   2008-07-16
15433  40662                         Batman: Under the Red Hood   2010-07-27
21042  29751  Batman Unmasked: The Psychology of the Dark Kn...   2008-07-15
21247  21683                    Batman: Mystery of the Batwoman   2003-10-21
25074  20077                                  Batman vs Dracula   2005-10-18


## Collaborative Filtering

[Surprise - FAQ How to get the top-N recommendations for each user](https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user)

Here, we use a rating dataset to train our model about our end users' taste. We will, then, use this model to predict what a specific user will rate for a given movie. 

The full rating dataset contains about 700M of data, we will only be using a subset of data ~ 2.4M (ratings_small.csv) for illustration purpose. 

In [12]:
# rating_df = pd.read_csv('../input/ratings_small.csv')
rating_df = pd.read_csv('../input/ratings_small.csv')
reader = Reader()
algo = SVD()

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(rating_df[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

The code below will test how our model perform by running cross validation for 5 splits (cv=5). We obtaina RMSE = 0.8957 (mean of 5 splits)

In [13]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9035  0.8849  0.8959  0.9035  0.8906  0.8957  0.0073  
MAE (testset)     0.6921  0.6825  0.6881  0.6950  0.6873  0.6890  0.0043  
Fit time          3.50    3.50    3.55    3.57    3.52    3.53    0.03    
Test time         0.10    0.10    0.10    0.10    0.10    0.10    0.00    


{'test_rmse': array([0.90349556, 0.88492939, 0.89591881, 0.90346045, 0.8905991 ]),
 'test_mae': array([0.69210433, 0.68252377, 0.68812243, 0.69504334, 0.68731463]),
 'fit_time': (3.5006611347198486,
  3.5005249977111816,
  3.5534510612487793,
  3.5653650760650635,
  3.523728132247925),
 'test_time': (0.1020958423614502,
  0.09795880317687988,
  0.10498309135437012,
  0.09901213645935059,
  0.10430312156677246)}

We will run cross-validation once more for a few more test splits and see how the results are

In [14]:
from surprise.model_selection import KFold
from surprise import accuracy

kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9000
RMSE: 0.9065
RMSE: 0.8988
RMSE: 0.8920
RMSE: 0.8933


In [15]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [16]:
trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#    print(uid, [iid for (iid, _) in user_ratings])


In [17]:
top_n[2]

[(1945, 4.424171186363248),
 (1193, 4.392318518716234),
 (858, 4.386289049039193),
 (912, 4.32445304757481),
 (1247, 4.305095587387749),
 (6787, 4.304997678828844),
 (1228, 4.293966092427203),
 (1254, 4.2858653195883205),
 (1230, 4.285599066614404),
 (1259, 4.283160978425719)]

In [18]:
def get_user_recommendations(uid):
    ids = [x for (x, _) in top_n[uid]]
    # we have to translate ids (movieid) back into TM
    return ids

In [19]:
def display_user_history_and_recommendations(uid):
    print("For User: ", uid)
    print("Movies that this user rated before:")
    ids = rating_df[rating_df['userId'] == uid][['movieId']]
    show_movie_list_by_ids(ids['movieId'])

    print("\nMoives that this user may like:")
    ids = get_user_recommendations(uid)
    show_movie_list_by_ids(ids)

In [20]:
display_user_history_and_recommendations(1)

For User:  1
Movies that this user rated before:
         id                           title release_date
2284   1371                       Rocky III   1982-05-28
2577   2105                    American Pie   1999-07-09
3197   2193                        My Tutor   1983-03-04
4580   2294  Jay and Silent Bob Strike Back   2001-08-22
8265   1405                           Greed   1924-12-04
10664  2455            Confidentially Yours   1983-08-10

Moives that this user may like:
       id                     title release_date
533   858      Sleepless in Seattle   1993-06-24
698   922                  Dead Man   1995-12-23
2632  745           The Sixth Sense   1999-08-06
4000  318  The Million Dollar Hotel   2000-02-09


# Hybrid Recommender
In Hybrid Recommeder, we combine both content-based filter and user-based collaborative filtering into another recommender. First, we generate a list of movies that are top rated based on movies title that a user selected. Then, we apply prediction to the set of top rated movies that the user might like to watch by giving predicted rating for each top rated movies. 

In [21]:
def get_user_based_recommedation_by_title(uid, title):
    ids = get_recommended_movies_by_title(title)
    rec_df = md[md.id.isin(ids)][['id', 'title']]
    rec_df['est_rating'] = rec_df['id'].apply(lambda x: algo.predict(uid, x).est)
    rec_df = rec_df.sort_values('est_rating', ascending=False)
    # comment to show estimated rating for the df
    # print(rec_df) 
    return rec_df.id

In [22]:
movie_title = 'Avatar'
uid = 1
print('Based on your ranking history and that you are watching this movie: ')
print(movie_title)
print('\nWe think you might like these: ')
ids = get_user_based_recommedation_by_title(uid,movie_title)
show_movie_list_by_ids(ids)

Based on your ranking history and that you are watching this movie: 
Avatar

We think you might like these: 
         id                                       title release_date
602    8766                       Hellraiser: Bloodline   1996-03-08
2444    603                                  The Matrix   1999-03-30
3057  10384                                   Supernova   2000-01-14
3517  38688             Pandora and the Flying Dutchman   1951-02-15
3633  16096                               House Party 2   1991-10-23
3638  26270                           Project Moon Base   1953-09-04
4590   8922                            Jeepers Creepers   2001-07-01
6044   9567                            Tears of the Sun   2003-03-07
6378   1996  Lara Croft Tomb Raider: The Cradle of Life   2003-07-21
9028  63054                                    Fetishes   1996-09-12
