In [174]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
from scipy.sparse.linalg import svds
from scipy.stats import percentileofscore
from surprise import SVD
from surprise import Dataset
from surprise import Reader

In [175]:
def getpredictions( predictions, userId ):
    user_pred = []
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId :
            user_pred.append( (iid, est) )
    df_userp = pd.DataFrame( user_pred, columns = [ 'movieId', 'rating_pred'] )
    df_userp['movieId'] = df_userp['movieId'].astype(int)
    return( df_userp )

In [176]:
df_md_small = pd.read_csv('../movielens_small/metadata_clean.csv')
df_small = pd.read_csv('../movielens_small/ratings_clean.csv')

In [177]:
reader = Reader()
rating_data = Dataset.load_from_df(df_small[['userId', 'movieId', 'rating']], reader)
#rating_data.split(n_folds=5)

In [178]:
algo = SVD( biased = True, n_factors = 60, n_epochs = 30 )
trainset = rating_data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a544f8dd8>

In [179]:
# predict ratings for all pairs (u, i) that are not in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [180]:
df_md_small['genres'] = df_md_small['genres'].apply(ast.literal_eval)
df_md_small['cast'] = df_md_small['cast'].apply(ast.literal_eval)
df_md_small['keywords'] = df_md_small['keywords'].apply(ast.literal_eval)

In [181]:
df_md_small['director_'] = ( df_md_small['director'].
                            astype(str).
                            apply( lambda x: x.lower().replace( " ", "") ).
                            apply( lambda x: [x, x]) ) # more weights

In [182]:
df_md_small['cast_'] = ( df_md_small['cast'].
                        apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [183]:
df_md_small['keywords_'] = ( df_md_small['keywords'].
                            apply( lambda x: [SnowballStemmer( 'english' ).stem( i ) for i in x]).
                            apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [184]:
df_md_small['content'] = ( df_md_small['cast_'] +
                          df_md_small['director_'] +
                          df_md_small['keywords_'] +
                          df_md_small['genres'] ).apply( lambda x: ' '.join( x ) ) # combine all content features

In [185]:
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 0,
    stop_words = 'english').fit_transform(df_md_small['content'])
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [186]:
user_id = 50
movie_title = 'Terminator 2: Judgment Day'
df_md_small.loc[ df_md_small['title'] == movie_title, ['title', 'director', 'genres', 'cast'] ]

Unnamed: 0,title,director,genres,cast
522,Terminator 2: Judgment Day,James Cameron,"[Action, Thriller, Science Fiction]","[Arnold Schwarzenegger, Linda Hamilton, Robert..."


In [187]:
df_user_rating = ( df_small.loc[ df_small['userId'] == user_id ].
               merge(df_md_small, on = 'movieId').
               reset_index(drop = True) )

In [188]:
# user's top rated movies
df_user_rating[ ['movieId', 'title', 'rating', 'genres'] ].sort_values('rating', ascending = False ).head(10)

Unnamed: 0,movieId,title,rating,genres
41,589,Terminator 2: Judgment Day,5.0,"[Action, Thriller, Science Fiction]"
0,10,GoldenEye,4.0,"[Adventure, Action, Thriller]"
15,292,Outbreak,4.0,"[Action, Drama, Science Fiction, Thriller]"
44,780,Independence Day,4.0,"[Action, Adventure, Science Fiction]"
37,527,Schindler's List,4.0,"[Drama, History, War]"
35,480,Jurassic Park,4.0,"[Adventure, Science Fiction]"
31,440,Dave,4.0,[Comedy]
26,368,Maverick,4.0,"[Action, Adventure, Comedy, Drama, Western]"
25,367,The Mask,4.0,"[Romance, Comedy, Crime, Fantasy]"
24,357,Four Weddings and a Funeral,4.0,"[Comedy, Drama, Romance]"


In [189]:
df_userp = getpredictions( predictions, user_id )
df_userp.shape

(8979, 2)

In [190]:
# top recommendations using CF
df_userp.sort_values( 'rating_pred', ascending = False ).merge( df_md_small[['movieId', 'title']], on = 'movieId').head(10)

Unnamed: 0,movieId,rating_pred,title
0,1198,4.371471,Raiders of the Lost Ark
1,969,4.340013,The African Queen
2,1217,4.323491,Ran
3,2300,4.320075,The Producers
4,1172,4.310298,Cinema Paradiso
5,1237,4.280328,The Seventh Seal
6,1252,4.259827,Chinatown
7,5782,4.250366,The Professional
8,922,4.238978,Sunset Boulevard
9,2064,4.236367,Roger & Me


In [196]:
idx = np.asscalar( df_md_small.loc[ df_md_small['title'] == movie_title ].index.values[0] )
df_cbp = df_md_small[['movieId', 'title', 'genres']].reset_index( drop = True )
df_cbp[ 'sim' ] = cosine_sim[ idx ].flatten().tolist()
df_cbp.shape

(9082, 4)

In [197]:
# top recommendations using content-based filtering (CBF)
df_cbp.sort_values( 'sim', ascending = False ).head(11)

Unnamed: 0,movieId,title,genres,sim
522,589,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",1.0
990,1240,The Terminator,"[Action, Thriller, Science Fiction]",0.367653
7404,73321,The Book of Eli,"[Action, Thriller, Science Fiction]",0.284029
3671,4691,Def-Con 4,"[Horror, Science Fiction, Thriller, Action]",0.243757
6821,56174,I Am Legend,"[Drama, Horror, Action, Thriller, Science Fict...",0.240048
953,1200,Aliens,"[Horror, Action, Thriller, Science Fiction]",0.235294
2024,2531,Battle for the Planet of the Apes,"[Action, Science Fiction]",0.234023
901,1127,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",0.230812
2430,3032,The Omega Man,"[Action, Science Fiction, Thriller]",0.230205
2951,3702,Mad Max,"[Adventure, Action, Thriller, Science Fiction]",0.222566


In [193]:
df_c = df_userp.merge( df_cbp, on = 'movieId' )
x = df_c['rating_pred'].values
y = df_c['sim'].values
x_p = np.array( [percentileofscore(x, i, 'mean') for i in x] )
y_p = np.array( [percentileofscore(y, i, 'mean') for i in y] )
df_c['rating_p'] = x_p.tolist()
df_c['sim_p'] = y_p.tolist()

In [200]:
weight_cb = 0.6
score = x_p * ( 1 - weight_cb ) + y_p * weight_cb
df_c['score'] = score.tolist()

In [210]:
# top recommendations from hybrid recommendation engine
df_c.sort_values( 'score', ascending = False ).iloc[0:5][ ['title'] ].merge( df_md_small[['title', 'director', 'genres', 'cast']], on = 'title').reset_index( drop = True )

Unnamed: 0,title,director,genres,cast
0,Delicatessen,Jean-Pierre Jeunet,"[Comedy, Science Fiction, Fantasy]","[Dominique Pinon, Marie-Laure Dougnac, Jean-Cl..."
1,The Terminator,James Cameron,"[Action, Thriller, Science Fiction]","[Arnold Schwarzenegger, Michael Biehn, Linda H..."
2,Serenity,Joss Whedon,"[Science Fiction, Action, Adventure, Thriller]","[Nathan Fillion, Gina Torres, Alan Tudyk, More..."
3,Star Trek: First Contact,Jonathan Frakes,"[Science Fiction, Action, Adventure, Thriller]","[Patrick Stewart, Jonathan Frakes, Brent Spine..."
4,Mad Max 2: The Road Warrior,George Miller,"[Adventure, Action, Thriller, Science Fiction]","[Mel Gibson, Bruce Spence, Michael Preston, Ma..."


In [213]:
# explanation for top recommendations from hybrid recommendation engine
df_c.sort_values( 'score', ascending = False ).iloc[0:5][['title', 'rating_pred', 'sim', 'rating_p', 'sim_p', 'score']].reset_index( drop = True )

Unnamed: 0,title,rating_pred,sim,rating_p,sim_p,score
0,Delicatessen,3.945775,0.121578,98.412964,98.496492,98.463081
1,The Terminator,3.802743,0.367653,95.405947,99.994431,98.159038
2,Serenity,3.886605,0.117647,97.432899,98.245907,97.920704
3,Star Trek: First Contact,3.851929,0.115406,96.720125,98.140105,97.572113
4,Mad Max 2: The Road Warrior,3.735432,0.197446,93.122842,99.849649,97.158926
