In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
from scipy.sparse.linalg import svds
from scipy.stats import percentileofscore
from surprise import SVD
from surprise import Dataset
from surprise import Reader

In [2]:
def getpredictions( predictions, userId ):
    """Collect the estimated ratings that `predictions` holds for one user.

    Parameters
    ----------
    predictions : iterable of 5-item records
        Each record is unpacked as (user id, item id, true rating,
        estimated rating, extra); only the ids and the estimate are used.
    userId :
        Records are kept when their user id compares equal to this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId' (cast to int) and 'rating_pred', one row per kept
        record, in the order the records were encountered.
    """
    kept_rows = [ (item_id, estimate)
                  for user, item_id, _true_rating, estimate, _extra in predictions
                  if user == userId ]
    frame = pd.DataFrame( kept_rows, columns = [ 'movieId', 'rating_pred'] )
    # Item ids may arrive as strings or generic objects; normalise them to int
    # so the column can be used as an integer key.
    return frame.astype( { 'movieId': int } )

In [3]:
# Load both inputs in one statement: the movie metadata table first, then the
# table of user ratings (paths are relative to the notebook's directory).
df_md_small, df_small = (
    pd.read_csv( '../movielens_small/metadata_clean.csv' ),
    pd.read_csv( '../movielens_small/ratings_clean.csv' ),
)

In [4]:
# Reader() with no arguments assumes ratings lie in (1, 5); estimates are
# clipped to that scale.  Take the scale from the data itself so that ratings
# outside the default range (e.g. half-star values below 1) are represented.
rating_min = float( df_small['rating'].min() )
rating_max = float( df_small['rating'].max() )
reader = Reader( rating_scale = ( rating_min, rating_max ) )

# Surprise expects exactly three columns, in the order: user, item, rating.
rating_data = Dataset.load_from_df( df_small[['userId', 'movieId', 'rating']], reader )

In [5]:
# SVD starts from randomly initialised factors, so without a fixed seed every
# Restart & Run All yields slightly different predictions and rankings.
RANDOM_SEED = 42
algo = SVD( biased = True, n_factors = 100, n_epochs = 25, random_state = RANDOM_SEED )

# No hold-out split: the model is trained on every available rating.
trainset = rating_data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17f778849b0>

In [6]:
# Build the complement of the training data -- every (user, item) pair that
# has no rating in the training set -- and let the fitted model estimate a
# rating for each of those pairs.
testset = trainset.build_anti_testset()
predictions = algo.test( testset )

In [7]:
# These columns hold Python literals serialised as text in the CSV; parse
# each cell back into the corresponding Python object.
for list_column in ( 'genres', 'cast', 'keywords' ):
    df_md_small[list_column] = df_md_small[list_column].apply( ast.literal_eval )

In [8]:
# Turn each director name into a single lower-case token with no spaces,
# listed twice so that it weighs more than a single cast member or keyword.
# Missing directors are mapped to an empty list: a bare astype(str) would turn
# NaN into the literal token "nan", making every director-less movie look as
# if it shared a director with all the others.
df_md_small['director_'] = ( df_md_small['director'].
                            fillna( '' ).
                            astype(str).
                            apply( lambda x: x.lower().replace( " ", "") ).
                            apply( lambda x: [x, x] if x else [] ) ) # more weights

In [9]:
# Collapse every cast member's name into one lower-case token without spaces,
# so a full name is matched as a unit rather than as separate words.
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda members: [ member.lower().replace( " ", "") for member in members ] )

In [10]:
# Build the stemmer once; constructing a new SnowballStemmer for every single
# keyword repeats identical setup work for each keyword of each movie.
keyword_stemmer = SnowballStemmer( 'english' )

# Stem each keyword, then lower-case it and drop spaces so that it becomes a
# single token.
df_md_small['keywords_'] = ( df_md_small['keywords'].
                            apply( lambda x: [ keyword_stemmer.stem( i ).lower().replace( " ", "")
                                               for i in x ] ) )

In [11]:
# combine all content features: adding the list-valued columns concatenates
# the per-movie token lists row by row, and the tokens of each row are then
# joined into one space-separated string.
df_md_small['content'] = (
    df_md_small['cast_']
    + df_md_small['director_']
    + df_md_small['keywords_']
    + df_md_small['genres']
).apply( lambda tokens: ' '.join( tokens ) )

In [12]:
# Bag-of-words (unigrams + bigrams) counts of each movie's content string.
# min_df must be an int >= 1 or a float in [0, 1]; the integer 0 used before is
# outside that range and is rejected by scikit-learn's parameter validation.
# min_df = 1 keeps every term that occurs in at least one document, i.e. the
# same vocabulary the original setting was meant to select.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])

# Pairwise cosine similarity between all movies (row i = similarities of movie i).
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [13]:
user_id = 50                                  # user to recommend for
movie_title = 'Terminator 2: Judgment Day'    # seed movie for the content-based part

# Show the metadata row(s) whose title matches the seed movie.
df_md_small.loc[
    df_md_small['title'].eq( movie_title ),
    ['title', 'director', 'genres', 'cast'] ]

Unnamed: 0,title,director,genres,cast
522,Terminator 2: Judgment Day,James Cameron,"[Action, Thriller, Science Fiction]","[Arnold Schwarzenegger, Linda Hamilton, Robert..."


In [14]:
# Ratings given by the selected user, enriched with the metadata of each rated
# movie (inner join: rated movies without metadata are dropped).
df_user_rating = (
    df_small[ df_small['userId'].eq( user_id ) ]
    .merge( df_md_small, how = 'inner', on = 'movieId' )
    .reset_index( drop = True )
)

In [15]:
# user's top rated movies
( df_user_rating
    .loc[ :, ['movieId', 'title', 'rating', 'genres'] ]
    .sort_values( by = 'rating', ascending = False )
    .head( 10 ) )

Unnamed: 0,movieId,title,rating,genres
41,589,Terminator 2: Judgment Day,5.0,"[Action, Thriller, Science Fiction]"
0,10,GoldenEye,4.0,"[Adventure, Action, Thriller]"
15,292,Outbreak,4.0,"[Action, Drama, Science Fiction, Thriller]"
44,780,Independence Day,4.0,"[Action, Adventure, Science Fiction]"
37,527,Schindler's List,4.0,"[Drama, History, War]"
35,480,Jurassic Park,4.0,"[Adventure, Science Fiction]"
31,440,Dave,4.0,[Comedy]
26,368,Maverick,4.0,"[Action, Adventure, Comedy, Drama, Western]"
25,367,The Mask,4.0,"[Romance, Comedy, Crime, Fantasy]"
24,357,Four Weddings and a Funeral,4.0,"[Comedy, Drama, Romance]"


In [16]:
# Estimated ratings of the selected user for the movies in `predictions`.
df_userp = getpredictions( predictions, userId = user_id )
df_userp.shape

(8979, 2)

In [17]:
# top recommendations using CF
# Sort first, then attach titles; the inner join keeps the sorted order and
# drops predictions for movies that have no metadata row.
( df_userp
    .sort_values( by = 'rating_pred', ascending = False )
    .merge( df_md_small[['movieId', 'title']], how = 'inner', on = 'movieId' )
    .head( 10 ) )

Unnamed: 0,movieId,rating_pred,title
0,2064,4.313673,Roger & Me
1,318,4.313477,The Shawshank Redemption
2,904,4.272236,Rear Window
3,1247,4.266625,The Graduate
4,1212,4.232272,The Third Man
5,50,4.217932,The Usual Suspects
6,194,4.17439,Smoke
7,2571,4.170977,The Matrix
8,969,4.170501,The African Queen
9,3435,4.164166,Double Indemnity


In [18]:
# Row position of the first metadata row whose title matches the seed movie.
# np.asscalar no longer exists in current NumPy, and cosine_sim is indexed by
# row position, so look up the position directly instead of reading the index
# label.  As before, an IndexError is raised when no title matches.
idx = int( np.flatnonzero( ( df_md_small['title'] == movie_title ).values )[0] )

# One row per movie, in the same row order as cosine_sim, with the similarity
# of every movie to the seed movie attached.
df_cbp = df_md_small[['movieId', 'title', 'genres']].reset_index( drop = True )
df_cbp[ 'sim' ] = cosine_sim[ idx ].flatten().tolist()
df_cbp.shape

(9082, 4)

In [19]:
# top recommendations using content-based filtering (CBF)
# The seed movie is part of df_cbp as well, so it appears in this list too.
( df_cbp
    .sort_values( by = 'sim', ascending = False )
    .head( 11 ) )

Unnamed: 0,movieId,title,genres,sim
522,589,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",1.0
990,1240,The Terminator,"[Action, Thriller, Science Fiction]",0.367653
7404,73321,The Book of Eli,"[Action, Thriller, Science Fiction]",0.284029
3671,4691,Def-Con 4,"[Horror, Science Fiction, Thriller, Action]",0.243757
6821,56174,I Am Legend,"[Drama, Horror, Action, Thriller, Science Fict...",0.240048
953,1200,Aliens,"[Horror, Action, Thriller, Science Fiction]",0.235294
2024,2531,Battle for the Planet of the Apes,"[Action, Science Fiction]",0.234023
901,1127,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",0.230812
2430,3032,The Omega Man,"[Action, Science Fiction, Thriller]",0.230205
2951,3702,Mad Max,"[Adventure, Action, Thriller, Science Fiction]",0.222566


In [20]:
def mean_percentile( values ):
    """Percentile (0-100) of every entry of a Series within that same Series.

    Equivalent to calling scipy's percentileofscore( values, v, 'mean' ) for
    each v, i.e. the average of the share of entries strictly below v and the
    share of entries at or below v.  With r the average rank of v (ties share
    the mean of their ranks) and n the number of entries, that value is
    ( r - 0.5 ) / n * 100.  Ranking costs one sort instead of one full scan of
    the data per entry.

    Returns a NumPy array aligned with `values`.
    """
    return ( ( values.rank( method = 'average' ) - 0.5 ) / len( values ) * 100 ).values


# Candidate movies that have both a predicted rating and a similarity score.
df_c = df_userp.merge( df_cbp, on = 'movieId' )
x = df_c['rating_pred'].values
y = df_c['sim'].values

# Put both signals on a common 0-100 scale so they can be blended.
x_p = mean_percentile( df_c['rating_pred'] )
y_p = mean_percentile( df_c['sim'] )
df_c['rating_p'] = x_p.tolist()
df_c['sim_p'] = y_p.tolist()

In [21]:
# Hybrid score: weighted blend of the two percentile scales.  weight_cb is the
# share given to content similarity; the remainder goes to the predicted rating.
weight_cb = 0.5
weight_cf = 1 - weight_cb
score = weight_cf * x_p + weight_cb * y_p
df_c['score'] = score.tolist()

In [22]:
# top recommendations from hybrid recommendation engine
# Attach the metadata by movieId rather than by title: titles are not a key
# (different movies can share one), so a join on title can duplicate rows or
# show the director/cast of the wrong movie.
( df_c.sort_values( 'score', ascending = False ).
    iloc[0:10][ ['movieId', 'title'] ].
    merge( df_md_small[['movieId', 'director', 'genres', 'cast']], on = 'movieId' ).
    loc[ :, ['title', 'director', 'genres', 'cast'] ].
    reset_index( drop = True ) )

Unnamed: 0,title,director,genres,cast
0,Inception,Christopher Nolan,"[Action, Thriller, Science Fiction, Mystery, A...","[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle..."
1,Cube,Vincenzo Natali,"[Thriller, Science Fiction, Mystery]","[Nicole de Boer, Nicky Guadagni, David Hewlett..."
2,Mad Max 2: The Road Warrior,George Miller,"[Adventure, Action, Thriller, Science Fiction]","[Mel Gibson, Bruce Spence, Michael Preston, Ma..."
3,The Fifth Element,Luc Besson,"[Adventure, Fantasy, Action, Thriller, Science...","[Bruce Willis, Gary Oldman, Ian Holm, Milla Jo..."
4,Twelve Monkeys,Terry Gilliam,"[Science Fiction, Thriller, Mystery]","[Bruce Willis, Madeleine Stowe, Brad Pitt, Chr..."
5,Ghost in the Shell,Mamoru Oshii,"[Action, Animation, Science Fiction]","[Atsuko Tanaka, Akio Ohtsuka, Iemasa Kayumi, K..."
6,The Terminator,James Cameron,"[Action, Thriller, Science Fiction]","[Arnold Schwarzenegger, Michael Biehn, Linda H..."
7,Minority Report,Steven Spielberg,"[Action, Thriller, Science Fiction, Mystery]","[Tom Cruise, Colin Farrell, Samantha Morton, M..."
8,The Matrix,Lana Wachowski,"[Action, Science Fiction]","[Keanu Reeves, Laurence Fishburne, Carrie-Anne..."
9,Delicatessen,Jean-Pierre Jeunet,"[Comedy, Science Fiction, Fantasy]","[Dominique Pinon, Marie-Laure Dougnac, Jean-Cl..."


In [23]:
# explanation for top recommendations from hybrid recommendation engine
# Raw signals, their percentile versions and the blended score of the top 10.
( df_c
    .sort_values( by = 'score', ascending = False )
    .iloc[0:10]
    .loc[ :, ['title', 'rating_pred', 'sim', 'rating_p', 'sim_p', 'score'] ]
    .reset_index( drop = True ) )

Unnamed: 0,title,rating_pred,sim,rating_p,sim_p,score
0,Inception,3.968511,0.146119,99.025504,99.181423,99.103464
1,Cube,4.051966,0.122551,99.593496,98.596726,99.095111
2,Mad Max 2: The Road Warrior,3.89046,0.197446,98.034302,99.849649,98.941976
3,The Fifth Element,3.911823,0.126035,98.390689,98.752645,98.571667
4,Twelve Monkeys,3.824692,0.187867,96.697851,99.716004,98.206927
5,Ghost in the Shell,3.867393,0.125245,97.566544,98.624568,98.095556
6,The Terminator,3.793232,0.367653,96.107584,99.994431,98.051008
7,Minority Report,3.815097,0.163401,96.564205,99.509968,98.037087
8,The Matrix,4.170977,0.094407,99.916472,95.912685,97.914578
9,Delicatessen,3.821678,0.121578,96.653302,98.496492,97.574897
