In [14]:
import ast
from collections import defaultdict

import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [15]:
def getprediction(predictions, userId):
    """Collect one user's estimated ratings into a DataFrame.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, true_r, est, details)``.
    userId
        Only entries whose ``uid`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``movieId`` (cast to int) and ``prediction``, with rows
        in the order the matching entries appear in ``predictions``.
    """
    rows = [
        (item_id, estimate)
        for uid, item_id, _true_rating, estimate, _details in predictions
        if uid == userId
    ]
    frame = pd.DataFrame(rows, columns=['movieId', 'prediction'])
    frame['movieId'] = frame['movieId'].astype(int)
    return frame

In [16]:
# Load the ratings table and the movie metadata table from the *_clean.csv files.
df_small = pd.read_csv("../movielens_small/ratings_clean.csv")
df_md_small = pd.read_csv("../movielens_small/metadata_clean.csv")

In [17]:
# The list-valued metadata columns come out of the CSV as strings; turn them
# back into Python objects with ast.literal_eval.
def _parse_literal(value):
    """Return ``ast.literal_eval(value)`` for strings; pass anything else through unchanged."""
    # Guarding on str keeps this cell re-runnable: on a second run the values are
    # already parsed, and literal_eval raises ValueError on non-string input.
    return ast.literal_eval(value) if isinstance(value, str) else value


for _column in ('genres', 'cast', 'keywords'):
    df_md_small[_column] = df_md_small[_column].apply(_parse_literal)

In [18]:
# Wrap the (user, item, rating) triples in a Surprise Dataset.
# rating_scale is spelled out here; (1, 5) is the Reader default.
# NOTE(review): confirm ratings_clean.csv contains no rating below 1
# (e.g. half-star 0.5 values) — if it does, the scale should be widened.
reader = Reader(rating_scale=(1, 5))
rating_data = Dataset.load_from_df(
    df_small[['userId', 'movieId', 'rating']],
    reader,
)

In [25]:
# Seed both the SVD factor initialisation and the fold shuffling; without a
# seed each run of this cell (and every later fit of `algo`) gives different numbers.
RANDOM_SEED = 0

algo = SVD(biased=True, n_factors=100, n_epochs=30, random_state=RANDOM_SEED)
# Run 5-fold cross-validation and print results.
# An explicit KFold replaces cv=5 only so that the split can be seeded.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
cross_validate(algo, rating_data, measures=['RMSE', 'MAE'], cv=cv_folds, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9014  0.9020  0.8865  0.8984  0.9039  0.8985  0.0062  
MAE (testset)     0.6960  0.6937  0.6826  0.6936  0.6946  0.6921  0.0048  
Fit time          19.81   20.26   20.38   20.42   10.24   18.22   4.00    
Test time         0.59    0.46    0.59    0.33    0.15    0.42    0.17    


{'fit_time': (19.80711007118225,
  20.261728048324585,
  20.38376808166504,
  20.423498392105103,
  10.242048025131226),
 'test_mae': array([ 0.69600512,  0.69365111,  0.68257576,  0.69357652,  0.69456018]),
 'test_rmse': array([ 0.90137827,  0.9020386 ,  0.88651692,  0.89837282,  0.90394924]),
 'test_time': (0.5885262489318848,
  0.4564390182495117,
  0.5860190391540527,
  0.3281216621398926,
  0.1526811122894287)}

In [20]:
# Build a trainset from every rating in rating_data (no hold-out) and refit the model on it.
trainset = rating_data.build_full_trainset()
# fit() hands back the estimator, whose repr embeds a memory address that changes
# on every run; the trailing semicolon keeps that noise out of the cell output.
algo.fit(trainset);

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10dc32e80>

In [21]:
# Predict ratings for all pairs (u, i) that are not in the training set.
# NOTE(review): this scores the unseen items of every user, while the cells below
# only read the predictions of a single user_id — presumably acceptable at this
# dataset size; confirm before running on a larger one.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [22]:
# User whose predicted ratings and rating history are inspected in the following cells.
user_id = 50

In [23]:
# Predicted ratings for the selected user, enriched with movie metadata
# and ordered from highest to lowest estimate.
df_userp = getprediction(predictions, user_id)
df_userp = df_userp.join(df_md_small.set_index('movieId'), on='movieId')
df_userp = df_userp.sort_values('prediction', ascending=False)
df_userp.head(10)

Unnamed: 0,movieId,prediction,index,tmdbId,title,vote_average,vote_count,cast,director,keywords,genres
696,922,4.441198,740,599,Sunset Boulevard,8.2,533,"[William Holden, Gloria Swanson, Erich von Str...",Billy Wilder,"[new year's eve, screenwriter, jealousy, butle...",[Drama]
994,3462,4.420965,2759,3082,Modern Times,8.1,881,"[Charlie Chaplin, Paulette Goddard, Henry Berg...",Charlie Chaplin,"[factory, ambulance, invention, tramp, great d...","[Drama, Comedy]"
4,1172,4.412912,927,11216,Cinema Paradiso,8.2,834,"[Philippe Noiret, Jacques Perrin, Marco Leonar...",Giuseppe Tornatore,"[sicily, cinema, film director, kiss, coming o...","[Drama, Romance]"
1502,57669,4.385885,6864,8321,In Bruges,7.4,1414,"[Colin Farrell, Brendan Gleeson, Ralph Fiennes...",Martin McDonagh,"[bruges belgium, town square, vietnamese, cana...","[Comedy, Drama, Crime]"
77,1197,4.365669,950,2493,The Princess Bride,7.6,1518,"[Cary Elwes, Robin Wright, Mandy Patinkin, And...",Rob Reiner,"[swashbuckler, evil prince, reference to socra...","[Adventure, Family, Fantasy, Comedy, Romance]"
389,534,4.364211,478,10445,Shadowlands,6.8,61,"[Anthony Hopkins, Debra Winger, John Wood, Jul...",Richard Attenborough,"[bachelor, stroke of fate, brother, oxford, au...","[Drama, Romance]"
729,1217,4.33882,969,11645,Ran,7.9,349,"[Akira Terao, Jinpachi Nezu, Daisuke Ryû, Tats...",Akira Kurosawa,"[assassination, gun, castle, power, kingdom, g...","[Action, Drama, History]"
71,318,4.332526,284,278,The Shawshank Redemption,8.5,8358,"[Tim Robbins, Morgan Freeman, Bob Gunton, Clan...",Frank Darabont,"[prison, corruption, police brutality, prison ...","[Drama, Crime]"
740,1254,4.332362,1004,3090,The Treasure of the Sierra Madre,7.9,285,"[Humphrey Bogart, Walter Huston, Tim Holt, Bru...",John Huston,"[mexico, gold, gold rush, greed, friends, mone...","[Action, Adventure, Drama, Western]"
1499,56782,4.330872,6837,7345,There Will Be Blood,7.9,1581,"[Daniel Day-Lewis, Paul Dano, Dillon Freasier,...",Paul Thomas Anderson,"[brother brother relationship, deaf-mute, amer...",[Drama]


In [24]:
# Ratings the selected user actually gave, joined to the movie metadata
# and ordered from highest to lowest rating.
df_user = df_small.loc[df_small['userId'] == user_id]
df_user = df_user.merge(df_md_small, on='movieId')
df_user = df_user.sort_values('rating', ascending=False)
df_user[['userId', 'title', 'rating', 'genres', 'vote_average', 'cast', 'director']]

Unnamed: 0,userId,title,rating,genres,vote_average,cast,director
41,50,Terminator 2: Judgment Day,5.0,"[Action, Thriller, Science Fiction]",7.7,"[Arnold Schwarzenegger, Linda Hamilton, Robert...",James Cameron
0,50,GoldenEye,4.0,"[Adventure, Action, Thriller]",6.6,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...",Martin Campbell
15,50,Outbreak,4.0,"[Action, Drama, Science Fiction, Thriller]",6.4,"[Dustin Hoffman, Rene Russo, Morgan Freeman, K...",Wolfgang Petersen
44,50,Independence Day,4.0,"[Action, Adventure, Science Fiction]",6.7,"[Will Smith, Bill Pullman, Jeff Goldblum, Mary...",Roland Emmerich
37,50,Schindler's List,4.0,"[Drama, History, War]",8.3,"[Liam Neeson, Ben Kingsley, Ralph Fiennes, Car...",Steven Spielberg
35,50,Jurassic Park,4.0,"[Adventure, Science Fiction]",7.6,"[Sam Neill, Laura Dern, Jeff Goldblum, Richard...",Steven Spielberg
31,50,Dave,4.0,[Comedy],6.3,"[Kevin Kline, Sigourney Weaver, Frank Langella...",Ivan Reitman
26,50,Maverick,4.0,"[Action, Adventure, Comedy, Drama, Western]",6.6,"[Mel Gibson, Jodie Foster, James Garner, Graha...",Richard Donner
25,50,The Mask,4.0,"[Romance, Comedy, Crime, Fantasy]",6.6,"[Jim Carrey, Cameron Diaz, Nancy Fish, Tim Bag...",Chuck Russell
24,50,Four Weddings and a Funeral,4.0,"[Comedy, Drama, Romance]",6.6,"[Hugh Grant, Andie MacDowell, James Fleet, Sim...",Mike Newell
