In [21]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.optimize
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

In [22]:
def coficostfunc( params, Y, R, n_movies, n_features, n_users, lambda_ ):
    """Regularized collaborative-filtering cost and its gradient.

    Parameters
    ----------
    params : 1-D ndarray
        Flattened X followed by flattened Theta, in the layout that
        ``unfoldparams`` splits back into ``(n_movies, n_features)`` and
        ``(n_users, n_features)`` matrices.
    Y : ndarray
        Rating matrix; must broadcast against ``X @ Theta.T``, i.e.
        ``(n_movies, n_users)``.
    R : ndarray
        Mask of the same shape as ``Y``; entries multiply the residual, so
        0 removes a cell from the cost and 1 keeps it.
    n_movies, n_features, n_users : int
        Dimensions used to unfold ``params``.
    lambda_ : float
        L2 regularization weight applied to both X and Theta.

    Returns
    -------
    (J, grad_params) : tuple
        ``J`` is the scalar cost; ``grad_params`` is the gradient flattened
        in the same X-then-Theta order as ``params`` (the format expected by
        an optimizer called with ``jac=True``).
    """
    X, Theta = unfoldparams( n_movies, n_features, n_users, params )

    # Masked residual. Computed once and reused by the cost and both
    # gradients instead of repeating the full matrix product three times.
    err = np.dot( X, Theta.T ) * R - Y * R

    J = ( np.sum( err ** 2 ) / 2 +
         np.sum( Theta ** 2 ) * lambda_ / 2 + np.sum( X ** 2 ) * lambda_ / 2 )

    grad_X = np.dot( err, Theta ) + lambda_ * X
    grad_Theta = np.dot( err.T, X ) + lambda_ * Theta
    grad_params = np.concatenate(
            ( grad_X.flatten(), grad_Theta.flatten() ), axis=0 )
    return( J, grad_params )

def normalizeratings2( Y, R ):
    """Subtract each column's mean rating from that column's rated entries.

    Parameters
    ----------
    Y : ndarray, 2-D
        Rating matrix. Only cells where ``R == 1`` contribute to the mean.
    R : ndarray, same shape as ``Y``
        Indicator matrix; a cell counts as rated when ``R == 1``.

    Returns
    -------
    (Y_norm, Y_mean) : tuple
        ``Y_norm`` is a float array shaped like ``Y`` holding
        ``Y - column mean`` on rated cells and 0 elsewhere.
        ``Y_mean`` is the 1-D array of per-column means. A column without
        any rated cell gets mean 0 instead of NaN, so later arithmetic with
        ``Y_mean`` stays finite.
    """
    Y = np.asarray( Y )
    rated = ( np.asarray( R ) == 1 )

    n_rated = rated.sum( axis = 0 )
    # Sum rated cells only, so stray values in unrated cells cannot skew the mean.
    Y_sum = np.where( rated, Y, 0.0 ).sum( axis = 0 )

    # Guarded division: columns with zero ratings keep the 0 from `out`.
    Y_mean = np.divide( Y_sum, n_rated,
                        out = np.zeros( Y_sum.shape ),
                        where = n_rated > 0 )

    # Vectorized replacement for the per-column loop.
    Y_norm = np.where( rated, Y - Y_mean, 0.0 )
    return( Y_norm, Y_mean )

def initparams( n_movies, n_features, n_users ):
    """Draw a random starting point for the optimizer.

    Samples a standard-normal ``(n_movies, n_features)`` matrix and then a
    standard-normal ``(n_users, n_features)`` matrix from NumPy's global
    random state, and returns them flattened into one 1-D vector (movie
    block first, user block second).
    """
    movie_block = np.random.randn( n_movies, n_features )
    user_block = np.random.randn( n_users, n_features )
    return np.hstack( [ movie_block.ravel(), user_block.ravel() ] )
    
def unfoldparams( n_movies, n_features, n_users, theta ):
    """Split a flat parameter vector back into its two matrices.

    The first ``n_movies * n_features`` entries of ``theta`` are reshaped to
    ``(n_movies, n_features)``; everything after that is reshaped to
    ``(n_users, n_features)``. Returns the pair ``(X, Theta)``.
    """
    n_movie_params = n_movies * n_features
    movie_part, user_part = theta[:n_movie_params], theta[n_movie_params:]
    return( movie_part.reshape( ( n_movies, n_features ) ),
            user_part.reshape( ( n_users, n_features ) ) )

def getratingmat( df_rating ):
    """Pivot a long ratings table into a movie-by-user rating matrix.

    Parameters
    ----------
    df_rating : pandas.DataFrame
        Must contain the columns ``movieId``, ``userId`` and ``rating``.

    Returns
    -------
    (df, Y, R) : tuple
        ``df`` is the pivoted frame (rows indexed by movie, columns by user)
        with missing ratings filled with 0; ``Y`` is its underlying NumPy
        array; ``R`` is an int array that is 1 where ``Y > 0`` and 0
        elsewhere, so a literal rating of 0 is treated as "not rated".
    """
    df = df_rating.pivot( index='movieId', columns='userId', values='rating' )
    df = df.fillna( 0 )
    df.index.name = 'Movie ID'
    df.columns.name = 'User ID'
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # replacement and is available in every pandas version.
    Y = df.values
    R = ( Y > 0. ).astype( int )
    print( 'convert rating dataframe to rating matrix for gradient descent')
    return( df, Y, R )
    
def getprediction( df, Y, R, n_features = 10, lambda_ = 10, maxiter = 150,
                   seed = None ):
    """Fit the collaborative-filtering model and predict every rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Pivoted rating frame; only its index and columns are used, to label
        the returned predictions.
    Y : ndarray, shape (n_movies, n_users)
        Rating matrix.
    R : ndarray, same shape as ``Y``
        Rated-indicator matrix passed to the normalizer and the cost function.
    n_features : int, default 10
        Number of latent features per movie / per user.
    lambda_ : float, default 10
        Regularization weight handed to ``coficostfunc``.
    maxiter : int, default 150
        Iteration cap for the conjugate-gradient optimizer.
    seed : int or None, default None
        When given, NumPy's global random state is seeded with it before the
        parameters are initialized, which makes the fit reproducible.
        ``None`` leaves the random state untouched.

    Returns
    -------
    pandas.DataFrame
        Predicted ratings, ``X @ Theta.T`` plus the per-column mean removed
        during normalization, labelled like ``df``.
    """
    Y_norm, Y_mean = normalizeratings2( Y, R )
    print( "normalize rating matrix: demean each user's ratings" )
    n_movies, n_users = Y.shape

    if seed is not None:
        np.random.seed( seed )
    init_params = initparams( n_movies, n_features, n_users )

    # jac=True: coficostfunc returns (cost, gradient) in a single call.
    result = scipy.optimize.minimize(
            fun = coficostfunc,
            x0 = init_params,
            args = ( Y_norm, R, n_movies, n_features, n_users, lambda_ ),
            method = 'CG',
            jac = True,
            options = {
                    'maxiter': maxiter,
                    'disp': False,} )

    X, Theta = unfoldparams( n_movies, n_features, n_users, result.x )
    # Add the per-column means back so predictions are on the rating scale.
    pred = np.dot( X, Theta.T ) + Y_mean
    df_pred = pd.DataFrame( pred, columns = df.columns, index = df.index )
    print( "get predictions for all users")
    return( df_pred )

def getrecommendation( df_pred, df_mov, df_small, userId, top_n = 10):
    """Display a user's rated movies and return their best unrated predictions.

    The rows of ``df_small`` belonging to ``userId`` are joined with the
    movie metadata in ``df_mov``, sorted by rating (highest first) and shown
    via ``display``. The user's column of ``df_pred`` is then restricted to
    movies absent from that list, joined with the same metadata, sorted by
    predicted rating (highest first), and the first ``top_n`` rows are
    returned.
    """
    belongs_to_user = df_small['userId'] == userId
    movie_info = df_mov.set_index( 'movieId' )

    # What the user has already rated, best first.
    rated = df_small.loc[ belongs_to_user ].join( movie_info, on = 'movieId' )
    rated = rated.reset_index( drop = True )
    rated = rated.sort_values( 'rating', ascending=False )
    display( rated )

    # Predictions for this user, one row per movie.
    user_pred = df_pred[ userId ].reset_index()
    user_pred.columns = [ 'movieId', 'prediction' ]

    # Keep only movies the user has not rated yet.
    not_yet_rated = ~user_pred['movieId'].isin( rated['movieId'] )
    candidates = user_pred[ not_yet_rated ].join( movie_info, on = 'movieId' )
    candidates = candidates.sort_values( 'prediction', ascending=False )
    return candidates.head( top_n )

In [23]:
# Render every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Movie metadata.
df_mov = pd.read_csv( '../movielens_small/movies.csv' )

# Ratings, without the timestamp column.
df_small = (
    pd.read_csv( '../movielens_small/ratings_small.csv' )
    .drop( columns = 'timestamp' )
)

In [24]:
# Average rating per movie.
df_s = (
    df_small.drop( columns = 'userId' )
    .groupby( 'movieId' )
    .mean()
    .reset_index()
)

# Movie metadata with the average rating attached.
df_ = (
    df_mov.join( df_s.set_index( 'movieId' ), on = 'movieId' )
    .reset_index( drop = True )
)

# Number of ratings per movie.
df_rc = df_small['movieId'].value_counts().reset_index( name = 'counts' )
df_rc.columns = [ 'movieId', 'counts' ]

# Movie metadata with average rating and rating count, ordered by movieId.
df_d = (
    df_.join( df_rc.set_index( 'movieId' ), on = 'movieId', how = 'left' )
    .sort_values( 'movieId' )
    .reset_index( drop = True )
)

# The ten movies with the most ratings.
df_d.sort_values( 'counts', ascending = False ).head( 10 )

Unnamed: 0,movieId,title,genres,rating,counts
321,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.054252,341.0
266,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.256173,324.0
284,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.487138,311.0
525,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.138158,304.0
232,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.221649,291.0
427,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.706204,274.0
2062,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.183398,259.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.87247,247.0
472,527,Schindler's List (1993),Drama|War,4.303279,244.0
522,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,4.006329,237.0


In [25]:
# Pivot the ratings into a movie-by-user frame (df), its array form (Y)
# and the rated-indicator mask (R).
df, Y, R = getratingmat( df_small )
df.head()
# Fit the collaborative-filtering model and predict a rating for every
# (movie, user) pair; the result is labelled like df.
df_pred = getprediction( df, Y, R )
df_pred.head()

convert rating dataframe to rating matrix for gradient descent


User ID,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


normalize rating matrix: demean each user's ratings
get predictions for all users


User ID,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.508008,3.511329,3.427701,5.017448,4.025671,3.612187,3.731894,3.790815,3.788692,3.844558,...,3.610801,3.770954,3.929031,4.277545,3.024827,3.811663,3.767158,3.591715,3.796619,4.261071
2,2.515264,3.43764,3.409182,4.170136,3.977432,3.01071,3.199581,3.702913,3.562833,3.619031,...,3.307942,3.731903,3.53339,3.630326,2.791334,3.507256,3.682757,3.252881,3.729901,3.821909
3,2.538818,3.350035,3.449098,3.546733,3.915515,2.833467,3.048889,3.565404,3.491315,3.359646,...,3.060031,3.69006,3.495264,2.779427,2.632362,3.357634,3.561924,3.017693,3.779077,3.634578
4,2.526411,3.374228,3.538041,3.70583,3.952714,2.975731,3.114735,3.740761,3.606668,3.464817,...,3.160578,3.693017,3.535833,2.279294,2.825318,3.406959,3.549192,3.086026,3.729727,3.634251
5,2.526029,3.434012,3.363627,3.794177,4.002728,2.990821,3.130725,3.576563,3.571855,3.513759,...,3.258894,3.708497,3.438216,3.206231,2.726391,3.361473,3.568522,3.111151,3.651395,3.835291


In [26]:
# Display user 668's existing ratings, then return the highest-predicted
# movies that user has not rated (top_n defaults to 10).
getrecommendation( df_pred, df_mov, df_small, 668 )

Unnamed: 0,userId,movieId,rating,title,genres
0,668,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,668,593,5.0,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
3,668,608,5.0,Fargo (1996),Comedy|Crime|Drama|Thriller
14,668,2997,5.0,Being John Malkovich (1999),Comedy|Drama|Fantasy
13,668,2908,5.0,Boys Don't Cry (1999),Drama
6,668,1213,5.0,Goodfellas (1990),Crime|Drama
7,668,1221,5.0,"Godfather: Part II, The (1974)",Crime|Drama
11,668,2324,5.0,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War
1,668,318,4.0,"Shawshank Redemption, The (1994)",Crime|Drama
9,668,1358,4.0,Sling Blade (1996),Drama


Unnamed: 0,movieId,prediction,title,genres
695,858,4.124746,"Godfather, The (1972)",Crime|Drama
48,50,4.12339,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2288,2858,4.120332,American Beauty (1999),Drama|Romance
2374,2959,4.104872,Fight Club (1999),Action|Crime|Drama|Thriller
642,778,4.094156,Trainspotting (1996),Comedy|Crime|Drama
1352,1704,4.087737,Good Will Hunting (1997),Drama|Romance
2062,2571,4.076818,"Matrix, The (1999)",Action|Sci-Fi|Thriller
3367,4226,4.06319,Memento (2000),Mystery|Thriller
45,47,4.05788,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
5116,7361,4.057051,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
