# Example of SGD

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [2]:
R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
              [np.NaN, 5, np.NaN, 3, 1],
              [np.NaN,np.NaN,3,4,4],
              [5,2,1,2,np.NaN]])

num_users, num_items = R.shape

#latent dimension k
k = 3

In [3]:
np.random.seed(123)
# Make P,Q with number with nurmal distribution
P = np.random.normal(scale =1./k, size =(num_users, k))
Q = np.random.normal(scale = 1./k, size =(num_items, k))

In [4]:
P, Q

(array([[-0.36187687,  0.33244848,  0.09432617],
        [-0.50209824, -0.19286675,  0.55047885],
        [-0.80889308, -0.14297088,  0.42197875],
        [-0.28891347, -0.22629538, -0.03156966]]),
 array([[ 0.49712988, -0.21296733, -0.14799399],
        [-0.14478376,  0.73531003,  0.7289287 ],
        [ 0.33468463,  0.1287288 ,  0.24578953],
        [ 0.49691068, -0.31194462,  0.39194301],
        [-0.41796022, -0.21258383,  0.3023684 ]]))

In [5]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    full_pred_matrix = np.dot(P,Q.T)
    
    # from R extract Non null value and calculate RMSE
    row_non_zero_idx = [non_zero[0] for non_zero in non_zeros] #row idx
    col_non_zero_idx = [non_zero[1] for non_zero in non_zeros] #column idx
    
    R_non_zeros = R[row_non_zero_idx, col_non_zero_idx]
    
    #extract non null values from predicted matrix
    full_pred_matrix_non_zeros = full_pred_matrix[row_non_zero_idx, col_non_zero_idx]
    
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    
    return rmse    

In [6]:
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

#Update P and Q with SGD
for step in range(steps):
    for i,j,r in non_zeros:
        #error rate
        eij = r - np.dot(P[i,:],Q[j,:].T)
        #SGD Update Formula
        P[i, :] = P[i,:] + learning_rate * (eij * Q[j, :] - r_lambda * P[i,:])
        Q[j, :] = Q[j,:] + learning_rate * (eij * P[i, :] - r_lambda * Q[j,:])
        
        rmse = get_rmse(R,P,Q, non_zeros)
        
        if(step & 50) == 0:
            print(" ### iteration step : ",step," rmse : ", rmse)      
        
        


 ### iteration step :  0  rmse :  3.292761754051721
 ### iteration step :  0  rmse :  3.2903519905523835
 ### iteration step :  0  rmse :  3.2804684158967214
 ### iteration step :  0  rmse :  3.2753491076991317
 ### iteration step :  0  rmse :  3.274837516800426
 ### iteration step :  0  rmse :  3.271483242401243
 ### iteration step :  0  rmse :  3.261772717128121
 ### iteration step :  0  rmse :  3.2579224201361874
 ### iteration step :  0  rmse :  3.2553214290300834
 ### iteration step :  0  rmse :  3.2542384590774844
 ### iteration step :  0  rmse :  3.253546788837113
 ### iteration step :  0  rmse :  3.25142882346843
 ### iteration step :  1  rmse :  3.2486495908454742
 ### iteration step :  1  rmse :  3.246530690293261
 ### iteration step :  1  rmse :  3.236353002732379
 ### iteration step :  1  rmse :  3.2311086827740936
 ### iteration step :  1  rmse :  3.2305745983690883
 ### iteration step :  1  rmse :  3.2273034303344033
 ### iteration step :  1  rmse :  3.2177402438575102
 #

 ### iteration step :  128  rmse :  0.058610094968538196
 ### iteration step :  128  rmse :  0.05858543470231438
 ### iteration step :  128  rmse :  0.058640786458722935
 ### iteration step :  128  rmse :  0.0581007846162997
 ### iteration step :  128  rmse :  0.057930092003010906
 ### iteration step :  128  rmse :  0.057832682355831284
 ### iteration step :  128  rmse :  0.05799960464539975
 ### iteration step :  128  rmse :  0.057751243571134134
 ### iteration step :  128  rmse :  0.05749545304602571
 ### iteration step :  128  rmse :  0.05745217444708837
 ### iteration step :  128  rmse :  0.05735567291821021
 ### iteration step :  128  rmse :  0.057461757027650655
 ### iteration step :  129  rmse :  0.05752532926113044
 ### iteration step :  129  rmse :  0.05749872388625558
 ### iteration step :  129  rmse :  0.05755415375859132
 ### iteration step :  129  rmse :  0.05701745250306905
 ### iteration step :  129  rmse :  0.0568573345513196
 ### iteration step :  129  rmse :  0.056760

 ### iteration step :  256  rmse :  0.016814527025732006
 ### iteration step :  256  rmse :  0.016754299072255372
 ### iteration step :  256  rmse :  0.01663573077132597
 ### iteration step :  256  rmse :  0.01657547473815576
 ### iteration step :  256  rmse :  0.01670469300453075
 ### iteration step :  256  rmse :  0.016695573206698552
 ### iteration step :  256  rmse :  0.0166741540704795
 ### iteration step :  256  rmse :  0.016582866894575478
 ### iteration step :  256  rmse :  0.01624410483749267
 ### iteration step :  256  rmse :  0.016334642857543786
 ### iteration step :  256  rmse :  0.016437267423267534
 ### iteration step :  256  rmse :  0.016668397196515857
 ### iteration step :  257  rmse :  0.01677009542586922
 ### iteration step :  257  rmse :  0.016710534809214292
 ### iteration step :  257  rmse :  0.01659079279402862
 ### iteration step :  257  rmse :  0.01653222367410574
 ### iteration step :  257  rmse :  0.016661680786541563
 ### iteration step :  257  rmse :  0.01

 ### iteration step :  384  rmse :  0.014552132312848557
 ### iteration step :  384  rmse :  0.014561746468621984
 ### iteration step :  384  rmse :  0.014377804787767904
 ### iteration step :  384  rmse :  0.014395039459488841
 ### iteration step :  384  rmse :  0.014538455802555996
 ### iteration step :  384  rmse :  0.014555542257764568
 ### iteration step :  384  rmse :  0.01445181575579019
 ### iteration step :  384  rmse :  0.014371358268236106
 ### iteration step :  384  rmse :  0.014041415146174416
 ### iteration step :  384  rmse :  0.014180240093520632
 ### iteration step :  384  rmse :  0.014286574656911268
 ### iteration step :  384  rmse :  0.01447221797183003
 ### iteration step :  385  rmse :  0.014546055153790418
 ### iteration step :  385  rmse :  0.014556043937855749
 ### iteration step :  385  rmse :  0.014371951615926469
 ### iteration step :  385  rmse :  0.0143892898261263
 ### iteration step :  385  rmse :  0.014532757353528176
 ### iteration step :  385  rmse : 

 ### iteration step :  512  rmse :  0.014140655850358948
 ### iteration step :  512  rmse :  0.014182358150643051
 ### iteration step :  512  rmse :  0.013990029321652284
 ### iteration step :  512  rmse :  0.014010342959094441
 ### iteration step :  512  rmse :  0.014157445429049112
 ### iteration step :  512  rmse :  0.014176579970720528
 ### iteration step :  512  rmse :  0.01406170020926446
 ### iteration step :  512  rmse :  0.01398166280236316
 ### iteration step :  512  rmse :  0.013667345157560956
 ### iteration step :  512  rmse :  0.013814737359910189
 ### iteration step :  512  rmse :  0.013923646955793737
 ### iteration step :  512  rmse :  0.014081789976276621
 ### iteration step :  513  rmse :  0.014139139300290107
 ### iteration step :  513  rmse :  0.014180995992050985
 ### iteration step :  513  rmse :  0.013988645993729959
 ### iteration step :  513  rmse :  0.014008952140648578
 ### iteration step :  513  rmse :  0.014156068261287617
 ### iteration step :  513  rmse 

 ### iteration step :  640  rmse :  0.01402221373149739
 ### iteration step :  640  rmse :  0.014076933991001064
 ### iteration step :  640  rmse :  0.0138833909680615
 ### iteration step :  640  rmse :  0.013902569673965052
 ### iteration step :  640  rmse :  0.014050613571384598
 ### iteration step :  640  rmse :  0.01406979272750838
 ### iteration step :  640  rmse :  0.01395183437647337
 ### iteration step :  640  rmse :  0.01387227339521762
 ### iteration step :  640  rmse :  0.01356598870522174
 ### iteration step :  640  rmse :  0.013715416520621875
 ### iteration step :  640  rmse :  0.013825368729383641
 ### iteration step :  640  rmse :  0.01397158238816722
 ### iteration step :  641  rmse :  0.014021664076610745
 ### iteration step :  641  rmse :  0.014076446484254839
 ### iteration step :  641  rmse :  0.0138829006663596
 ### iteration step :  641  rmse :  0.013902071694408623
 ### iteration step :  641  rmse :  0.014050118758242235
 ### iteration step :  641  rmse :  0.014

 ### iteration step :  768  rmse :  0.013971183157979173
 ### iteration step :  768  rmse :  0.014031170913561439
 ### iteration step :  768  rmse :  0.013837555235062885
 ### iteration step :  768  rmse :  0.013855989843405067
 ### iteration step :  768  rmse :  0.014004210397777545
 ### iteration step :  768  rmse :  0.014023439348537644
 ### iteration step :  768  rmse :  0.013903690027252395
 ### iteration step :  768  rmse :  0.013824720683483381
 ### iteration step :  768  rmse :  0.013522066797423916
 ### iteration step :  768  rmse :  0.013671985562459863
 ### iteration step :  768  rmse :  0.013782231036154615
 ### iteration step :  768  rmse :  0.013923847842948556
 ### iteration step :  769  rmse :  0.01397088715203741
 ### iteration step :  769  rmse :  0.014030900377743596
 ### iteration step :  769  rmse :  0.013837285537572732
 ### iteration step :  769  rmse :  0.013855715912942112
 ### iteration step :  769  rmse :  0.014003936622732208
 ### iteration step :  769  rmse

 ### iteration step :  896  rmse :  0.0139393659703517
 ### iteration step :  896  rmse :  0.014001555288997781
 ### iteration step :  896  rmse :  0.013808116590129933
 ### iteration step :  896  rmse :  0.013826130646235842
 ### iteration step :  896  rmse :  0.013974300258568963
 ### iteration step :  896  rmse :  0.013993665893521832
 ### iteration step :  896  rmse :  0.013872485769964785
 ### iteration step :  896  rmse :  0.013794113564869728
 ### iteration step :  896  rmse :  0.013493122490742429
 ### iteration step :  896  rmse :  0.013643057969317031
 ### iteration step :  896  rmse :  0.013753306212829695
 ### iteration step :  896  rmse :  0.01389339999324753
 ### iteration step :  897  rmse :  0.013939151666408701
 ### iteration step :  897  rmse :  0.014001352021736604
 ### iteration step :  897  rmse :  0.013807915055739053
 ### iteration step :  897  rmse :  0.013825926554671401
 ### iteration step :  897  rmse :  0.01397409538386482
 ### iteration step :  897  rmse : 

In [7]:
pred_matrix = np.dot(P,Q.T)
print('predicted matrix:\n',np.round(pred_matrix, 3))

predicted matrix:
 [[3.992 2.339 1.159 1.996 1.308]
 [3.471 4.977 2.002 2.99  1.01 ]
 [5.057 4.763 2.988 3.983 3.982]
 [4.974 2.    1.004 2.    1.325]]


# TMDB Movie Dataset

In [8]:
import warnings; warnings.filterwarnings('ignore')

### Using Genre for Content Based Filtering 

In [9]:
movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [10]:
movies_df = movies[['id', 'title', 'genres', 'vote_average','vote_count',
                   'popularity','keywords','overview']]

In [26]:
movies_df.head(1)

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal,weighted_vote
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...",Action Adventure Fantasy Science Fiction,7.166301


In [11]:
pd.set_option('max_colwidth', 100)
movies_df[['genres','keywords']][:2]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is..."


In [12]:
#convert genres to list
from ast import literal_eval

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [13]:
movies_df['genres'][:1], movies_df['keywords'][:1]

(0    [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...
 Name: genres, dtype: object,
 0    [{'id': 1463, 'name': 'culture clash'}, {'id': 2964, 'name': 'future'}, {'id': 3386, 'name': 'sp...
 Name: keywords, dtype: object)

In [14]:
#extract only genre name
movies_df['genres'] = movies_df['genres'].apply(lambda x: [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [y['name'] for y in x])


In [15]:
movies_df[['genres','keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


In [16]:
# Content Similarity Using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))
count_vect = CountVectorizer(min_df = 0, ngram_range = (1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [40]:

vars(genre_mat)

{'_shape': (4803, 276),
 'maxprint': 50,
 'indices': array([  0,   1,  16, ..., 250, 251,  80]),
 'indptr': array([    0,     9,    14, ..., 20630, 20630, 20631]),
 'data': array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 '_has_canonical_format': True,
 '_has_sorted_indices': True}

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
#represent genre's similarity each other
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
genre_sim
temp = pd.DataFrame(genre_sim)
temp

(4803, 4803)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
0,1.000000,0.596285,0.447214,0.125988,0.755929,0.596285,0.0,0.755929,0.447214,0.745356,...,0.000000,0.000000,0.000000,0.377964,0.000000,0.149071,0.0000,0.000000,0.0,0.0
1,0.596285,1.000000,0.400000,0.169031,0.338062,0.800000,0.0,0.338062,0.600000,0.800000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.0000,0.000000,0.0,0.0
2,0.447214,0.400000,1.000000,0.338062,0.507093,0.600000,0.0,0.507093,0.200000,0.600000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.400000,0.0000,0.000000,0.0,0.0
3,0.125988,0.169031,0.338062,1.000000,0.142857,0.169031,0.0,0.142857,0.000000,0.169031,...,0.377964,0.169031,0.377964,0.428571,0.218218,0.676123,0.0000,0.125988,0.0,0.0
4,0.755929,0.338062,0.507093,0.142857,1.000000,0.507093,0.0,1.000000,0.169031,0.507093,...,0.000000,0.000000,0.000000,0.428571,0.000000,0.169031,0.0000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.149071,0.200000,0.400000,0.676123,0.169031,0.200000,0.0,0.169031,0.000000,0.200000,...,0.000000,0.200000,0.000000,0.169031,0.258199,1.000000,0.0000,0.000000,0.0,0.0
4799,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.258199,0.000000,0.000000,0.000000,0.000000,1.0000,0.384900,0.0,0.0
4800,0.000000,0.000000,0.000000,0.125988,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.333333,0.149071,0.333333,0.125988,0.000000,0.000000,0.3849,1.000000,0.0,0.0
4801,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0


In [18]:
#use argsort to find index of higher similarity row

genre_sim_sorted_idx = genre_sim.argsort()[:, ::-1]
genre_sim_sorted_idx[:2]

array([[   0, 3494,  813, ..., 3038, 3037, 2401],
       [ 262,    1,  129, ..., 3069, 3067, 2401]], dtype=int64)

In [19]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    title_movie = df[df['title'] == title]
    
    title_idx = title_movie.index.values
    similar_indexes = sorted_idx[title_idx, :top_n]
    
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

In [43]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average']][:10]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [21]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending = False)[:10]

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [22]:
"""
USE WEIGHTED RATING USED BY IMDB
v : vote_count
m : minimum_count
R : vote_average of each movie
C : average rating of total movie rating
"""

C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6) # top 60%
round(C, 3) , round(m,3)

(6.092, 370.2)

In [23]:
percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return ((v/(v+m)) * R) + ((m/(m+v)) * C )

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [24]:
movies_df.columns

Index(['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity',
       'keywords', 'overview', 'genres_literal', 'weighted_vote'],
      dtype='object')

In [25]:
movies[['title','vote_average','vote_count','weighted_vote']].sort_values('weighted_vote', ascending = False)

KeyError: "['weighted_vote'] not in index"

In [None]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    title_movie = df[df['title'] == title]
    title_idx = title_movie.index.values
    
    similar_indexes = sorted_idx[title_idx, :top_n*2]
    similar_indexes = similar_indexes.reshape(-1)
    #delete input title
    similar_indexes = similar_indexes[similar_indexes != title_idx]
    
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending = False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average','weighted_vote']]

## Item Based Collaborative Filtering

In [None]:
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv')
movies.shape , ratings.shape

In [None]:
movies.head()

In [None]:
ratings.head()

In [None]:
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index = 'userId', columns = 'movieId')
ratings_matrix.head(3)

In [None]:
rating_movies = pd.merge(ratings, movies, on ='movieId')

ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns = 'title')

ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(3)

In [None]:
#Similarity between Movies

ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head(3)

In [None]:
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

item_sim_df = pd.DataFrame(data = item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

In [None]:
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)[:10]

In [None]:
item_sim_df['Inception (2010)'].sort_values(ascending=False)[:10]

In [None]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr)/np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

In [None]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

get_mse(ratings_pred, ratings_matrix.values)

In [None]:
# predic_rating uses all movies similarities which makes accuracy lower
# make function to use movies have higher similarities

def predict_rating_topsim(ratings_arr, item_sim_arr, n =20):
    
    pred = np.zeros(ratings_arr.shape)
    
    for col in range(ratings_arr.shape[1]):
        top_n_items = [np.argsort(item_sim_arr[:,col])[:-n-1:-1]]
        
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col,:][top_n_items]))
            
    return pred

In [None]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n =20)

In [None]:
#Top 20 mse
print(get_mse(ratings_pred, ratings_matrix.values))

ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

In [None]:
ratings_matrix.shape[1]

In [None]:
user_rating_id = ratings_matrix.iloc[9,:]
user_rating_id[user_rating_id>0].sort_values(ascending=False)[:10]

In [None]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    
    already_seen = user_rating[user_rating>0].index.tolist()
    
    movies_list = ratings_matrix.columns.tolist()
    
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [None]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n =10):
    
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n = 20)

recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns =['pred_score'])

recomm_movies

## Latent Collaborative Filtering Using Matrix Factorization (SGD)

In [50]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    full_pred_matrix = np.dot(P,Q.T)
    
    # from R extract Non null value and calculate RMSE
    row_non_zero_idx = [non_zero[0] for non_zero in non_zeros] #row idx
    col_non_zero_idx = [non_zero[1] for non_zero in non_zeros] #column idx
    
    R_non_zeros = R[row_non_zero_idx, col_non_zero_idx]
    
    #extract non null values from predicted matrix
    full_pred_matrix_non_zeros = full_pred_matrix[row_non_zero_idx, col_non_zero_idx]
#     print(R_non_zeros)
#     print(full_pred_matrix_non_zeros)
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    
    return rmse    

In [51]:
def matrix_factorization(R, K, steps=2000, learning_rate = 0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    
    np.random.seed(123)
    P = np.random.normal(scale = 1./K, size=(num_users, K))
    Q = np.random.normal(scale = 1./K, size=(num_items, K))
    
    prev_rmse = 10000
    break_point = 0
    
    non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

    
    #SGD
    for step in range(steps):
        for i, j , r in non_zeros:
            
            eij = r - np.dot(P[i, :], Q[j,:].T)
            
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
            
        rmse = get_rmse(R, P, Q, non_zeros)
        if(step % 10) == 0:
            print("### iteration step : ",step," rmse : ", rmse)
    
    return P,Q    

non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

      


In [52]:
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index = 'userId', columns = 'movieId')

In [53]:
ratings_matrix.head(1)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,


In [54]:
rating_movies = pd.merge(ratings, movies, on ='movieId')
ratings_matrix = rating_movies.pivot_table('rating', index = 'userId', columns ='title')
ratings_matrix.head(1)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,


In [58]:
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [32]:
P,Q = matrix_factorization(ratings_matrix.values, K=50, steps =1000, learning_rate = 0.01, r_lambda = 0.01)

pred_matrix = np.dot(P, Q.T)

### iteration step :  0  rmse :  2.8999710981870277
### iteration step :  10  rmse :  0.7320316682640189
### iteration step :  20  rmse :  0.5128554766540852
### iteration step :  30  rmse :  0.3751619108487809
### iteration step :  40  rmse :  0.2973798350241291
### iteration step :  50  rmse :  0.25247463220316346
### iteration step :  60  rmse :  0.2247991891606596
### iteration step :  70  rmse :  0.20644753404894764
### iteration step :  80  rmse :  0.19350207891182367
### iteration step :  90  rmse :  0.18390784443443653
### iteration step :  100  rmse :  0.1765169145426336
### iteration step :  110  rmse :  0.1706482769066681
### iteration step :  120  rmse :  0.16587525034925493
### iteration step :  130  rmse :  0.16191758779812465
### iteration step :  140  rmse :  0.15858371524981202
### iteration step :  150  rmse :  0.1557381730694126
### iteration step :  160  rmse :  0.1532823839688138
### iteration step :  170  rmse :  0.1511428156375125
### iteration step :  180  rmse 

In [60]:
pd.DataFrame(Q)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.036618,0.416438,-0.141054,0.435128,0.112433,-0.342097,0.091229,-0.578819,-0.061172,0.102669,...,0.354680,-0.064835,-0.043216,0.015474,-0.199874,-0.143606,0.030614,-0.278599,0.225014,0.412606
1,0.417922,0.268999,-0.178459,0.550364,0.013612,-0.109792,0.188896,-0.401154,-0.299507,0.037386,...,0.210982,0.030503,0.023984,0.207573,-0.091514,-0.158943,-0.256492,-0.388204,0.118326,0.630413
2,0.371729,0.184470,-0.109809,0.538492,-0.018363,-0.122899,0.174846,-0.322036,-0.253975,0.149930,...,0.195369,0.055656,-0.002014,0.153352,-0.100005,-0.092151,-0.190907,-0.329606,0.138748,0.579584
3,0.600971,0.297439,-0.111911,0.559607,-0.009318,-0.121301,0.039457,-0.511400,-0.267049,0.043824,...,0.197636,0.098420,0.050000,0.115709,-0.344762,-0.118781,-0.265280,-0.364896,0.118582,0.554404
4,0.632840,0.296010,-0.152971,0.472369,0.017174,-0.056241,-0.054852,-0.481512,-0.234971,0.058114,...,0.096561,0.042519,0.041854,0.111225,-0.360057,-0.130901,-0.250063,-0.395767,0.058708,0.480001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9714,0.473619,0.279207,-0.284579,0.924196,0.012810,0.120156,0.576967,-0.260284,-0.462023,0.228403,...,0.502928,0.011061,-0.088991,-0.601341,-0.012736,-0.012142,-0.402243,-0.368734,0.283944,0.451678
9715,0.465921,0.144171,0.239050,0.226988,0.243660,0.216020,-0.203083,0.074115,-0.154608,-0.053208,...,-0.006437,0.287399,0.013951,0.897651,-0.058454,-0.129062,0.046819,0.023545,0.490284,0.319883
9716,0.343311,0.044909,0.007661,0.295671,-0.079544,0.002768,0.249804,-0.100755,-0.292439,0.076306,...,0.147471,0.178680,0.119526,0.180649,-0.018509,-0.032755,-0.182607,-0.131810,-0.009466,0.181573
9717,0.375617,0.224416,0.453209,0.353183,-0.164524,-0.599295,-0.121729,-0.892286,-0.784110,0.503425,...,0.115170,0.365424,0.473131,0.075799,-0.498786,-0.295321,0.449818,0.104628,0.132734,0.480187


In [33]:
ratings_pred_matrix = pd.DataFrame(data = pred_matrix, index = ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.014377,3.883945,3.479784,4.404023,4.068524,1.213176,4.144984,2.185077,4.337481,3.703129,...,1.18233,3.3458,3.453337,2.580474,3.106538,3.885976,3.979394,2.341739,3.92844,0.79266
2,3.053739,3.186982,2.968619,3.87288,3.727942,1.113458,3.71171,1.809063,2.736354,3.06485,...,0.911296,3.889794,3.071641,2.506962,2.309363,4.021202,2.129816,1.627918,3.774734,0.728237
3,1.743848,1.223942,1.271098,1.700714,1.595127,0.420003,1.121711,0.876855,0.207292,1.503746,...,0.489179,2.365827,1.603514,1.528821,1.392357,2.537639,1.03932,0.632154,1.357239,0.321619


In [62]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    
    already_seen = user_rating[user_rating>0].index.tolist()
    
    movies_list = ratings_matrix.columns.tolist()
    
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [64]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n =10):
    
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n = 20)

recomm_movies = pd.DataFrame(recomm_movies.values, index = recomm_movies.index, columns =['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Monty Python and the Holy Grail (1975),5.196588
Monty Python's Life of Brian (1979),5.070196
How the Grinch Stole Christmas! (1966),5.062015
"Lord of the Rings: The Return of the King, The (2003)",5.055983
Indiana Jones and the Last Crusade (1989),5.05579
"Pianist, The (2002)",4.890526
Midnight Cowboy (1969),4.881821
Austin Powers: International Man of Mystery (1997),4.879672
Austin Powers: The Spy Who Shagged Me (1999),4.875826
Star Wars: Episode V - The Empire Strikes Back (1980),4.874353


In [65]:
unseen_list = get_unseen_movies(ratings_matrix, 9)

recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n=20)

recomm_movies = pd.DataFrame(data = recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Monty Python and the Holy Grail (1975),5.196588
Monty Python's Life of Brian (1979),5.070196
How the Grinch Stole Christmas! (1966),5.062015
"Lord of the Rings: The Return of the King, The (2003)",5.055983
Indiana Jones and the Last Crusade (1989),5.05579
"Pianist, The (2002)",4.890526
Midnight Cowboy (1969),4.881821
Austin Powers: International Man of Mystery (1997),4.879672
Austin Powers: The Spy Who Shagged Me (1999),4.875826
Star Wars: Episode V - The Empire Strikes Back (1980),4.874353


## Using Surprise Package