# Example of SGD

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [2]:
# Toy rating matrix: rows are users, columns are items, np.nan marks a missing
# rating. The `np.NaN` alias was removed in NumPy 2.0, so the canonical
# lowercase spelling is used.
R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5, 2, 1, 2, np.nan]])

num_users, num_items = R.shape

# Latent dimension k: number of columns of the factor matrices P and Q.
k = 3

In [3]:
# Seed NumPy's global RNG so the initial factors are reproducible.
np.random.seed(123)
# Draw P (users x k) first, then Q (items x k), from a zero-mean normal
# distribution with standard deviation 1/k.
P = np.random.normal(loc=0.0, scale=1.0 / k, size=(num_users, k))
Q = np.random.normal(loc=0.0, scale=1.0 / k, size=(num_items, k))

In [4]:
P, Q

(array([[-0.36187687,  0.33244848,  0.09432617],
        [-0.50209824, -0.19286675,  0.55047885],
        [-0.80889308, -0.14297088,  0.42197875],
        [-0.28891347, -0.22629538, -0.03156966]]),
 array([[ 0.49712988, -0.21296733, -0.14799399],
        [-0.14478376,  0.73531003,  0.7289287 ],
        [ 0.33468463,  0.1287288 ,  0.24578953],
        [ 0.49691068, -0.31194462,  0.39194301],
        [-0.41796022, -0.21258383,  0.3023684 ]]))

In [5]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their reconstruction.

    Args:
        R: 2-D rating array, indexed as ``R[row, col]``.
        P: Row-factor matrix; ``np.dot(P, Q.T)`` is compared against ``R``.
        Q: Column-factor matrix.
        non_zeros: Sequence of tuples whose first two items are the row and
            column index of an observed rating; further items are ignored.

    Returns:
        Square root of the mean squared error over the listed positions.
    """
    # Reconstruct every cell, then compare only the observed positions.
    full_pred_matrix = np.dot(P, Q.T)

    row_idx = [nz[0] for nz in non_zeros]
    col_idx = [nz[1] for nz in non_zeros]

    observed = R[row_idx, col_idx]
    predicted = full_pred_matrix[row_idx, col_idx]

    return np.sqrt(mean_squared_error(observed, predicted))

In [11]:
# Hyper-parameters for the SGD run.
steps = 1000
learning_rate = 0.01
r_lambda = 0.01  # L2 regularisation coefficient

# Observed ratings as (row, col, rating); NaN cells fail the `> 0` test and are skipped.
non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

# Update P and Q with SGD.
# NOTE: P and Q are modified in place, so re-running this cell continues
# training from the current factors instead of the seeded initial ones.
for step in range(steps):
    for i, j, r in non_zeros:
        # Prediction error for this single observed rating.
        eij = r - np.dot(P[i, :], Q[j, :].T)
        # SGD update formula; the Q update reads the P row that was just updated.
        P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
        Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

    # Evaluate once per pass rather than after every single update: only the
    # value after the last update of a pass was ever reported.
    rmse = get_rmse(R, P, Q, non_zeros)

    if (step % 50) == 0:
        print(" ### iteration step : ", step, " rmse : ", rmse)
        
        


 ### iteration step :  0  rmse :  0.013541726114983256
 ### iteration step :  50  rmse :  0.01353860972463358
 ### iteration step :  100  rmse :  0.01353554735741339
 ### iteration step :  150  rmse :  0.013532537994393842
 ### iteration step :  200  rmse :  0.013529580633845037
 ### iteration step :  250  rmse :  0.013526674291208969
 ### iteration step :  300  rmse :  0.01352381799903904
 ### iteration step :  350  rmse :  0.01352101080692677
 ### iteration step :  400  rmse :  0.013518251781403744
 ### iteration step :  450  rmse :  0.01351554000582595
 ### iteration step :  500  rmse :  0.01351287458024635
 ### iteration step :  550  rmse :  0.013510254621268985
 ### iteration step :  600  rmse :  0.01350767926189218
 ### iteration step :  650  rmse :  0.013505147651338579
 ### iteration step :  700  rmse :  0.013502658954878455
 ### iteration step :  750  rmse :  0.013500212353640918
 ### iteration step :  800  rmse :  0.013497807044418456
 ### iteration step :  850  rmse :  0.013

In [12]:
# Rebuild the full rating matrix from the learned factors and show it
# rounded to three decimals.
pred_matrix = P @ Q.T
print('predicted matrix:\n', pred_matrix.round(3))

predicted matrix:
 [[3.992 2.198 1.134 1.996 1.455]
 [3.247 4.978 1.808 2.989 1.008]
 [4.536 3.965 2.989 3.982 3.983]
 [4.975 2.    1.003 1.999 1.473]]


# TMDB Movie Dataset

In [13]:
import warnings; warnings.filterwarnings('ignore')

### Using Genre for Content Based Filtering 

In [14]:
movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [15]:
# Keep only the columns used below. `.copy()` makes movies_df an independent
# frame, so the column assignments in later cells modify it directly instead
# of writing to a slice of `movies` (which triggers SettingWithCopyWarning).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count',
                    'popularity', 'keywords', 'overview']].copy()

In [16]:
movies_df.head(1)

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."


In [17]:
pd.set_option('max_colwidth', 100)
movies_df[['genres','keywords']][:2]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is..."


In [18]:
#convert genres to list
from ast import literal_eval

# Both columns hold stringified list-of-dict literals; parse every cell into
# the Python object it spells out.
for literal_col in ('genres', 'keywords'):
    movies_df[literal_col] = movies_df[literal_col].apply(literal_eval)

In [19]:
movies_df['genres'][:1], movies_df['keywords'][:1]

(0    [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...
 Name: genres, dtype: object,
 0    [{'id': 1463, 'name': 'culture clash'}, {'id': 2964, 'name': 'future'}, {'id': 3386, 'name': 'sp...
 Name: keywords, dtype: object)

In [20]:
# Reduce each parsed list of dicts to the list of its 'name' values
# (done for both genres and keywords).
for list_col in ('genres', 'keywords'):
    movies_df[list_col] = movies_df[list_col].apply(
        lambda entries: [entry['name'] for entry in entries]
    )


In [21]:
movies_df[['genres','keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


In [22]:
# Content Similarity Using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Join each movie's genre names into one space-separated string so that
# CountVectorizer can tokenise it.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: ' '.join(x))
# min_df=1 keeps every term that occurs in at least one document, which is
# what the former integer min_df=0 amounted to; recent scikit-learn releases
# validate the parameter and reject integer values below 1.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [23]:

vars(genre_mat)

{'_shape': (4803, 276),
 'maxprint': 50,
 'indices': array([  0,  16, 124, ..., 251, 229,  80]),
 'indptr': array([    0,     9,    14, ..., 20630, 20630, 20631]),
 'data': array([1, 1, 1, ..., 1, 1, 1], dtype=int64)}

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the genre vectors of all movies.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
# Wrap the array in a DataFrame purely to get a readable display.
temp = pd.DataFrame(data=genre_sim)
temp

(4803, 4803)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
0,1.000000,0.596285,0.447214,0.125988,0.755929,0.596285,0.0,0.755929,0.447214,0.745356,...,0.000000,0.000000,0.000000,0.377964,0.000000,0.149071,0.0000,0.000000,0.0,0.0
1,0.596285,1.000000,0.400000,0.169031,0.338062,0.800000,0.0,0.338062,0.600000,0.800000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.0000,0.000000,0.0,0.0
2,0.447214,0.400000,1.000000,0.338062,0.507093,0.600000,0.0,0.507093,0.200000,0.600000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.400000,0.0000,0.000000,0.0,0.0
3,0.125988,0.169031,0.338062,1.000000,0.142857,0.169031,0.0,0.142857,0.000000,0.169031,...,0.377964,0.169031,0.377964,0.428571,0.218218,0.676123,0.0000,0.125988,0.0,0.0
4,0.755929,0.338062,0.507093,0.142857,1.000000,0.507093,0.0,1.000000,0.169031,0.507093,...,0.000000,0.000000,0.000000,0.428571,0.000000,0.169031,0.0000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.149071,0.200000,0.400000,0.676123,0.169031,0.200000,0.0,0.169031,0.000000,0.200000,...,0.000000,0.200000,0.000000,0.169031,0.258199,1.000000,0.0000,0.000000,0.0,0.0
4799,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.258199,0.000000,0.000000,0.000000,0.000000,1.0000,0.384900,0.0,0.0
4800,0.000000,0.000000,0.000000,0.125988,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.333333,0.149071,0.333333,0.125988,0.000000,0.000000,0.3849,1.000000,0.0,0.0
4801,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0


In [25]:
# For each movie (row), list the column indices ordered from most to least
# similar: argsort ranks ascending, so every row is reversed afterwards.

genre_sim_sorted_idx = np.argsort(genre_sim, axis=1)[:, ::-1]
genre_sim_sorted_idx[:2]

array([[   0, 3494,  813, ..., 3038, 3037, 2401],
       [ 262,    1,  129, ..., 3069, 3067, 2401]], dtype=int64)

In [26]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    """Return the rows of ``df`` most similar to the movie(s) named ``title``.

    Args:
        df: DataFrame with a ``title`` column, row-aligned with ``sorted_idx``.
        sorted_idx: 2-D integer array; row ``i`` holds row positions of ``df``
            ordered from most to least similar to row ``i``.
        title: Exact title to look up.
        top_n: Number of positions taken from each matching row.

    Returns:
        The ``df`` rows at the selected positions (empty when no title matches).
    """
    # Work with row *positions*, not index labels: both ``sorted_idx`` and
    # ``iloc`` are positional, so labels only happen to work when df carries
    # a default 0..n-1 index.
    title_pos = np.flatnonzero((df['title'] == title).values)
    similar_indexes = sorted_idx[title_pos, :top_n]

    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]

In [27]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average']][:10]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [28]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending = False)[:10]

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [29]:
"""
USE WEIGHTED RATING USED BY IMDB
v : vote_count
m : minimum_count
R : vote_average of each movie
C : average rating of total movie rating
"""

# Mean rating over all movies.
C = movies_df['vote_average'].mean()
# 60th percentile of vote_count (60% of movies have this many votes or fewer), not the "top 60%".
m = movies_df['vote_count'].quantile(0.6)
round(C, 3) , round(m,3)

(6.092, 370.2)

In [30]:
# Global inputs of the weighted rating: the mean rating C over all movies and
# the vote-count threshold m, taken as the given quantile of vote_count.
percentile = 0.6
C = movies_df.loc[:, 'vote_average'].mean()
m = movies_df.loc[:, 'vote_count'].quantile(q=percentile)

def weighted_vote_average(record):
    """Blend one movie's own rating with the global mean rating.

    Reads ``vote_count`` (v) and ``vote_average`` (R) from ``record`` and the
    module-level ``m`` and ``C``. The movie's rating is weighted by
    ``v / (v + m)`` and ``C`` by ``m / (m + v)``.
    """
    votes = record['vote_count']
    rating = record['vote_average']

    own_part = (votes / (votes + m)) * rating
    prior_part = (m / (m + votes)) * C
    return own_part + prior_part

# axis=1 applies the function row by row, so each `record` is one movie.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [31]:
movies_df.columns

Index(['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity',
       'keywords', 'overview', 'genres_literal', 'weighted_vote'],
      dtype='object')

In [36]:
movies_df[['title','vote_average','vote_count','weighted_vote']].sort_values('weighted_vote', ascending = False)

Unnamed: 0,title,vote_average,vote_count,weighted_vote
1881,The Shawshank Redemption,8.5,8205,8.396052
3337,The Godfather,8.4,5893,8.263591
662,Fight Club,8.3,9413,8.216455
3232,Pulp Fiction,8.3,8428,8.207102
65,The Dark Knight,8.2,12002,8.136930
...,...,...,...,...
2122,Epic Movie,3.2,326,4.737894
242,Fantastic Four,4.4,2278,4.636554
3746,The Boy Next Door,4.1,1022,4.629738
210,Batman & Robin,4.2,1418,4.591725


In [37]:
def find_sim_movie(df, sorted_idx, title, top_n =10):
    """Recommend movies similar to ``title``, ranked by weighted rating.

    Takes the ``2 * top_n`` most similar candidates for each row whose title
    matches, removes the queried movie(s), and returns the ``top_n``
    candidates with the highest ``weighted_vote``.

    Args:
        df: DataFrame with ``title`` and ``weighted_vote`` columns, row-aligned
            with ``sorted_idx``.
        sorted_idx: 2-D integer array; row ``i`` holds row positions of ``df``
            ordered from most to least similar to row ``i``.
        title: Exact title to look up.
        top_n: Number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df`` sorted by ``weighted_vote`` descending.
    """
    # Row positions rather than index labels: ``sorted_idx`` and ``iloc`` are
    # both positional.
    title_pos = np.flatnonzero((df['title'] == title).values)

    similar_indexes = sorted_idx[title_pos, :top_n * 2].reshape(-1)
    # Delete the input title. np.isin also copes with several rows sharing the
    # title, where an element-wise `!=` against the position array cannot broadcast.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_pos)]

    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_idx, 'The Godfather', 10)
similar_movies[['title','vote_average','weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427


## Item Based Collaborative Filtering

In [39]:
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv')
movies.shape , ratings.shape

((9742, 3), (100836, 4))

In [40]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [41]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [42]:
# Drop the timestamp column, then build a user x movieId matrix of ratings
# (NaN where a user has no rating for a movie).
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')
ratings_matrix.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [43]:
# Attach titles to the ratings so the matrix columns are readable names.
rating_movies = ratings.merge(movies, on='movieId')

# User x title matrix; cells without a rating are filled with 0.
ratings_matrix = (
    rating_movies
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
)
ratings_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Similarity between movies is computed row-wise, so put one movie per row.

ratings_matrix_T = ratings_matrix.T
ratings_matrix_T.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Cosine similarity between every pair of movies, labelled by title on both axes.
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

movie_titles = ratings_matrix.columns
item_sim_df = pd.DataFrame(item_sim, index=movie_titles, columns=movie_titles)

print(item_sim_df.shape)
item_sim_df.head(3)

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)[:10]

title
Godfather, The (1972)                                    1.000000
Godfather: Part II, The (1974)                           0.821773
Goodfellas (1990)                                        0.664841
One Flew Over the Cuckoo's Nest (1975)                   0.620536
Star Wars: Episode IV - A New Hope (1977)                0.595317
Fargo (1996)                                             0.588614
Star Wars: Episode V - The Empire Strikes Back (1980)    0.586030
Fight Club (1999)                                        0.581279
Reservoir Dogs (1992)                                    0.579059
Pulp Fiction (1994)                                      0.575270
Name: Godfather, The (1972), dtype: float64

In [47]:
item_sim_df['Inception (2010)'].sort_values(ascending=False)[:10]

title
Inception (2010)                 1.000000
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Interstellar (2014)              0.608150
Up (2009)                        0.606173
Avengers, The (2012)             0.586504
Django Unchained (2012)          0.581342
Name: Inception (2010), dtype: float64

In [48]:
def predict_rating(ratings_arr, item_sim_arr):
    """Predict every user-item rating as a similarity-weighted average.

    Args:
        ratings_arr: (n_users, n_items) array of ratings.
        item_sim_arr: (n_items, n_items) array where ``item_sim_arr[i, j]`` is
            the weight of item ``i`` when predicting item ``j``.

    Returns:
        (n_users, n_items) array of predictions.
    """
    # The prediction for item j sums r[u, i] * sim[i, j] over i, so the
    # matching normaliser is the column sum over i (axis=0). Summing over
    # axis=1 gives the same numbers only for a symmetric similarity matrix.
    weight_totals = np.abs(item_sim_arr).sum(axis=0)
    ratings_pred = ratings_arr.dot(item_sim_arr) / weight_totals
    return ratings_pred

# Predict from all item similarities, then restore the user / title labels.
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501


In [49]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error restricted to the cells where ``actual`` is non-zero."""
    rated = actual.nonzero()
    pred_rated = pred[rated].flatten()
    actual_rated = actual[rated].flatten()
    return mean_squared_error(pred_rated, actual_rated)

get_mse(ratings_pred, ratings_matrix.values)

9.895354759094706

In [50]:
# predict_rating weights every movie's similarity, which lowers accuracy.
# The function below uses only the n most similar movies for each prediction.

def predict_rating_topsim(ratings_arr, item_sim_arr, n =20):
    """Predict ratings from only the ``n`` most similar items per target item.

    Args:
        ratings_arr: (n_users, n_items) array of ratings.
        item_sim_arr: (n_items, n_items) item-item similarity array.
        n: Number of most-similar items used for each prediction.

    Returns:
        (n_users, n_items) array of predicted ratings.
    """
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Positions of the n items most similar to `col`, best first. This is
        # kept as a plain index array: wrapping it in a list relied on
        # deprecated non-tuple sequence indexing and, on newer NumPy, adds a
        # stray leading axis to every lookup.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n-1:-1]
        # Weights come from the same column that was ranked.
        weights = item_sim_arr[top_n_items, col]

        # One matrix-vector product scores every user for this item, replacing
        # the per-user Python loop.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / np.sum(np.abs(weights))

    return pred

In [51]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n =20)

In [52]:
# MSE of the top-20-neighbour predictions, measured on the rated cells.
top20_mse = get_mse(ratings_pred, ratings_matrix.values)
print(top20_mse)

ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

3.69501623729494


In [53]:
ratings_matrix.shape[1]

9719

In [54]:
# NOTE(review): iloc[9] is positional, so this selects the 10th row, i.e.
# userId 10 (the output shows "Name: 10"), while the recommendation cells
# below pass userId 9 by label. Confirm which user is intended.
user_rating_id = ratings_matrix.iloc[9,:]
user_rating_id[user_rating_id>0].sort_values(ascending=False)[:10]

title
Holiday, The (2006)              5.0
Intouchables (2011)              5.0
Spectre (2015)                   5.0
Casino Royale (2006)             5.0
Dark Knight Rises, The (2012)    5.0
The Intern (2015)                5.0
Troy (2004)                      5.0
Batman Begins (2005)             5.0
Despicable Me (2010)             5.0
King's Speech, The (2010)        5.0
Name: 10, dtype: float64

In [55]:
def get_unseen_movies(ratings_matrix, userId):
    """List the movies (columns) that ``userId`` has not rated.

    Args:
        ratings_matrix: DataFrame indexed by user id with one column per movie;
            a value greater than 0 means the user rated that movie.
        userId: Index label of the user.

    Returns:
        Column labels, in column order, whose rating is not greater than 0
        (NaN cells included).
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A set makes each membership test O(1); filtering against a list is
    # quadratic in the number of movies.
    already_seen = set(user_rating[user_rating > 0].index)

    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [56]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted scores among ``unseen_list``.

    Looks up row ``userId`` of ``pred_df`` restricted to the ``unseen_list``
    columns and sorts those scores in descending order.
    """
    unseen_scores = pred_df.loc[userId, unseen_list]
    ranked = unseen_scores.sort_values(ascending=False)
    return ranked[:top_n]

# Movies userId 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 9)

# The 20 best predicted scores among them.
recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n=20)

# Present the scores as a one-column table keyed by title.
recomm_movies = pd.DataFrame(
    data=recomm_movies.values,
    index=recomm_movies.index,
    columns=['pred_score'],
)

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711


## Latent Collaborative Filtering Using Matrix Factorization (SGD)

In [57]:
#Loss RMSE

def get_rmse(R, P, Q, non_zeros):
    """Root-mean-squared error of ``np.dot(P, Q.T)`` against ``R`` on observed cells.

    Args:
        R: 2-D rating array, indexed as ``R[row, col]``.
        P: Row-factor matrix.
        Q: Column-factor matrix.
        non_zeros: Sequence of tuples starting with ``(row, col)``; only those
            positions of ``R`` and of the reconstruction are compared.

    Returns:
        Square root of the mean squared error over the listed positions.
    """
    reconstruction = np.dot(P, Q.T)

    # Separate the row and column coordinates of the observed entries.
    rows = [entry[0] for entry in non_zeros]
    cols = [entry[1] for entry in non_zeros]

    actual = R[rows, cols]
    estimated = reconstruction[rows, cols]

    return np.sqrt(mean_squared_error(actual, estimated))

In [64]:
def matrix_factorization(R, K, steps=2000, learning_rate = 0.01, r_lambda = 0.01):
    """Factorise ``R`` into ``P`` and ``Q`` with SGD so ``np.dot(P, Q.T)`` fits the observed ratings.

    Args:
        R: 2-D NumPy array of ratings. Only entries greater than 0 are treated
            as observed; NaN and non-positive cells are skipped.
        K: Number of latent factors.
        steps: Number of full passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularisation coefficient.

    Returns:
        Tuple ``(P, Q)`` with shapes ``(num_users, K)`` and ``(num_items, K)``.

    Side effects:
        Reseeds NumPy's global RNG with 123 and prints the training RMSE
        every 100 steps.
    """
    num_users, num_items = R.shape

    np.random.seed(123)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Vectorised scan for observed cells. np.nonzero walks the mask in
    # row-major order, the same order the nested Python loops produced.
    with np.errstate(invalid='ignore'):
        observed = R > 0
    rows, cols = np.nonzero(observed)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # SGD
    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this single observed rating.
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # The Q update reads the P row that was just updated.
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # The RMSE is only reported, never used for control flow, so it is
        # computed just on the steps that print it.
        if (step % 100) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step, " rmse : ", rmse)

    return P, Q

# NOTE(review): this rebuilds non_zeros from the module-level R, num_users and
# num_items; matrix_factorization builds its own list internally, so this line
# looks like a leftover. Confirm nothing depends on it before removing.
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0]

      


In [65]:
# Reload the MovieLens tables and rebuild the user x movieId rating matrix.
movies = pd.read_csv('Movielens/movies.csv')
ratings = pd.read_csv('Movielens/ratings.csv').loc[:, ['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')

In [66]:
ratings_matrix.head(1)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,


In [67]:
# Join in the titles and pivot again so the columns are movie titles.
# No fillna here, so cells without a rating stay NaN.
rating_movies = ratings.merge(movies, on='movieId')
ratings_matrix = rating_movies.pivot_table(values='rating', index='userId', columns='title')
ratings_matrix.head(1)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,


In [68]:
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [69]:
# Factorise the user x title matrix with 50 latent factors, then rebuild the
# dense matrix of predicted ratings.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=1000,
    learning_rate=0.01,
    r_lambda=0.01,
)

pred_matrix = P @ Q.T

### iteration step :  0  rmse :  2.8999710981870277
### iteration step :  100  rmse :  0.1765169145426336
### iteration step :  200  rmse :  0.14612031839016718
### iteration step :  300  rmse :  0.13715382892577338
### iteration step :  400  rmse :  0.13293194246177795
### iteration step :  500  rmse :  0.13039149611322778
### iteration step :  600  rmse :  0.12861157516382332
### iteration step :  700  rmse :  0.1272441183124939
### iteration step :  800  rmse :  0.12613194325589747
### iteration step :  900  rmse :  0.12519330680544494


In [70]:
pd.DataFrame(Q)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.036618,0.416438,-0.141054,0.435128,0.112433,-0.342097,0.091229,-0.578819,-0.061172,0.102669,...,0.354680,-0.064835,-0.043216,0.015474,-0.199874,-0.143606,0.030614,-0.278599,0.225014,0.412606
1,0.417922,0.268999,-0.178459,0.550364,0.013612,-0.109792,0.188896,-0.401154,-0.299507,0.037386,...,0.210982,0.030503,0.023984,0.207573,-0.091514,-0.158943,-0.256492,-0.388204,0.118326,0.630413
2,0.371729,0.184470,-0.109809,0.538492,-0.018363,-0.122899,0.174846,-0.322036,-0.253975,0.149930,...,0.195369,0.055656,-0.002014,0.153352,-0.100005,-0.092151,-0.190907,-0.329606,0.138748,0.579584
3,0.600971,0.297439,-0.111911,0.559607,-0.009318,-0.121301,0.039457,-0.511400,-0.267049,0.043824,...,0.197636,0.098420,0.050000,0.115709,-0.344762,-0.118781,-0.265280,-0.364896,0.118582,0.554404
4,0.632840,0.296010,-0.152971,0.472369,0.017174,-0.056241,-0.054852,-0.481512,-0.234971,0.058114,...,0.096561,0.042519,0.041854,0.111225,-0.360057,-0.130901,-0.250063,-0.395767,0.058708,0.480001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9714,0.473619,0.279207,-0.284579,0.924196,0.012810,0.120156,0.576967,-0.260284,-0.462023,0.228403,...,0.502928,0.011061,-0.088991,-0.601341,-0.012736,-0.012142,-0.402243,-0.368734,0.283944,0.451678
9715,0.465921,0.144171,0.239050,0.226988,0.243660,0.216020,-0.203083,0.074115,-0.154608,-0.053208,...,-0.006437,0.287399,0.013951,0.897651,-0.058454,-0.129062,0.046819,0.023545,0.490284,0.319883
9716,0.343311,0.044909,0.007661,0.295671,-0.079544,0.002768,0.249804,-0.100755,-0.292439,0.076306,...,0.147471,0.178680,0.119526,0.180649,-0.018509,-0.032755,-0.182607,-0.131810,-0.009466,0.181573
9717,0.375617,0.224416,0.453209,0.353183,-0.164524,-0.599295,-0.121729,-0.892286,-0.784110,0.503425,...,0.115170,0.365424,0.473131,0.075799,-0.498786,-0.295321,0.449818,0.104628,0.132734,0.480187


In [71]:
# Put the user ids and titles back on the predicted matrix.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.014377,3.883945,3.479784,4.404023,4.068524,1.213176,4.144984,2.185077,4.337481,3.703129,...,1.18233,3.3458,3.453337,2.580474,3.106538,3.885976,3.979394,2.341739,3.92844,0.79266
2,3.053739,3.186982,2.968619,3.87288,3.727942,1.113458,3.71171,1.809063,2.736354,3.06485,...,0.911296,3.889794,3.071641,2.506962,2.309363,4.021202,2.129816,1.627918,3.774734,0.728237
3,1.743848,1.223942,1.271098,1.700714,1.595127,0.420003,1.121711,0.876855,0.207292,1.503746,...,0.489179,2.365827,1.603514,1.528821,1.392357,2.537639,1.03932,0.632154,1.357239,0.321619


In [72]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels of ratings_matrix that userId has not rated.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Table with one row per user label and one column per movie.
    userId :
        Row label of the user to look up in ``ratings_matrix``.

    Returns
    -------
    list
        Column labels, in their original column order, whose value in the
        user's row is not greater than 0.
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A value > 0 marks the movie as already rated. Keep those labels in a set
    # so the membership test below is O(1) per movie instead of scanning a list.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Walk the columns in order so the result keeps the matrix's column order.
    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [73]:
def recommend_movie_by_userId(pred_df, userId, unseen_list, top_n=10):
    """Return the highest predicted scores for one user among the given movies.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted scores, one row per user label and one column per movie.
    userId :
        Row label of the user in ``pred_df``.
    unseen_list : list
        Column labels to consider as recommendation candidates.
    top_n : int, default 10
        Number of leading entries to keep after sorting.

    Returns
    -------
    pandas.Series
        Scores for the selected columns, sorted in descending order and cut to
        the first ``top_n`` entries.
    """
    # Restrict the user's predicted row to the candidate movies only.
    candidate_scores = pred_df.loc[userId, unseen_list]

    # Best predictions first, then keep the leading top_n entries.
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

# Movies that user 9 has not rated in the original ratings matrix.
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Top-20 predicted scores among those movies, presented as a one-column table.
recomm_movies = recommend_movie_by_userId(
    ratings_pred_matrix, 9, unseen_list, top_n=20
).to_frame(name='pred_score')

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Monty Python and the Holy Grail (1975),5.196588
Monty Python's Life of Brian (1979),5.070196
How the Grinch Stole Christmas! (1966),5.062015
"Lord of the Rings: The Return of the King, The (2003)",5.055983
Indiana Jones and the Last Crusade (1989),5.05579
"Pianist, The (2002)",4.890526
Midnight Cowboy (1969),4.881821
Austin Powers: International Man of Mystery (1997),4.879672
Austin Powers: The Spy Who Shagged Me (1999),4.875826
Star Wars: Episode V - The Empire Strikes Back (1980),4.874353


In [74]:
# Same pipeline as the previous cell: unseen titles for user 9 -> top-20 predicted scores.
unseen_list = get_unseen_movies(ratings_matrix, 9)
recomm_movies = recommend_movie_by_userId(ratings_pred_matrix, 9, unseen_list, top_n=20)

# Turn the resulting Series into a DataFrame with a single 'pred_score' column.
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Monty Python and the Holy Grail (1975),5.196588
Monty Python's Life of Brian (1979),5.070196
How the Grinch Stole Christmas! (1966),5.062015
"Lord of the Rings: The Return of the King, The (2003)",5.055983
Indiana Jones and the Last Crusade (1989),5.05579
"Pianist, The (2002)",4.890526
Midnight Cowboy (1969),4.881821
Austin Powers: International Man of Mystery (1997),4.879672
Austin Powers: The Spy Who Shagged Me (1999),4.875826
Star Wars: Episode V - The Empire Strikes Back (1980),4.874353
