In [61]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
import seaborn as sns
%matplotlib inline

In [62]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def cf_rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [63]:
df_review_train = pd.read_csv("Montreal_review_train.csv")
df_review_test = pd.read_csv("Montreal_review_test.csv")
df_user_train = pd.read_csv("Montreal_user_train.csv")
df_user_test = pd.read_csv("Montreal_user_test.csv")
df_business_train = pd.read_csv("Montreal_business_train.csv")
df_business_test = pd.read_csv("Montreal_business_test.csv")

In [64]:
df_user_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,...,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since,elite_status
0,0,10,4.1,1302,41,1302,806,20,46,725,...,"[2016, 2014, 2015, 2017, 2012, 2011, 2013]",435,"['xRYvFaMGWsvKcLCFtRIzWQ', 'zvQ7B3KZuFOX7pYLsO...",4880,Risa,1122,26395,Wc5L6iuvSNF5WGBlqIO8nw,2011-07-30,Yes
1,1,13,3.79,1139,87,1139,782,54,103,391,...,"[2012, 2008, 2009, 2010, 2007, 2006, 2013, 2011]",198,"['KOwp5RDbm7cDyrdXN8FVQQ', '7MlH7OevWSkenMyKFI...",10715,Holly,698,24047,Dd-TkEszFMkSF-vRih51fQ,2006-07-03,Yes
2,2,37,3.74,129,2,129,77,6,12,56,...,"[2017, 2016, 2012, 2014, 2015, 2011, 2013]",68,"['Cq8uhBLRO1T9l-9R9OmddQ', 'x3_b9Rv-GZpjtCDLqg...",105,Jeff,754,151,YTdNcIWAt2nEzZ7NY-fniw,2011-05-16,Yes
3,3,110,4.07,60,1,60,51,1,16,19,...,"[2014, 2012, 2015, 2011, 2013, 2016, 2017]",33,"['8s7UH21vFgkRJAJg2L8VzA', 'HWGrt1MEXlzZ71NGx0...",9,Cecille,356,36,bTRFge5pRWMh7IoCLn7lBw,2007-08-03,Yes
4,4,117,3.64,23,2,23,31,0,3,13,...,"[2012, 2013]",15,"['G-Hav6XBWPEyzI-0nNpdxw', 'EgqsK7MUgqpbaTVZAv...",36,Carolina,115,89,-w7ww3yW5BHE3TFyj3IHuQ,2010-06-29,Yes


In [65]:
# checking dimensions and unique users before fitting the model
# count number of unique reviews
print(df_review_train.shape)
print(len(df_review_train['user_id'].unique()))
print(len(df_review_train['business_id'].unique()))

# count number of uniqe users
print(df_user_train.shape)
print(len(df_user_train['user_id'].unique()))

# count number of uniqe business
print(df_business_train.shape)
print(len(df_business_train['business_id'].unique()))

(30999, 11)
3201
2429
(3201, 25)
3201
(2429, 18)
2429


In [66]:
# creating a user/restaurant pivot table
pivot_review_train = df_review_train.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
pivot_review_test = df_review_test.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)

In [67]:
from scipy.sparse.linalg import svds

In [68]:
# Singular Value Decomposition (SVD) for users
def user_svd_predict(df, df_):
    R = df.as_matrix()
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    U = []
    sigma = []
    Vt = []
    U, sigma, Vt = svds(R_demeaned, k=20)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    average_rating = list(df_['average_stars'])
    average_rating = np.array(average_rating).reshape(-1, 1)
    average_rating = np.repeat(average_rating, all_user_predicted_ratings.shape[1], axis=1)
    all_user_predicted_ratings_total = all_user_predicted_ratings + average_rating
    return (all_user_predicted_ratings_total, all_user_predicted_ratings)

In [69]:
# Singular Value Decomposition (SVD) for business
def business_svd_predict(df, df_):
    R = df.as_matrix()
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    U = []
    sigma = []
    Vt = []
    U, sigma, Vt = svds(R_demeaned, k=20)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    average_rating = list(df_['stars'])
    average_rating = np.array(average_rating).reshape(-1, 1)
    average_rating = np.repeat(average_rating, all_user_predicted_ratings.shape[1], axis=1)
    all_user_predicted_ratings_total = all_user_predicted_ratings + average_rating
    return (all_user_predicted_ratings_total, all_user_predicted_ratings)

In [70]:
all_user_predicted_ratings_total = user_svd_predict(pivot_review_train, df_user_train)[0]
all_user_predicted_ratings_total_test = user_svd_predict(pivot_review_test, df_user_test)[0]
all_business_predicted_ratings_total = business_svd_predict(pivot_review_train.T, df_business_train)[0]
all_business_predicted_ratings_total_test = business_svd_predict(pivot_review_test.T, df_business_test)[0]

In [71]:
user_id_train = df_user_train['user_id']
user_id_test = df_user_test['user_id']

business_id_train = df_business_train['business_id']
business_id_test = df_business_test['business_id']

In [72]:
preds_df_train = pd.DataFrame(all_user_predicted_ratings_total, columns=pivot_review_train.columns, index=user_id_train)
preds_df_train.head()

business_id,-0uEqc2vw1xXtuI_r1xTNg,-1xuC540Nycht_iWFeJ-dw,-7bRnaHp7OHz8KW-THqP4w,-92cC6-X87HQ1DE1UHOx3w,-AgfhwHOYrsPKt-_xV_Ipg,-BPHhtX6zzI59IX7ZY-AQA,-FDkvLmwaBrtVgYFqEWeWA,-FPc3kwUU9GTDd4LzurvTQ,-GHqz1jGYzAtn27CeHeWeA,-HsqnPAz374YSoyFDyjl3A,...,zqV3T9HltH1pmlRFJJSFcA,zr2wA55AskfBJxrvUeDZRA,zrnP9HqoF-RI9jqoW8pytA,zsMMlOYtXm8SNy0bl1leBA,zsbsLCO-bw3gdNE9XNgBYw,zv92BYJH09YjFQOtSyYp-A,zwBEMcCVqh8wOXn_sOIfxg,zwgVuZcMgijt9k3Jq-2zQQ,zwkif4XLEDqdEwEgTWLIVQ,zzjKekzQ6i4iR-qpo405Pw
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wc5L6iuvSNF5WGBlqIO8nw,4.094261,4.240964,4.093483,4.127661,4.099962,4.098311,4.099023,4.09319,4.094802,4.143096,...,4.109433,4.13264,4.09452,4.115686,4.102577,4.093049,4.098747,4.0951,4.119639,4.098103
Dd-TkEszFMkSF-vRih51fQ,3.795136,3.815809,3.795515,3.798954,3.795709,3.795789,3.796298,3.795232,3.795024,3.803154,...,3.795918,3.795367,3.795871,3.796287,3.797605,3.795475,3.797292,3.795636,3.797662,3.795856
YTdNcIWAt2nEzZ7NY-fniw,3.749236,3.817079,3.748527,3.757432,3.750475,3.751114,3.749137,3.748381,3.748697,3.757516,...,3.756116,3.770139,3.747204,3.758025,3.747736,3.747685,3.747688,3.748463,3.756472,3.751034
bTRFge5pRWMh7IoCLn7lBw,4.09221,4.509308,4.090106,4.152235,4.101792,4.104338,4.097496,4.088443,4.089012,4.167343,...,4.132688,4.203549,4.084595,4.142995,4.095679,4.08572,4.093583,4.090215,4.140908,4.104515
-w7ww3yW5BHE3TFyj3IHuQ,3.639581,3.764347,3.639625,3.662383,3.638872,3.644714,3.642259,3.63867,3.636508,3.672317,...,3.651202,3.67361,3.638274,3.655888,3.643778,3.638415,3.641998,3.639788,3.654747,3.642414


In [73]:
preds_df_test = pd.DataFrame(all_user_predicted_ratings_total_test, columns=pivot_review_test.columns, index=user_id_test)
preds_df_test.head()

business_id,-0uEqc2vw1xXtuI_r1xTNg,-1xuC540Nycht_iWFeJ-dw,-7bRnaHp7OHz8KW-THqP4w,-92cC6-X87HQ1DE1UHOx3w,-AgfhwHOYrsPKt-_xV_Ipg,-FDkvLmwaBrtVgYFqEWeWA,-FPc3kwUU9GTDd4LzurvTQ,-HsqnPAz374YSoyFDyjl3A,-MwaICRwxaUi0JBfad2Y3Q,-Mz3M0g6iFZczs6a7ddf5g,...,zktCQRlDtF6XmOpqKBz1mA,zmQyE-gIUpwBCMmTFFRbJw,zpw5S3QwUse1MH-Eerbnaw,zqV3T9HltH1pmlRFJJSFcA,zr2wA55AskfBJxrvUeDZRA,zrnP9HqoF-RI9jqoW8pytA,zsMMlOYtXm8SNy0bl1leBA,zwBEMcCVqh8wOXn_sOIfxg,zwgVuZcMgijt9k3Jq-2zQQ,zwkif4XLEDqdEwEgTWLIVQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wc5L6iuvSNF5WGBlqIO8nw,4.10251,4.110007,4.102534,4.102885,4.102467,4.102479,4.102763,4.102247,4.102289,4.104428,...,4.10346,4.102425,4.102717,4.09952,4.103339,4.102584,4.10218,4.102489,4.102553,4.102519
Dd-TkEszFMkSF-vRih51fQ,3.790563,3.789336,3.790566,3.790516,3.790554,3.79056,3.79056,3.790465,3.790527,3.790369,...,3.790459,3.790552,3.790517,3.790075,3.790373,3.790565,3.790518,3.790547,3.79057,3.790566
YTdNcIWAt2nEzZ7NY-fniw,3.74223,3.737859,3.742241,3.742058,3.742187,3.742228,3.74224,3.741911,3.742093,3.741819,...,3.741991,3.742182,3.742086,3.739798,3.741622,3.742217,3.742059,3.742178,3.742257,3.74224
bTRFge5pRWMh7IoCLn7lBw,4.078794,4.12272,4.078413,4.079142,4.080528,4.078799,4.0791,4.084911,4.080099,4.087041,...,4.082517,4.079704,4.083314,4.125264,4.082772,4.077376,4.082331,4.079579,4.078196,4.078464
-w7ww3yW5BHE3TFyj3IHuQ,3.64245,3.654639,3.64234,3.643128,3.642614,3.642305,3.642277,3.641956,3.642936,3.641301,...,3.642076,3.642766,3.642744,3.665249,3.643117,3.642087,3.643506,3.642632,3.642247,3.642341


In [74]:
# simple recommendation system
def simple_recommend(user_id, count):
    df = preds_df_train.T[user_id].sort_values(ascending=False)
    return df.head(count)

In [75]:
# recommend the top ten restaurants to a user
simple_recommend('yML2P1evj7FrLncIgaFzHw', 10)

business_id
wzugmCevnXuCMCF4upAf0w    3.934835
FhgAHo-8--equM8w5UZ41Q    3.788509
IRIlwpomRvnXvpkeaGaM2A    3.763541
46Ld9Qc9nAx_A0jwclNZiw    3.747641
kKY726bQREexYHHNLK1H7g    3.746923
_K63HbZBVQSBCvQicQdl-A    3.743300
JN8s_dgw9nrSzkHnXxNOtg    3.741336
9xSwne4GjwZ6Hlzdx2Zszg    3.728956
y32M2Hkr7GsUqGG6KwOhZw    3.721766
58APdML-PG_OD4El2ePTvw    3.716126
Name: yML2P1evj7FrLncIgaFzHw, dtype: float64

In [76]:
from sklearn.metrics.pairwise import pairwise_distances

In [77]:
# use cosine distances to build a cosine similarity matrix for the users
user_similarity_train = 1 - pairwise_distances(user_svd_predict(pivot_review_train, df_user_train)[1], metric='cosine')
user_similarity_test = 1 - pairwise_distances(user_svd_predict(pivot_review_test, df_user_test)[1], metric='cosine')

In [78]:
# use cosine distances to build a business similarity matrix for the businesses
business_similarity_train = 1 - pairwise_distances(business_svd_predict(pivot_review_train.T, df_business_train)[1], metric='cosine')
business_similarity_test = 1 - pairwise_distances(business_svd_predict(pivot_review_test.T, df_business_test)[1], metric='cosine')

In [79]:
# train user similarity matrix
user_similarity_matrix_train = pd.DataFrame(user_similarity_train, columns=user_id_train, index=user_id_train)
user_similarity_matrix_train.head()

user_id,Wc5L6iuvSNF5WGBlqIO8nw,Dd-TkEszFMkSF-vRih51fQ,YTdNcIWAt2nEzZ7NY-fniw,bTRFge5pRWMh7IoCLn7lBw,-w7ww3yW5BHE3TFyj3IHuQ,4hAauH0dy57uK9o8bCvGUw,VMfwMYh8iJapW807Pu1Diw,lKRbcLWDQmOmhcMa3vMCMA,2vJ2e51kdbdAmAo_HTr4KQ,9KpMzih4E_gEioFtNeuIIw,...,v7q2D8s1vsglwQaQcyb8_A,hOYNnE3qzb8TDKd3jqvq7Q,LqywrHdM-H8gSdKtGrhBuw,iIIbkFd_kgK3n2ewvLstXA,KJIS0INMJKhBmGqFkHMc-A,Ih3dwaCS1snsbhS8vRdxHA,LY-KaOJyXzbwZyqjQfl7xA,e3XuTKzX3w8LP-mEqQgJ9g,awdAcl2dA_WvUPWKOCS1OA,0wXvG8Jiu8zdZhvezBgOwA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wc5L6iuvSNF5WGBlqIO8nw,1.0,0.720741,0.412029,0.720665,0.913932,0.815495,0.460906,0.891103,0.321446,0.128758,...,0.375954,0.595463,0.697501,0.289753,0.515562,0.015882,0.810667,0.730153,0.474537,0.494754
Dd-TkEszFMkSF-vRih51fQ,0.720741,1.0,0.53149,0.693314,0.700075,0.830051,0.561576,0.67344,0.521506,0.113499,...,0.338748,0.963002,0.818725,0.829342,0.670527,-0.057316,0.8087,0.825599,0.745091,0.32213
YTdNcIWAt2nEzZ7NY-fniw,0.412029,0.53149,1.0,0.916873,0.631012,0.71577,0.959554,0.682519,0.970352,0.104116,...,0.267434,0.422807,0.452622,0.317162,0.909885,0.269327,0.759309,0.313519,0.245549,0.900771
bTRFge5pRWMh7IoCLn7lBw,0.720665,0.693314,0.916873,1.0,0.855779,0.911067,0.896724,0.882128,0.849233,0.120329,...,0.306988,0.56397,0.591316,0.353743,0.919337,0.196887,0.921728,0.570053,0.458413,0.886281
-w7ww3yW5BHE3TFyj3IHuQ,0.913932,0.700075,0.631012,0.855779,1.0,0.826568,0.600692,0.873054,0.599827,-0.113172,...,0.187449,0.549721,0.541346,0.20923,0.71006,-0.042394,0.863534,0.702391,0.460178,0.651209


In [80]:
# test user similarity matrix
user_similarity_matrix_test = pd.DataFrame(user_similarity_test, columns=user_id_test, index=user_id_test)
user_similarity_matrix_test.head()

user_id,Wc5L6iuvSNF5WGBlqIO8nw,Dd-TkEszFMkSF-vRih51fQ,YTdNcIWAt2nEzZ7NY-fniw,bTRFge5pRWMh7IoCLn7lBw,-w7ww3yW5BHE3TFyj3IHuQ,4hAauH0dy57uK9o8bCvGUw,VMfwMYh8iJapW807Pu1Diw,lKRbcLWDQmOmhcMa3vMCMA,2vJ2e51kdbdAmAo_HTr4KQ,9KpMzih4E_gEioFtNeuIIw,...,v7q2D8s1vsglwQaQcyb8_A,hOYNnE3qzb8TDKd3jqvq7Q,LqywrHdM-H8gSdKtGrhBuw,iIIbkFd_kgK3n2ewvLstXA,KJIS0INMJKhBmGqFkHMc-A,Ih3dwaCS1snsbhS8vRdxHA,LY-KaOJyXzbwZyqjQfl7xA,e3XuTKzX3w8LP-mEqQgJ9g,awdAcl2dA_WvUPWKOCS1OA,0wXvG8Jiu8zdZhvezBgOwA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wc5L6iuvSNF5WGBlqIO8nw,1.0,0.792956,0.816564,0.379237,0.270042,0.789788,0.79494,0.841365,-0.042986,0.743116,...,0.824476,0.53765,0.857398,0.924114,0.89095,0.705694,0.786955,0.826806,0.828485,0.86585
Dd-TkEszFMkSF-vRih51fQ,0.792956,1.0,0.993514,0.366125,0.308845,0.983085,0.981985,0.88523,-0.051364,0.89801,...,0.995281,0.790401,0.907546,0.954671,0.811427,0.314816,0.701796,0.991925,0.656752,0.877802
YTdNcIWAt2nEzZ7NY-fniw,0.816564,0.993514,1.0,0.373322,0.264219,0.989574,0.977486,0.913252,-0.036936,0.914095,...,0.990583,0.813819,0.913914,0.961988,0.826734,0.369758,0.746545,0.994725,0.713984,0.904612
bTRFge5pRWMh7IoCLn7lBw,0.379237,0.366125,0.373322,1.0,0.839735,0.36834,0.531877,0.521869,0.250402,0.643674,...,0.382215,0.505986,0.478432,0.429181,0.520249,0.541035,0.645821,0.389246,0.582344,0.429117
-w7ww3yW5BHE3TFyj3IHuQ,0.270042,0.308845,0.264219,0.839735,1.0,0.276764,0.456102,0.300552,-0.040324,0.475806,...,0.326871,0.30528,0.303664,0.354267,0.466218,0.323128,0.345939,0.275958,0.315474,0.312387


In [81]:
# train business similarity matrix
business_similarity_matrix_train = pd.DataFrame(business_similarity_train, columns=business_id_train, index=business_id_train)
business_similarity_matrix_train.head()

business_id,58APdML-PG_OD4El2ePTvw,8Rdz0VPY8CuT2GQZ7ho2sw,DAMTCTsSeACXbkSABkhZqQ,6I6uDGwCDggrWXi2T4lfaA,qUdGBSFkiPhEL6I718y-Gg,ujcbqs6jZfaESgSLvbjWuQ,XjbPr3o-YTsticeavLjTEg,Y22IfhXChXoRp3vKi6QwaQ,MhINNBBwzGn4-n_YI67wog,OLg1IeS-QxZgNprQ4Hg9gg,...,LLBmqBunk40IHdHH_QfjkA,-ZHeHh4bwLlecbcAD7fTqw,SnD7fcwR4NR7Cgtx7Qm4ZQ,ml7HQlaAcszdBZZHljvYgg,Y5I-z2S3Eeno6cDyn0e6Cg,ODZLMTbjCnpDNkW1JbMjlQ,kWDAdT4m3vbnmE0CgLs4gA,rofWaZTIuaedAxT_UKleSw,bYfEp3NMskYfEzWL8tVb4w,HzUxQ1WpeNmeecXN-HPlPw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58APdML-PG_OD4El2ePTvw,1.0,0.604929,0.306082,0.684301,0.534033,0.56272,0.524335,0.474005,0.54305,0.687466,...,0.869826,0.768017,-0.103955,0.787516,0.251441,0.14667,0.37393,0.649073,0.811284,0.55929
8Rdz0VPY8CuT2GQZ7ho2sw,0.604929,1.0,0.771181,0.668666,0.446506,0.806925,0.40733,0.237145,0.190323,0.863641,...,0.625452,0.585623,0.172381,0.678374,0.591647,0.155667,0.813378,0.515024,0.721343,0.82473
DAMTCTsSeACXbkSABkhZqQ,0.306082,0.771181,1.0,0.602944,0.192602,0.909381,0.311445,0.273086,-0.225258,0.532435,...,0.507281,0.601425,0.140032,0.577847,0.582765,0.133754,0.715401,0.03736,0.430301,0.808834
6I6uDGwCDggrWXi2T4lfaA,0.684301,0.668666,0.602944,1.0,0.601799,0.764556,0.57964,0.466201,0.330502,0.82399,...,0.905221,0.948628,0.225421,0.931421,0.803133,0.17724,0.568698,0.37207,0.904395,0.768868
qUdGBSFkiPhEL6I718y-Gg,0.534033,0.446506,0.192602,0.601799,1.0,0.215027,0.283366,0.191661,0.6801,0.482624,...,0.702884,0.575249,0.144105,0.644695,0.478036,-0.127833,0.296017,0.236424,0.727604,0.681321


In [82]:
# test business similarity matrix
business_similarity_matrix_test = pd.DataFrame(business_similarity_test, columns=business_id_test, index=business_id_test)
business_similarity_matrix_test.head()

business_id,58APdML-PG_OD4El2ePTvw,DAMTCTsSeACXbkSABkhZqQ,6I6uDGwCDggrWXi2T4lfaA,qUdGBSFkiPhEL6I718y-Gg,ujcbqs6jZfaESgSLvbjWuQ,Y22IfhXChXoRp3vKi6QwaQ,MhINNBBwzGn4-n_YI67wog,OLg1IeS-QxZgNprQ4Hg9gg,DwJlGxAJvohbDR_5jV-ERA,i5j3FrxdR224KIjfv8x2CQ,...,3uu5jvP5JKdSUW9jk-HO7A,Akhq4AKxKRDPa6BHpiSEVQ,LLBmqBunk40IHdHH_QfjkA,-ZHeHh4bwLlecbcAD7fTqw,SnD7fcwR4NR7Cgtx7Qm4ZQ,ml7HQlaAcszdBZZHljvYgg,Y5I-z2S3Eeno6cDyn0e6Cg,rofWaZTIuaedAxT_UKleSw,bYfEp3NMskYfEzWL8tVb4w,HzUxQ1WpeNmeecXN-HPlPw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58APdML-PG_OD4El2ePTvw,1.0,0.178899,0.951031,0.691397,0.259757,0.940552,0.816696,0.776824,0.535893,0.260423,...,-0.007849,0.824773,0.390546,0.062772,0.329732,0.578336,0.9152,0.217883,0.971168,0.982351
DAMTCTsSeACXbkSABkhZqQ,0.178899,1.0,0.210156,0.657375,0.310377,0.164341,0.301864,0.46197,0.494442,0.645157,...,0.623212,0.415439,0.526627,0.370224,0.874367,0.416135,0.267851,0.607619,0.164944,0.169984
6I6uDGwCDggrWXi2T4lfaA,0.951031,0.210156,1.0,0.749799,0.151422,0.90013,0.774837,0.741639,0.50473,0.219069,...,-0.05207,0.696736,0.268463,-0.109877,0.366669,0.718121,0.81446,0.064799,0.984445,0.990368
qUdGBSFkiPhEL6I718y-Gg,0.691397,0.657375,0.749799,1.0,0.193305,0.649815,0.592183,0.545754,0.660185,0.520032,...,0.320945,0.654766,0.33788,0.193709,0.740265,0.487023,0.675719,0.436975,0.703693,0.708155
ujcbqs6jZfaESgSLvbjWuQ,0.259757,0.310377,0.151422,0.193305,1.0,0.164879,0.139443,0.592555,0.200928,0.207123,...,-0.063087,0.316322,0.884324,0.251879,0.063906,0.038891,0.210878,0.226732,0.215547,0.201362


In [83]:
# merge datasets
df1 = pd.merge(df_user_train, df_review_train, on='user_id')
df_train_total = pd.merge(df1, df_business_train, on='business_id')

df2 = pd.merge(df_user_test, df_review_test, on='user_id')
df_test_total = pd.merge(df2, df_business_test, on='business_id')

In [84]:
# create pivot tables
pivot_user_train = df_train_total.pivot(index = 'user_id', columns ='business_id', values = 'average_stars').fillna(0)
pivot_user_test = df_test_total.pivot(index = 'user_id', columns ='business_id', values = 'average_stars').fillna(0)
pivot_business_train = df_train_total.pivot(index = 'user_id', columns ='business_id', values = 'stars_y').fillna(0)
pivot_business_test = df_test_total.pivot(index = 'user_id', columns ='business_id', values = 'stars_y').fillna(0)

In [85]:
# make a user-business train matrix with train set mean
items_train = df_review_train.shape[0]
total_train = np.sum(df_review_train['stars'])
global_mean_train = total_train / items_train
print(global_mean_train)
pivot_user_train[pivot_user_train != 0] = global_mean_train

# make a user-business test matrix with test set mean
items_test = df_review_test.shape[0]
total_test = np.sum(df_review_test['stars'])
global_mean_test = total_test / items_test
print(global_mean_test)
pivot_user_test[pivot_user_test != 0] = global_mean_test

3.820671634568857
3.8194511314395765


In [86]:
# predict function to predict the user-user & restaurant-restaurant residuals
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [87]:
# calculate train set residuals using item similarities matrix and restaurant similarities
train_data_matrix = user_svd_predict(pivot_review_train, df_user_train)[1]
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# calculate test set residuals using item similarities matrix and restaurant similarities
test_data_matrix = user_svd_predict(pivot_review_test, df_user_test)[1]
item_similarity_test = pairwise_distances(test_data_matrix.T, metric='cosine')
user_similarity_test = pairwise_distances(test_data_matrix, metric='cosine')
item_prediction_test = predict(test_data_matrix, item_similarity_test, type='item')
user_prediction_test = predict(test_data_matrix, user_similarity_test, type='user')

In [88]:
# converting dataframes into matrices
pivot_train = pivot_review_train.as_matrix()
pivot_test = pivot_review_test.as_matrix()
pivot_user_train_ = pivot_user_train.as_matrix()
pivot_user_test_ = pivot_user_test.as_matrix()
pivot_business_train_ = pivot_business_train.as_matrix()
pivot_business_test_ = pivot_business_test.as_matrix()

In [89]:
# predict on train set: add the item-item & restaurant-restaurant residuals to the train set mean rating
pivot_pred_train = np.add(item_prediction, pivot_user_train_)
pivot_pred_train = np.add(pivot_pred_train, user_prediction)

# predict on trest set: add the item-item & restaurant-restaurant residuals to the test set mean rating
pivot_pred_test = np.add(item_prediction_test, pivot_user_test_)
pivot_pred_test = np.add(pivot_pred_test, user_prediction_test)

In [90]:
# calculate RMSE for test and train set
print("CF Train RMSE score is {}".format(cf_rmse(pivot_train, pivot_pred_train)))
print("CF Test RMSE score is {}".format(cf_rmse(pivot_test, pivot_pred_test)))

CF Train RMSE score is 0.09213448455145726
CF Test RMSE score is 0.05653953923441608
