In [1]:
import os
import json

In [3]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD, evaluate
from surprise.model_selection import cross_validate, train_test_split

In [4]:
# Load the review records. Every line of the file is parsed as a JSON
# document and `tenrevs` ends up bound to the last one parsed; later cells
# iterate it as a sequence of dicts with 'user_id', 'business_id' and 'stars'.
# The `with` block closes the file handle once reading is done (a bare
# open() inside the for-statement left it open).
with open('user10reviews.json', 'r') as reviews_file:
    for line in reviews_file:
        tenrevs = json.loads(line)

In [5]:
# Reduce every review to a (user_id, business_id, stars) triple.
cut = []
for review in tenrevs:
    triple = (review['user_id'], review['business_id'], review['stars'])
    cut.append(triple)

In [14]:
# Report how many review triples were extracted.
print('Number of Reviews: {}'.format(len(cut)))

Number of Reviews: 2295089


In [18]:
# Star value of every review, in the same order as `tenrevs`.
ratings = list(map(lambda review: review['stars'], tenrevs))

In [19]:
# Tally how many reviews carry each distinct star value; the Counter is the
# cell's displayed result.
from collections import Counter
Counter(ratings)

Counter({3.0: 402943, 1.0: 172811, 2.0: 217710, 4.0: 749878, 5.0: 751747})

In [21]:
# Inspect the Python type of the first rating value.
type(ratings[0])

float

In [7]:
# Collect the distinct business and user ids that appear in the reviews.
# sorted() pins the order of each list: iterating a bare set of strings can
# differ between interpreter runs (string hashing is randomised per process),
# and the integer indices derived from these lists would differ with it.
# NOTE(review): assumes the ids are mutually comparable (e.g. all strings) — confirm.
businesses = [r['business_id'] for r in tenrevs]
unique_businesses = sorted(set(businesses))
users = [r['user_id'] for r in tenrevs]
unique_users = sorted(set(users))
print('Number of Unique Businesses: '+ str(len(unique_businesses)))
print('Number of Unique Users: '+ str(len(unique_users)))

Number of Unique Businesses: 73100
Number of Unique Users: 81416


In [8]:
# Map each id to its position in the corresponding unique-id list.
indexed_users = dict(zip(unique_users, range(len(unique_users))))
indexed_businesses = dict(zip(unique_businesses, range(len(unique_businesses))))

In [9]:
# Swap the raw ids in every triple for their integer indices; the star value
# is carried over untouched.
indexed_data = [
    (indexed_users[user_id], indexed_businesses[business_id], stars)
    for user_id, business_id, stars in cut
]

In [10]:
import csv

# Persist the (user index, business index, stars) rows as a headerless CSV.
# newline='' stops the text layer from translating line endings, so the
# explicit '\n' terminator is written verbatim on every platform, as the csv
# module documentation asks for file objects handed to csv.writer.
with open('finaldata.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows(indexed_data)

In [11]:
# Path handed to the dataset loader. The literal has no leading '~', so
# expanduser returns it unchanged and the path stays relative to the current
# working directory.
file_path = os.path.expanduser('finaldata.csv')

In [12]:
# Describe the ratings file layout: three fields per line in the order
# user, item, rating, separated by commas.
# NOTE(review): no rating_scale is passed, so the library default applies —
# confirm it covers the 1.0-5.0 star values seen in the data.
reader = Reader(line_format='user item rating', sep=',')

In [13]:
# Load the ratings file into a Dataset, parsed according to `reader`.
data = Dataset.load_from_file(file_path, reader=reader)

In [11]:
# Request 5 cross-validation folds on the dataset.
# NOTE(review): Dataset.split() looks to be deprecated in later Surprise
# releases in favour of the surprise.model_selection tools — confirm against
# the installed version before re-running.
data.split(n_folds=5)

In [16]:
# SVD model built with no arguments, i.e. the library's default settings.
svd = SVD()

In [17]:
# Evaluate SVD fold by fold on `data`, reporting RMSE and MAE per fold plus
# their means; the per-fold scores are returned as the cell's result.
# NOTE(review): evaluate() looks to be deprecated in later Surprise releases;
# cross_validate (already imported in this notebook) is the likely
# replacement — confirm against the installed version.
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.0919
MAE:  0.8586
------------
Fold 2
RMSE: 1.0911
MAE:  0.8584
------------
Fold 3
RMSE: 1.0906
MAE:  0.8573
------------
Fold 4
RMSE: 1.0926
MAE:  0.8598
------------
Fold 5
RMSE: 1.0926
MAE:  0.8592
------------
------------
Mean RMSE: 1.0917
Mean MAE : 0.8587
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.0919205372150012,
                             1.0911238090890574,
                             1.0905641164574549,
                             1.0925744889239362,
                             1.0925621922072946],
                            'mae': [0.858605564047045,
                             0.8583596034838482,
                             0.8573090440636588,
                             0.8597598485326621,
                             0.8592491339068206]})

In [22]:
from surprise.model_selection import GridSearchCV

In [23]:
# Hyperparameter values to search for SVD: bias terms on/off crossed with
# three learning rates.
param_grid = dict(
    biased=[True, False],
    lr_all=[0.001, 0.005, 0.05],
)

# Grid search over SVD, scored with RMSE and MAE using cv=3.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)

In [None]:
# Run the grid search on the full dataset.
# NOTE(review): this cell has no execution count or recorded output, so it
# does not appear to have completed — expect it to be slow on ~2.3M ratings.
gs.fit(data)

In [15]:
# SVDpp is not among the names imported at the top of the notebook, so bring
# it in here; without this the cell raises NameError on a fresh kernel.
from surprise import SVDpp

# SVD++ model built with no arguments, i.e. the library's default settings.
svdpp = SVDpp()

In [16]:
# Evaluate SVDpp fold by fold on `data`, reporting RMSE and MAE.
# NOTE(review): the recorded output lists 10 folds while the split cell in
# this notebook requests 5, so the output presumably comes from an earlier
# session — re-run to refresh it.
evaluate(svdpp, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 1.2451
MAE:  0.9911
------------
Fold 2
RMSE: 1.2439
MAE:  0.9904
------------
Fold 3
RMSE: 1.2435
MAE:  0.9902
------------
Fold 4
RMSE: 1.2439
MAE:  0.9898
------------
Fold 5
RMSE: 1.2423
MAE:  0.9894
------------
Fold 6
RMSE: 1.2454
MAE:  0.9917
------------
Fold 7
RMSE: 1.2423
MAE:  0.9888
------------
Fold 8
RMSE: 1.2440
MAE:  0.9906
------------
Fold 9
RMSE: 1.2444
MAE:  0.9909
------------
Fold 10
RMSE: 1.2435
MAE:  0.9898
------------
------------
Mean RMSE: 1.2438
Mean MAE : 0.9903
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.245125078509003,
                             1.243877565662575,
                             1.243534904258009,
                             1.2439133989460236,
                             1.2422777819164104,
                             1.2454152231851392,
                             1.242282298531058,
                             1.2440367384772635,
                             1.244350336377197,
                             1.243543967201196],
                            'mae': [0.9910805188094426,
                             0.9903994289118054,
                             0.9901717944360934,
                             0.989838450409646,
                             0.9894138213447615,
                             0.9917496974624626,
                             0.9887548745704519,
                             0.9905598968497563,
                             0.9908861770725602,
                           

In [18]:
# NMF is not among the names imported at the top of the notebook, so bring
# it in here; without this the cell raises NameError on a fresh kernel.
from surprise import NMF

# NMF model built with no arguments, i.e. the library's default settings.
nmf = NMF()

In [19]:
# Evaluate NMF fold by fold on `data`, reporting RMSE and MAE.
# NOTE(review): the recorded output lists 10 folds while the split cell in
# this notebook requests 5, so the output presumably comes from an earlier
# session — re-run to refresh it.
evaluate(nmf, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm NMF.

------------
Fold 1
RMSE: 1.3794
MAE:  1.0783
------------
Fold 2
RMSE: 1.3801
MAE:  1.0787
------------
Fold 3
RMSE: 1.3803
MAE:  1.0797
------------
Fold 4
RMSE: 1.3790
MAE:  1.0774
------------
Fold 5
RMSE: 1.3789
MAE:  1.0773
------------
Fold 6
RMSE: 1.3810
MAE:  1.0798
------------
Fold 7
RMSE: 1.3789
MAE:  1.0777
------------
Fold 8
RMSE: 1.3796
MAE:  1.0786
------------
Fold 9
RMSE: 1.3799
MAE:  1.0785
------------
Fold 10
RMSE: 1.3790
MAE:  1.0779
------------
------------
Mean RMSE: 1.3796
Mean MAE : 1.0784
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.3794350253411565,
                             1.3800631753244952,
                             1.3803470449633781,
                             1.3790474183963373,
                             1.3788653317907658,
                             1.3809921876160387,
                             1.3788922356505084,
                             1.3796261813374224,
                             1.3799325331811927,
                             1.3790269179343493],
                            'mae': [1.0782924691081697,
                             1.078739278586975,
                             1.07971219559644,
                             1.077435373740628,
                             1.0773078568785115,
                             1.0797835618332265,
                             1.0776545359866114,
                             1.0785939513347529,
                             1.0785215794765843,
                        