# Model Testing
### Import libraries and assign variable names to data. 

In [1]:
import os
import json
import pandas as pd

In [2]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, accuracy
from surprise import dump
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV


In [3]:
file_path = os.path.expanduser('review_data.csv')

In [4]:
reader = Reader(line_format='user item rating', sep=',')

In [5]:
data = Dataset.load_from_file(file_path, reader=reader)

## Chosen Model: SVD (results for another model, NMF, are shown after the grid searches below)

In [7]:
svd = SVD()

In [8]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

{'test_rmse': array([1.092094  , 1.09059895, 1.09241337, 1.0921439 , 1.09083689]),
 'test_mae': array([0.85843544, 0.85734759, 0.85942355, 0.85954969, 0.85822846]),
 'fit_time': (159.57817721366882,
  151.42970180511475,
  158.18164038658142,
  159.7816367149353,
  151.8691599369049),
 'test_time': (7.196346044540405,
  6.307776927947998,
  6.693173885345459,
  7.134148120880127,
  6.588807821273804)}

## Gridsearch 1 -- Tuning the learning rate (`lr_all`) and the `biased` flag with GridSearchCV.

In [9]:
# Grid search 1: try each `biased` setting with each `lr_all` value and score
# every candidate on RMSE and MAE using 3-fold cross-validation.
param_grid = {'biased': [True, False], 'lr_all': [0.001, 0.005, 0.05]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# Candidate values for parameters that are NOT part of this grid:
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [None]:
gs.fit(data)

In [None]:
results = gs.cv_results

In [26]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
#results_df.to_csv('results.csv')

In [27]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_biased,param_lr_all
0,1.121815,1.123022,1.122295,1.122377,0.000496,2,0.894917,0.89533,0.895943,0.895397,0.000421,3,101.287115,12.756166,55.179468,56.157928,"{'biased': True, 'lr_all': 0.001}",True,0.001
1,1.095546,1.096588,1.09628,1.096138,0.000437,1,0.863294,0.863555,0.863951,0.8636,0.00027,1,122.237027,4.096766,125.242564,10.203921,"{'biased': True, 'lr_all': 0.005}",True,0.005
2,1.145895,1.146561,1.146847,1.146434,0.000399,3,0.887526,0.887307,0.888239,0.88769,0.000398,2,415.258017,388.697796,2648.197713,2215.837836,"{'biased': True, 'lr_all': 0.05}",True,0.05
3,2.665475,2.674566,2.661799,2.66728,0.005366,6,2.331765,2.341412,2.32667,2.333283,0.006113,6,129.907277,11.421524,178.133461,49.572544,"{'biased': False, 'lr_all': 0.001}",False,0.001
4,1.397292,1.40031,1.397937,1.398513,0.001297,5,1.110692,1.112792,1.11097,1.111485,0.000931,5,107.601211,8.414771,120.542719,10.293368,"{'biased': False, 'lr_all': 0.005}",False,0.005
5,1.246154,1.24663,1.246168,1.246317,0.000221,4,0.982923,0.982756,0.983399,0.983026,0.000272,4,105.472106,6.496184,81.059657,12.616613,"{'biased': False, 'lr_all': 0.05}",False,0.05


#### Best LR = 0.005, Bias = True

## Gridsearch 2 -- Tuning the number of latent factors (`n_factors`) with GridSearchCV.

In [47]:
# Grid search 2: only `n_factors` varies. `biased` and `lr_all` are pinned to
# the values reported as best after grid search 1; `n_epochs` is fixed at 20.
param_grid = {'n_factors': [1, 5, 10], 'biased': [True], 'lr_all': [0.005],
              'n_epochs': [20]}
             
gs2 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# Candidate values for parameters that are NOT varied in this grid:
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [48]:
gs2.fit(data)

In [49]:
results = gs2.cv_results

In [50]:
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_biased,param_lr_all,param_n_epochs
0,1.086359,1.085331,1.084574,1.085421,0.000731,1,0.855883,0.854242,0.853917,0.85468,...,1,30.64818,0.463524,11.101873,1.299885,"{'n_factors': 1, 'biased': True, 'lr_all': 0.0...",1,True,0.005,20
1,1.086826,1.085677,1.085088,1.085864,0.000722,2,0.856256,0.854531,0.85441,0.855066,...,2,32.52039,1.041553,10.938498,0.78254,"{'n_factors': 5, 'biased': True, 'lr_all': 0.0...",5,True,0.005,20
2,1.08733,1.086413,1.0856,1.086447,0.000707,3,0.856862,0.855194,0.854764,0.855607,...,3,35.638312,0.684767,9.675809,0.923847,"{'n_factors': 10, 'biased': True, 'lr_all': 0....",10,True,0.005,20


#### Best - 1 factor (also tested higher values)

## Gridsearch 3 -- Tuning the number of training epochs (`n_epochs`) with GridSearchCV.

In [19]:
# Grid search 3: only `n_epochs` varies; every other parameter in the grid has
# a single value. Candidates are scored on RMSE and MAE with 3-fold CV.
param_grid = {'n_epochs': [10, 20, 50], 'n_factors': [1], 'biased': [True], 'lr_all': [0.005]}
gs3 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [20]:
gs3.fit(data)

In [21]:
results = gs3.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all
0,1.096411,1.095981,1.096711,1.096368,0.000299,2,0.869464,0.869122,0.869546,0.869377,...,2,32.706436,1.962642,11.108491,0.908315,"{'n_epochs': 10, 'n_factors': 50, 'biased': Tr...",10,50,True,0.005
1,1.091622,1.091048,1.091759,1.091476,0.000308,1,0.85999,0.859374,0.860053,0.859806,...,1,70.06996,8.85121,25.61329,14.807396,"{'n_epochs': 20, 'n_factors': 50, 'biased': Tr...",20,50,True,0.005
2,1.140297,1.13845,1.139194,1.139314,0.000759,3,0.885813,0.884742,0.884791,0.885116,...,3,196.621014,28.375468,20.432284,8.81673,"{'n_epochs': 50, 'n_factors': 50, 'biased': Tr...",50,50,True,0.005


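#### Best n_epochs = 20 (note: the results table above was recorded with n_factors = 50, not the n_factors = 1 set in the cell's parameter grid)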

## Gridsearch 4 -- Tuning the regularization term (`reg_all`) with GridSearchCV.

In [63]:
# Grid search 4: only `reg_all` varies; every other parameter in the grid has
# a single value. Candidates are scored on RMSE and MAE with 3-fold CV.
param_grid = {'n_epochs': [20], 'n_factors': [1], 'biased': [True], 
              'lr_all': [0.005], 'reg_all': [0.05, 0.06, 0.07]}
gs4 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


In [64]:
gs4.fit(data)

In [65]:
results = gs4.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all,param_reg_all
0,1.084832,1.08587,1.085679,1.08546,0.000451,3,0.855846,0.856065,0.855736,0.855882,...,32.002656,1.522894,11.47689,0.891822,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.05
1,1.084791,1.085866,1.085683,1.085446,0.00047,1,0.856182,0.856403,0.856082,0.856223,...,30.851868,0.170873,11.295406,1.275715,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.06
2,1.084823,1.085885,1.085647,1.085451,0.000455,2,0.856534,0.856766,0.856393,0.856565,...,29.938706,1.564058,10.551808,0.588312,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.07


In [66]:
results_df[['rank_test_rmse', 'rank_test_mae']]

Unnamed: 0,rank_test_rmse,rank_test_mae
0,3,1
1,1,2
2,2,3


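#### Best reg_all = 0.06 by RMSE (0.05 ranks best by MAE; the RMSE differences between the three values are smaller than the fold-to-fold standard deviation)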

In [19]:
# Comparison model: NMF with Surprise's default hyperparameters.
# `nmf` is not defined (and NMF is not imported) in any cell shown above, so
# create the estimator here to keep this cell runnable on a fresh kernel.
from surprise import NMF

nmf = NMF()
cross_validate(nmf, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm NMF.

------------
Fold 1
RMSE: 1.3794
MAE:  1.0783
------------
Fold 2
RMSE: 1.3801
MAE:  1.0787
------------
Fold 3
RMSE: 1.3803
MAE:  1.0797
------------
Fold 4
RMSE: 1.3790
MAE:  1.0774
------------
Fold 5
RMSE: 1.3789
MAE:  1.0773
------------
Fold 6
RMSE: 1.3810
MAE:  1.0798
------------
Fold 7
RMSE: 1.3789
MAE:  1.0777
------------
Fold 8
RMSE: 1.3796
MAE:  1.0786
------------
Fold 9
RMSE: 1.3799
MAE:  1.0785
------------
Fold 10
RMSE: 1.3790
MAE:  1.0779
------------
------------
Mean RMSE: 1.3796
Mean MAE : 1.0784
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.3794350253411565,
                             1.3800631753244952,
                             1.3803470449633781,
                             1.3790474183963373,
                             1.3788653317907658,
                             1.3809921876160387,
                             1.3788922356505084,
                             1.3796261813374224,
                             1.3799325331811927,
                             1.3790269179343493],
                            'mae': [1.0782924691081697,
                             1.078739278586975,
                             1.07971219559644,
                             1.077435373740628,
                             1.0773078568785115,
                             1.0797835618332265,
                             1.0776545359866114,
                             1.0785939513347529,
                             1.0785215794765843,
                        

# Best Model (SVD with the parameter values selected across the GridSearch iterations above)

In [11]:
# Final SVD configuration: each argument is the value reported as best by one
# of the grid searches (lr_all/biased, n_factors, n_epochs, reg_all).
final = SVD(n_epochs=20, n_factors=1, biased=True, 
              lr_all=0.005, reg_all=0.06)

In [12]:
data = Dataset.load_from_file(file_path, reader=reader)
trainset, testset = train_test_split(data, test_size=.2)

In [13]:
import time
start = time.time()

In [14]:
# Start the timer in the same cell as the fit: a start time taken in a
# separate cell also counts the idle time between running the two cells.
# perf_counter() is a monotonic clock intended for measuring durations.
start = time.perf_counter()
final.fit(trainset)
runtime = time.perf_counter() - start
print(runtime)

50.581398010253906


In [15]:
predictions = final.test(testset)

In [16]:
accuracy.rmse(predictions)

RMSE: 1.0793


1.0792656933837443

In [19]:
accuracy.mae(predictions)

MAE:  0.8507


0.850677428991642

In [None]:
dump.dump('model', algo=final, predictions=predictions, verbose=1)

In [21]:
preds, model = dump.load('model')

# Getting Predictions

number_of_businesses = 73100


number_of_users = 81416

In [36]:
model.predict('30678', '51871')

Prediction(uid='30678', iid='51871', r_ui=None, est=3.4671454350928284, details={'was_impossible': False})

### Mapping restaurant information to item IDs

In [37]:
import json

# Load the business records (a JSON array of dicts, each carrying an 'id'
# field used by get_info). The encoding is given explicitly so the result does
# not depend on the platform's default locale encoding.
with open('indexed_businesses.json', encoding='utf-8') as f:
    businesses = json.load(f)

In [38]:
def find(f, seq):
    """Return the first element of `seq` for which `f(element)` is truthy.

    Elements are tested in iteration order and the scan stops at the first
    match. Returns None when nothing matches (including for an empty `seq`).
    """
    return next((candidate for candidate in seq if f(candidate)), None)

In [39]:
def get_info(iid):
    """Return the business record whose 'id' field equals `iid`.

    Scans the module-level `businesses` list in order and returns the first
    matching record, or None if no record has that id.
    """
    for business in businesses:
        if iid == business['id']:
            return business
    return None

In [43]:
get_info(101)

{'business_id': 'qa6aqyXmFsOMLt___P6wLQ',
 'name': 'Forest Hills Coffee Co',
 'address': '2201 Ardmore Blvd',
 'city': 'Pittsburgh',
 'state': 'PA',
 'postal_code': '15221',
 'latitude': 40.418,
 'longitude': -79.8484835,
 'stars': 3.0,
 'review_count': 6,
 'is_open': 0,
 'attributes': {'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
  'OutdoorSeating': 'True',
  'WiFi': "u'free'",
  'BusinessAcceptsCreditCards': 'True',
  'RestaurantsPriceRange2': '2'},
 'categories': 'Food, Coffee & Tea, Restaurants',
 'hours': None,
 'id': 101}

In [28]:
def get_n_preds(uid, n, n_items=73100):
    """Return the `n` businesses with the highest predicted rating for a user.

    Args:
        uid: User id; converted to a string before being passed to the model.
        n: Number of top predictions to return.
        n_items: Highest item id to score. Item ids 1..n_items (inclusive) are
            each scored with the module-level `model`. Defaults to 73100, the
            catalogue size previously hard-coded in the loop bound.

    Returns:
        A list of `(business name, estimated rating)` tuples, highest
        estimate first. Names are resolved through `get_info`.
    """
    user_id = str(uid)  # loop-invariant: convert once instead of per item
    ratings = []
    for item_id in range(1, n_items + 1):
        pred = model.predict(user_id, str(item_id))
        ratings.append((int(pred.iid), pred.est))
    top_n = sorted(ratings, reverse=True, key=lambda x: x[1])[:n]
    return [(get_info(iid)['name'], est) for iid, est in top_n]

In [41]:
get_n_preds(44, 10)

[('China Express Chinese Restaurant', 4.524720520471293),
 ('Mariscos Y Barbacoa La Bella', 4.453280894829094),
 ('Les Canailles', 4.442511892228422),
 ('Amados Mexican Food', 4.408152132295895),
 ("Papa Joe's Subs & Pasta", 4.404376142513352),
 ('Melt Grilled Cheese', 4.392091928011274),
 ('Market District Express', 4.3785062769619225),
 ('Pizza Nova', 4.374699943702472),
 ('Meltwich Food Co', 4.37140705216406),
 ("The Dimkin's Crepes", 4.369352401880189)]

In [30]:
import csv

# Read every review row into memory as a list of string fields.
# The csv reader is deliberately NOT bound to the name `reader`: that name
# holds the Surprise Reader passed to Dataset.load_from_file elsewhere in this
# notebook, and overwriting it breaks those cells when they are re-run.
# newline='' is the mode the csv module documents for files given to
# csv.reader.
with open('review_data.csv', 'r', newline='') as f:
    reviews = list(csv.reader(f))

In [31]:
def get_reviewed_restaurants(uid, desc=True):
    """Return (restaurant name, rating) pairs for every review by one user.

    Args:
        uid: User id; compared as a string against the first field of each
            row in the module-level `reviews` list.
        desc: If truthy (the default), the highest ratings come first;
            otherwise the lowest come first.

    Returns:
        A list of `(name, rating)` tuples. `rating` is returned exactly as it
        is stored in `reviews`; only the sort key is converted to a number.
        Names are resolved through `get_info` using the integer value of the
        row's second field.
    """
    target = str(uid)  # convert once rather than once per row
    pairs = [
        (get_info(int(row[1]))['name'], row[2])
        for row in reviews
        if row[0] == target
    ]
    # Sort on the numeric value of the rating: comparing the stored text
    # directly is lexicographic and would put e.g. '10.0' before '2.0'.
    return sorted(pairs, key=lambda pair: float(pair[1]), reverse=bool(desc))

In [44]:
get_reviewed_restaurants(4111)

[('Fibo', '5.0'),
 ("Papa John's Pizza", '5.0'),
 ('Olive Garden Italian Restaurant', '5.0'),
 ('Prince BBQ', '5.0'),
 ('Wingstop', '5.0'),
 ('7-Eleven', '5.0'),
 ('Solstice Tavern', '5.0'),
 ('Sariwon Korean Bbq Restaurant', '5.0'),
 ('Darling Donuts', '5.0'),
 ('The Pig and Cow', '5.0'),
 ('Bootleggers Modern American Smokehouse', '5.0'),
 ('Circle K', '5.0'),
 ('Circus Buffet', '5.0'),
 ('Java House', '5.0'),
 ("Camarone's Cantina", '5.0'),
 ('Formocha', '4.0'),
 ("McDonald's", '4.0'),
 ('Los 7 Compas', '4.0'),
 ('CSI Coffee Pub', '3.0'),
 ('Waffle House', '3.0'),
 ("McDuffy's Sports Grill", '3.0'),
 ('LCBO', '3.0'),
 ('Little Pastry Chefs', '2.0'),
 ('Biryani Indian Restaurant', '2.0'),
 ('iBurger', '1.0')]