In [1]:
#!pip install neo4j
#! pip install surprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import KNNBaseline
from surprise import SVDpp
from surprise import CoClustering
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import GridSearchCV
from neo4j import GraphDatabase
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import numpy as np


In [2]:
uri = "bolt://3.220.233.169:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "i-0e23d19f0d8795714"))

In [3]:
with driver.session() as session:
    result = session.run('\
    MATCH (u:User)-->(r:Review)-->(b:Business)\
    WHERE (b)-[:IN_CITY]->()-[:IN_STATE]->(:State {name:"PA"})\
    RETURN u.id, r.stars, b.id')

In [4]:
PA_reviews=result.values()

In [5]:
PA_reviewsdf=pd.DataFrame(PA_reviews)

In [6]:
PA_reviewsdf.head()

Unnamed: 0,0,1,2
0,HvDBT2whHCalPX7eWgmrqw,4.0,1RHY4K3BD22FK7Cfftn8Mg
1,HvDBT2whHCalPX7eWgmrqw,4.0,1RHY4K3BD22FK7Cfftn8Mg
2,rFeUNmWzV2sERSTc7-5Ujw,3.0,1RHY4K3BD22FK7Cfftn8Mg
3,V64IDJwHsxWqXxjSSBZZjg,5.0,1RHY4K3BD22FK7Cfftn8Mg
4,V-vNxP4p3lgesgQFepqL5g,5.0,1RHY4K3BD22FK7Cfftn8Mg


In [7]:
PA_reviewsdf.shape

(292321, 3)

In [8]:
reader = Reader(rating_scale=(1, 5))

In [9]:
PA_reviewsdf.columns=['userID', 'rating','itemID']

In [23]:
PA_reviewsdf.to_pickle('PA_reviews')

In [10]:
data = Dataset.load_from_df(PA_reviewsdf[['userID', 'itemID', 'rating']], reader)

In [147]:
sim_options = {'name': 'pearson_baseline',
               'user_based': False,
               'min_support' : 8
              }

bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_u': 5,
               'reg_i': 1
               }


algo=KNNBaseline(sim_options=sim_options, bsl_options=bsl_options, k=20, min_k=10)

In [148]:
cross_validate(algo, data, cv=3, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2431  1.2394  1.2448  1.2424  0.0022  
MAE (testset)     0.9859  0.9832  0.9876  0.9856  0.0018  
Fit time          11.02   9.40    6.27    8.90    1.97    
Test time         4.92    3.82    2.96    3.90    0.80    


{'test_rmse': array([1.24313092, 1.23940257, 1.24478002]),
 'test_mae': array([0.98590472, 0.9832448 , 0.98764356]),
 'fit_time': (11.021914005279541, 9.395414352416992, 6.273874282836914),
 'test_time': (4.916727304458618, 3.823942184448242, 2.9577760696411133)}

In [164]:
param_grid = {'bsl_options': {'method': ['als'],
                              'learning_rate': [.005],
                              'reg': [.2],
                              'reg_i': [1,3,5],
                              'reg_u': [1,3,5],
                              'n_epochs':[12]
                             },
              
              'k': [20,25,30],
              'min_k':[10,15],
              'sim_options': {'name': ['pearson_baseline'],
                              'min_support': [5, 10],
                              'user_based': [False]}
              }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

In [165]:
gs.fit(data)
results_df = pd.DataFrame.from_dict(gs.cv_results)

In [171]:
pd.set_option('display.max_colwidth', 100)
sortedresults=results_df.sort_values('rank_test_rmse')

In [172]:
sortedresults.head(10)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_bsl_options,param_k,param_min_k,param_sim_options
71,1.23964,1.241723,1.240295,1.240552,0.00087,1,0.994761,0.9958,0.994514,0.995025,...,66,7.039587,0.431276,3.783595,1.187244,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",30,15,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
67,1.239643,1.241723,1.240294,1.240554,0.000869,2,0.994764,0.995801,0.994512,0.995026,...,67,7.451517,0.2031,2.938468,0.063423,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",25,15,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
63,1.239653,1.241732,1.240297,1.24056,0.000869,3,0.994767,0.995811,0.994518,0.995032,...,68,9.10125,0.135675,4.710221,0.015617,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",20,15,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
70,1.239773,1.241662,1.240288,1.240574,0.000797,4,0.994392,0.99547,0.994123,0.994662,...,61,7.844674,0.580605,3.138635,0.025655,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",30,15,"{'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}"
66,1.239789,1.241679,1.240311,1.240593,0.000797,5,0.994401,0.99549,0.994144,0.994678,...,62,7.333682,0.566861,3.39707,0.468243,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",25,15,"{'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}"
62,1.239826,1.24173,1.240361,1.240639,0.000802,6,0.994432,0.995532,0.994194,0.99472,...,63,9.087944,0.417236,4.623241,0.313946,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",20,15,"{'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}"
69,1.239633,1.241963,1.240538,1.240712,0.000959,7,0.994551,0.995957,0.994611,0.99504,...,69,9.332163,0.124512,4.526679,0.495131,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",30,10,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
65,1.239637,1.241964,1.240537,1.240713,0.000958,8,0.994555,0.995958,0.994609,0.99504,...,70,8.606471,0.632442,4.313073,1.004208,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",25,10,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
61,1.239646,1.241973,1.24054,1.24072,0.000958,9,0.994558,0.995968,0.994615,0.995047,...,71,9.441868,0.31369,4.288934,0.52285,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",20,10,"{'name': 'pearson_baseline', 'min_support': 10, 'user_based': False}"
68,1.240271,1.242572,1.241187,1.241343,0.000946,10,0.994516,0.995936,0.994516,0.994989,...,64,8.72406,0.788559,4.896676,0.319415,"{'bsl_options': {'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n...","{'method': 'als', 'learning_rate': 0.005, 'reg': 0.2, 'reg_i': 3, 'reg_u': 5, 'n_epochs': 12}",30,10,"{'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}"


In [180]:
sortedresults.to_pickle("SVDpp_smallgrid_pa")

In [62]:
cross_validate(algo, data, cv=3, verbose=True, n_jobs=1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4509  1.4407  1.4447  1.4454  0.0042  
MAE (testset)     1.1601  1.1519  1.1536  1.1552  0.0035  
Fit time          3.85    3.83    3.87    3.85    0.02    
Test time         1.43    1.42    1.41    1.42    0.01    


{'test_rmse': array([1.45087577, 1.44067006, 1.44470049]),
 'test_mae': array([1.16008751, 1.15186291, 1.15362173]),
 'fit_time': (3.853111743927002, 3.8317019939422607, 3.874323606491089),
 'test_time': (1.4318106174468994, 1.4176058769226074, 1.4055752754211426)}

In [67]:
algo2=SVD()

In [68]:
cross_validate(algo2, data, cv=3, verbose=True, n_jobs=1)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2743  1.2777  1.2788  1.2769  0.0019  
MAE (testset)     1.0368  1.0406  1.0402  1.0392  0.0017  
Fit time          3.48    3.51    3.52    3.51    0.02    
Test time         0.22    0.22    0.23    0.22    0.00    


{'test_rmse': array([1.27427863, 1.27765008, 1.27884736]),
 'test_mae': array([1.03682902, 1.0406015 , 1.04024859]),
 'fit_time': (3.4833498001098633, 3.510716199874878, 3.522801637649536),
 'test_time': (0.22365760803222656, 0.2214968204498291, 0.2267014980316162)}

In [70]:
algo3=CoClustering()
cross_validate(algo3, data, cv=3, verbose=True, n_jobs=1)

Evaluating RMSE, MAE of algorithm CoClustering on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4013  1.4103  1.4071  1.4063  0.0037  
MAE (testset)     1.0725  1.0814  1.0769  1.0769  0.0036  
Fit time          2.92    2.95    2.93    2.93    0.01    
Test time         0.16    0.16    0.16    0.16    0.00    


{'test_rmse': array([1.40134973, 1.41032607, 1.40712141]),
 'test_mae': array([1.07253277, 1.0814266 , 1.07687604]),
 'fit_time': (2.923271656036377, 2.9544529914855957, 2.9256558418273926),
 'test_time': (0.15801429748535156, 0.15717101097106934, 0.15750336647033691)}

In [18]:
param_grid = {'n_epochs': [15], 
              'lr_all': [0.01, 0.03],
              'reg_all': [0.3,0.5],
              'n_factors' : [20,30,40],
              'init_mean' : [0],
              'init_std_dev' : [.1]
              }

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

In [20]:
gs.fit(data)
results_df = pd.DataFrame.from_dict(gs.cv_results)

In [21]:
pd.set_option('display.max_colwidth', 100)
sortedresults=results_df.sort_values('rank_test_rmse')
sortedresults.head(10)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all,param_n_factors,param_init_mean,param_init_std_dev
0,1.25287,1.245297,1.250435,1.249534,0.003156,1,1.019006,1.012364,1.016025,1.015799,...,5.965113,8.260731,0.910478,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.3, 'n_factors': 20, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.3,20,0,0.1
1,1.253179,1.24592,1.251131,1.250077,0.003056,2,1.019406,1.013103,1.016851,1.016453,...,1.934857,7.890102,0.474184,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.3, 'n_factors': 30, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.3,30,0,0.1
2,1.253351,1.246118,1.251406,1.250292,0.003056,3,1.019795,1.013595,1.017326,1.016906,...,2.601988,8.209969,0.442696,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.3, 'n_factors': 40, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.3,40,0,0.1
6,1.256573,1.249344,1.253302,1.253073,0.002956,4,1.008815,1.002967,1.004747,1.00551,...,1.173231,7.983595,0.48379,"{'n_epochs': 15, 'lr_all': 0.03, 'reg_all': 0.3, 'n_factors': 20, 'init_mean': 0, 'init_std_dev'...",15,0.03,0.3,20,0,0.1
7,1.256567,1.249465,1.25342,1.253151,0.002906,5,1.009575,1.003926,1.005448,1.006316,...,1.368912,8.323474,0.908997,"{'n_epochs': 15, 'lr_all': 0.03, 'reg_all': 0.3, 'n_factors': 30, 'init_mean': 0, 'init_std_dev'...",15,0.03,0.3,30,0,0.1
8,1.256536,1.249384,1.253781,1.253234,0.002945,6,1.010206,1.004481,1.006515,1.007067,...,0.765627,7.783102,0.081395,"{'n_epochs': 15, 'lr_all': 0.03, 'reg_all': 0.3, 'n_factors': 40, 'init_mean': 0, 'init_std_dev'...",15,0.03,0.3,40,0,0.1
3,1.259147,1.25167,1.256626,1.255814,0.003106,7,1.029922,1.023395,1.026634,1.02665,...,0.59375,7.529043,0.059967,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.5, 'n_factors': 20, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.5,20,0,0.1
4,1.259552,1.251896,1.256842,1.256097,0.00317,8,1.030374,1.023686,1.026937,1.026999,...,1.063979,8.192188,0.786928,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.5, 'n_factors': 30, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.5,30,0,0.1
5,1.259598,1.252117,1.257287,1.256334,0.003128,9,1.030453,1.024018,1.027474,1.027315,...,0.970805,8.251653,0.736238,"{'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.5, 'n_factors': 40, 'init_mean': 0, 'init_std_dev'...",15,0.01,0.5,40,0,0.1
9,1.261603,1.254098,1.258252,1.257984,0.00307,10,1.021561,1.015283,1.017133,1.017992,...,2.385013,7.510858,0.118098,"{'n_epochs': 15, 'lr_all': 0.03, 'reg_all': 0.5, 'n_factors': 20, 'init_mean': 0, 'init_std_dev'...",15,0.03,0.5,20,0,0.1


In [22]:
sortedresults.to_pickle("SVDpp_grid_pa")