# Recommender systems: a celebration of collaborative filtering and content filtering

## Result summary

In [1]:
import sys
import traceback
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from scipy import sparse

import pickle

from IPython.display import display, HTML, Markdown

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
# Notebook-wide display configuration.
# Let pandas print very wide frames on one line (up to 15000 characters)
# and show up to 100 columns before truncating.
pd.set_option('display.width', 15000)
pd.set_option('display.max_columns', 100)
# Seaborn look: "whitegrid" theme with the axes grid switched off, and the
# "poster" context preset for element scaling.
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set_context('poster')
%matplotlib inline

from surprise import Dataset, Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, \
SlopeOne, CoClustering, KNNBasic, KNNWithMeans, KNNBaseline

from recommender import plot_cm, get_results, show_results, IO, \
show_summaries, get_base_predictions, get_multi_base_predictions
from recommender import ModeClassifier, BaselineMean, BaselineRegression, ALS1, ALS2, RS_surprise, RS_ensemble

In [2]:
# Datasets to summarise; each has its own directory under data/.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

# One entry per result family shown for every city:
#   (results sub-directory, Markdown heading, optional Markdown footnote).
# The run of spaces inside the footnotes is kept exactly as previously
# emitted (Markdown collapses it when rendered), so the output is unchanged.
sections = [
    ('results',
     '**Collaborative filtering**',
     '<sup>(* shows the algorithms we implemented by wrapping around '
     '    methods in scikit-surprise python package)</sup>'),
    ('results05',
     '**Content filtering**',
     None),
    ('results06',
     '**Ensemble**',
     '<sup>(Ensemble1 represents the ensemble of collaborative filtering models; '
     '    Ensemble2 represents the ensemble of collaborative filtering and '
     'content filtering models)</sup>'),
]

for city in cities:
    data_dir = 'data/{}/'.format(city)

    # sizes is indexed as (reviews, restaurants, users) in the heading below.
    sizes = IO(data_dir + 'sizes.pkl').read_pickle()
    display(Markdown('## {} <sup>({} reviews, {} restaurants, {} users)</sup>'
                     .format(city, sizes[0], sizes[1], sizes[2])))

    for subdir, heading, footnote in sections:
        # Every result family stores the same three pickles in its own folder.
        results_dir = data_dir + subdir + '/'
        model_names = IO(results_dir + 'model_names.pkl').read_pickle()
        results = IO(results_dir + 'results.pkl').read_pickle()
        is_successful = IO(results_dir + 'is_successful.pkl').read_pickle()

        display(Markdown(heading))
        show_summaries(model_names, results, is_successful)
        if footnote is not None:
            display(Markdown(footnote))

    # Whitespace-only Markdown cell used as a visual spacer between cities.
    display(Markdown('\n    \n    '))

## Champaign <sup>(20571 reviews, 878 restaurants, 8451 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.9995,2.0258,-0.9501,-0.95
Normal predictor*,0.087,1.8584,1.8823,-0.6846,-0.6835
Baseline (mean),0.019,0.9485,1.4648,0.5612,-0.0195
Baseline (regression),0.046,0.8532,1.3306,0.6449,0.1587
Baseline (ALS)*,0.055,1.1981,1.32,0.2998,0.1721
KNN (basic)*,0.9481,0.4328,1.4642,0.9086,-0.0187
KNN (with means)*,1.0791,0.5898,1.531,0.8303,-0.1138
KNN (baseline)*,1.2141,0.4175,1.3718,0.915,0.1058
SVD-ALS1,23.8384,0.6702,1.3067,0.7809,0.1886
SVD-ALS2,25.5235,0.6712,1.3104,0.7803,0.1841


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,0.069,1.0773,1.0971,0.4339,0.428
Random forest,1.0951,1.0262,1.0862,0.4864,0.4394


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.8115,1.3059,0.6788,0.1897
Ensemble1 (Ridge regression),0.009,1.1826,1.3031,0.3178,0.1931
Ensemble1 (random forest),0.223,0.992,1.3017,0.52,0.1949
Ensemble2 (weighted average),0.0,0.8826,1.1566,0.62,0.3643
Ensemble2 (Ridge regression),0.004,1.2147,1.0813,0.2803,0.4444
Ensemble2 (random forest),0.272,1.0645,1.085,0.4473,0.4406


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    

## Cleveland <sup>(75932 reviews, 2500 restaurants, 30131 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.8152,1.8262,-0.8226,-0.8371
Normal predictor*,0.205,1.7514,1.7529,-0.6968,-0.6926
Baseline (mean),0.053,0.8908,1.3417,0.561,0.0084
Baseline (regression),0.14,0.8088,1.2235,0.6381,0.1753
Baseline (ALS)*,0.263,1.1171,1.217,0.3097,0.1841
KNN (basic)*,14.1758,0.3952,1.3484,0.9136,-0.0016
KNN (with means)*,12.5127,0.56,1.402,0.8265,-0.0829
KNN (baseline)*,12.6187,0.3837,1.2612,0.9186,0.1237
SVD-ALS1,79.7376,0.568,1.2103,0.8215,0.1931
SVD-ALS2,83.2458,0.5696,1.213,0.8206,0.1895


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,0.286,1.0195,1.0313,0.4251,0.4141
Random forest,4.7943,0.9929,1.0155,0.4546,0.432


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.7191,1.2072,0.714,0.1971
Ensemble1 (Ridge regression),0.005,0.8871,1.2043,0.5646,0.2011
Ensemble1 (random forest),0.721,0.8811,1.2063,0.5706,0.1984
Ensemble2 (weighted average),0.0,0.8028,1.0899,0.6435,0.3456
Ensemble2 (Ridge regression),0.006,1.0329,1.0143,0.4099,0.4333
Ensemble2 (random forest),0.9471,0.9987,1.0181,0.4483,0.429


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    

## Pittsburgh <sup>(143682 reviews, 4745 restaurants, 46179 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.8026,1.7988,-0.8466,-0.8393
Normal predictor*,0.468,1.7363,1.7294,-0.7134,-0.7
Baseline (mean),0.11,0.9052,1.3198,0.5343,0.0099
Baseline (regression),0.763,0.8444,1.2042,0.5948,0.1758
Baseline (ALS)*,0.501,1.1119,1.202,0.2974,0.1788
SVD-ALS1,153.0878,0.5579,1.1959,0.8231,0.1871
SVD-ALS2,173.3719,0.5597,1.2014,0.822,0.1796
SVD-SGD*,7.6164,0.8267,1.2046,0.6116,0.1752
SVD++-SGD*,43.6755,0.8738,1.2025,0.5661,0.178
NMF-SGD*,8.6365,0.3666,1.3761,0.9236,-0.0765


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,0.55,1.0158,1.0062,0.4135,0.4245
Random forest,10.1126,0.9938,0.9896,0.4388,0.4434


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.7684,1.19,0.6645,0.1951
Ensemble1 (Ridge regression),0.013,0.8872,1.1872,0.5527,0.1988
Ensemble1 (random forest),1.3041,0.8644,1.1891,0.5754,0.1962
Ensemble2 (weighted average),0.0,0.841,1.0594,0.5981,0.362
Ensemble2 (Ridge regression),0.015,0.9928,0.9881,0.4399,0.445
Ensemble2 (random forest),1.7021,0.9995,0.9929,0.4323,0.4396


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    

## Toronto <sup>(331407 reviews, 12118 restaurants, 77506 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.883,1.8801,-1.1173,-1.1219
Normal predictor*,1.0371,1.7052,1.7137,-0.7364,-0.7629
Baseline (mean),0.261,0.9293,1.2911,0.4843,-0.0006
Baseline (regression),1.1211,0.878,1.1828,0.5397,0.1602
Baseline (ALS)*,1.7001,1.0916,1.173,0.2884,0.174
SVD-ALS1,366.1459,0.5554,1.1745,0.8158,0.1719
SVD-ALS2,370.8472,0.557,1.1795,0.8147,0.1649
SVD-SGD*,17.735,0.8222,1.1772,0.5963,0.1681
SVD++-SGD*,122.057,0.873,1.1763,0.5449,0.1694
NMF-SGD*,20.0961,0.4094,1.3369,0.8999,-0.0729


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,1.1401,1.0049,1.0035,0.397,0.3955
Random forest,27.0035,0.9891,0.9909,0.4158,0.4106


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.7721,1.1648,0.644,0.1856
Ensemble1 (Ridge regression),0.029,0.9701,1.1617,0.438,0.1898
Ensemble1 (random forest),3.2702,0.9747,1.165,0.4327,0.1853
Ensemble2 (weighted average),0.0,0.8419,1.0529,0.5767,0.3346
Ensemble2 (Ridge regression),0.036,1.0278,0.9876,0.3692,0.4145
Ensemble2 (random forest),4.1732,0.9946,0.9957,0.4093,0.4048


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    

## Las_Vegas <sup>(1280896 reviews, 20434 restaurants, 429363 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.906,1.9073,-0.7549,-0.7578
Normal predictor*,5.2523,1.8586,1.8615,-0.6687,-0.6744
Baseline (mean),1.1551,0.999,1.4148,0.5179,0.0329
Baseline (regression),8.1045,0.9286,1.2855,0.5835,0.2015
Baseline (ALS)*,7.3574,1.188,1.2696,0.3182,0.2211
SVD-ALS1,1350.0652,0.421,1.2787,0.9144,0.2099
SVD-ALS2,1345.87,0.4218,1.2864,0.914,0.2004
SVD-SGD*,69.998,0.7758,1.2827,0.7093,0.205
SVD++-SGD*,336.7173,0.8046,1.302,0.6873,0.1809
NMF-SGD*,87.552,0.4178,1.4916,0.9157,-0.075


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,5.0093,1.1216,1.1226,0.3923,0.3911
Random forest,154.2278,1.1008,1.1029,0.4146,0.4122


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.7228,1.264,0.7476,0.228
Ensemble1 (Ridge regression),0.119,0.9909,1.2605,0.5257,0.2323
Ensemble1 (random forest),13.3968,1.0422,1.2631,0.4753,0.229
Ensemble2 (weighted average),0.0,0.8461,1.1686,0.6542,0.3402
Ensemble2 (Ridge regression),0.147,1.1212,1.1013,0.3928,0.4139
Ensemble2 (random forest),16.1289,1.1063,1.1084,0.4088,0.4064


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    

## Full <sup>(4166778 reviews, 131025 restaurants, 1117891 users)</sup>

**Collaborative filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Mode estimator,0.0,1.8974,1.8985,-0.7803,-0.7799
Normal predictor*,16.986,1.84,1.8404,-0.6741,-0.6725
Baseline (mean),4.2512,1.0178,1.4063,0.4878,0.0234
Baseline (regression),32.7739,0.938,1.2755,0.5649,0.1966
Baseline (ALS)*,25.2794,1.1754,1.2659,0.3169,0.2086
SVD-ALS1,4324.7944,0.5255,1.2685,0.8634,0.2054
SVD-ALS2,4361.1094,0.5267,1.2758,0.8628,0.1963
SVD-SGD*,248.9762,0.8312,1.2721,0.6584,0.2008
SVD++-SGD*,1463.7867,0.8713,1.2784,0.6246,0.193
NMF-SGD*,330.5729,0.4277,1.4656,0.9095,-0.0607


<sup>(* shows the algorithms we implemented by wrapping around     methods in scikit-surprise python package)</sup>

**Content filtering**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ridge regression,17.161,1.0857,1.0869,0.4171,0.4167
Random forest,663.2849,1.0639,1.0653,0.4403,0.4396


**Ensemble**

model,fitting time (s),train RMSE,test RMSE,train $R^2$,test $R^2$
Ensemble1 (weighted average),0.0,0.7803,1.2557,0.699,0.2214
Ensemble1 (Ridge regression),0.424,1.0004,1.2523,0.5051,0.2256
Ensemble1 (random forest),56.5742,1.0303,1.2581,0.4751,0.2184
Ensemble2 (weighted average),0.0,0.8688,1.1421,0.6268,0.3558
Ensemble2 (Ridge regression),0.545,1.1033,1.0617,0.398,0.4434
Ensemble2 (random forest),68.1969,1.0687,1.0702,0.4352,0.4344


<sup>(Ensemble1 represents the ensemble of collaborative filtering models;     Ensemble2 represents the ensemble of collaborative filtering and content filtering models)</sup>


    
    