In [1]:
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from scipy import sparse

import pickle

from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
# Notebook-wide display configuration.
# pandas: allow very wide console output and show up to 100 columns.
pd.set_option(
    'display.width', 15000,
    'display.max_columns', 100,
)
# seaborn: "whitegrid" theme with the grid lines switched off, scaled for
# the 'poster' context.
sns.set_style("whitegrid", rc={'axes.grid': False})
sns.set_context(context='poster')
%matplotlib inline

In [2]:
from surprise import Dataset, Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, \
SlopeOne, CoClustering, KNNBasic, KNNWithMeans, KNNBaseline

In [3]:
from recommender import plot_cm, print_results, IO
from recommender import ModeClassifier, BaselineMean, BaselineRegression, ALS1, ALS2, RS_surprise

In [4]:
%%time
# Load the Cleveland data set.
# fig_dir is where figures produced later in the notebook are written;
# data_dir holds the pickled input frames.
fig_dir = 'figs/modeling/Cleveland/'
data_dir = 'data/Cleveland/'

# NOTE: unpickling executes arbitrary code, so these files must come from a
# trusted source.
# dfb / dfr / dfu are the business, review and user tables; datar is the
# (user_id, business_id, stars) rating table used for modelling below.
dfb, dfr, dfu, datar = (
    pd.read_pickle(data_dir + pkl_name)
    for pkl_name in ('business.pkl', 'review.pkl', 'user.pkl', 'data_review.pkl')
)

Wall time: 186 ms


In [5]:
# Sanity-check the loaded tables: print one shape per line for the business,
# user and rating frames, then preview the first rows of the rating table.
print(dfb.shape, dfu.shape, datar.shape, sep='\n')
datar.head()

(2500, 93)
(30131, 22)
(75932, 3)


Unnamed: 0,user_id,business_id,stars
0,172,12,5
1,173,12,4
2,174,12,4
3,175,12,1
4,176,12,1


In [6]:
# Split the ratings into a training set and a test set.
# Features are the (user_id, business_id) pairs, the target is the star
# rating; 40% of the rows are held out and the split is seeded so that it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    datar[['user_id', 'business_id']].values,
    datar['stars'].values,
    test_size=0.4,
    random_state=0,
)

In [7]:
# Candidate recommenders. `models`, `model_names` and `fignames` are parallel
# lists: position k in each refers to the same model. Names ending in '*'
# mark the models that are built by wrapping a `surprise` algorithm in
# RS_surprise.
models = [
    ModeClassifier(),
    RS_surprise(NormalPredictor()),
    BaselineMean(),
    BaselineRegression(),
    RS_surprise(BaselineOnly()),
    RS_surprise(KNNBasic()),
    RS_surprise(KNNWithMeans()),
    RS_surprise(KNNBaseline()),
    ALS1(),
    ALS2(),
    RS_surprise(SVD()),
    RS_surprise(SVDpp()),
    RS_surprise(NMF()),
    RS_surprise(SlopeOne()),
    RS_surprise(CoClustering()),
]
model_names = [
    'Mode estimator',
    'Normal predictor*',
    'Baseline (mean)',
    'Baseline (regression)',
    'Baseline (ALS)*',
    'KNN (basic)*',
    'KNN (with means)*',
    'KNN (baseline)*',
    'SVD-ALS1',
    'SVD-ALS2',
    'SVD-SGD*',
    'SVD++-SGD*',
    'NMF-SGD*',
    'Slope one*',
    'Co-clustering*',
]
# One confusion-matrix image path per model, numbered by list position.
fignames = [fig_dir + str(idx) + 'cm.png' for idx, _ in enumerate(models)]

In [None]:
%%time

# Fit every candidate model on the training split.
#
# `estimators` collects only the models that were fitted successfully, in
# fitting order; `is_successful` holds one True/False flag per entry of
# `models`. Both lists are checkpointed to disk after every model so that an
# interrupted run leaves the two pickle files consistent with each other.
estimators = []
is_successful = []

for i, model in enumerate(models):
    name = model_names[i]
    try:
        model.fit(X_train, y_train)
        # Pickle a candidate list rather than appending first: if the write
        # fails, the model must not end up in `estimators` while being
        # flagged as failed.
        IO(data_dir + 'results/estimators.pkl').to_pickle(estimators + [model])
    except Exception as err:
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the loop, and report why the model failed.
        print(name + ' failed.')
        print('    ' + type(err).__name__ + ': ' + str(err))
        is_successful.append(False)
    else:
        estimators.append(model)
        print(name + ' successful.')
        is_successful.append(True)
    # Keep the success flags on disk in step with the estimators checkpoint.
    IO(data_dir + 'results/is_successful.pkl').to_pickle(is_successful)

Mode estimator successful.
Normal predictor* successful.
Baseline (mean) successful.
Baseline (regression) successful.
Estimating biases using als...
Baseline (ALS)* successful.
Computing the msd similarity matrix...
Done computing similarity matrix.
KNN (basic)* successful.
Computing the msd similarity matrix...
Done computing similarity matrix.
KNN (with means)* successful.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
KNN (baseline)* successful.
SVD-ALS1 successful.


In [None]:
%%time

# Reload the fitted models and their success flags, then report train/test
# results (and save a figure) for every model that was fitted.
estimators = IO(data_dir + 'results/estimators.pkl').read_pickle()
is_successful = IO(data_dir + 'results/is_successful.pkl').read_pickle()

# `estimators` contains only the successfully fitted models, in order, whereas
# `is_successful`, `model_names` and `fignames` have one entry per candidate
# model. Map each stored estimator back to its original position so that it
# is reported under its own name and figure file even when an earlier model
# failed to fit.
fitted_positions = [pos for pos, ok in enumerate(is_successful) if ok]

for estimator, pos in zip(estimators, fitted_positions):
    print_results(estimator, model_names[pos],
                  X_train=X_train, y_train=y_train,
                  X_test=X_test, y_test=y_test,
                  figname=fignames[pos])
        