In [1]:
import sys
import traceback
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LogisticRegression, LogisticRegressionCV

from scipy import sparse

import pickle

from IPython.display import display, HTML, Markdown

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
# Notebook-wide display configuration.
# pandas: allow very wide text output and render up to 100 columns per frame.
pd.set_option('display.width', 15000)
pd.set_option('display.max_columns', 100)
# seaborn: start from the "whitegrid" style but override it so no axes grid is drawn,
# then apply the 'poster' plotting context.
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set_context('poster')
%matplotlib inline

from surprise import Dataset, Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, \
SlopeOne, CoClustering, KNNBasic, KNNWithMeans, KNNBaseline

from recommender import plot_cm, get_results, show_results, IO, show_summaries, get_X
from recommender import ModeClassifier, BaselineMean, BaselineRegression, ALS1, ALS2, RS_surprise, RS_sklearn

In [2]:
%%time

# Preprocessing pass: for every city, run each X split through get_X together
# with the business and user tables, then cache the six-element split list as
# '05_data_split.pkl' next to the original 'data_split.pkl'.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    print(city + '...')
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)

    # Tables handed to get_X alongside each X split.
    dfb = pd.read_pickle(data_dir + 'business.pkl')
    dfu = pd.read_pickle(data_dir + 'user.pkl')

    X_train, y_train, X_test, y_test, X_cv, y_cv = IO(data_dir + 'data_split.pkl').read_pickle()

    # Same three get_X calls, in the same order (train, test, cv); the targets
    # are passed through unchanged.
    X_train, X_test, X_cv = [get_X(raw_X, dfb, dfu) for raw_X in (X_train, X_test, X_cv)]

    # Written under a new file name, so 'data_split.pkl' is never overwritten.
    IO(data_dir + '05_data_split.pkl').to_pickle(
        [X_train, y_train, X_test, y_test, X_cv, y_cv])

    # Unbind the lookup tables before the next city is loaded.
    del dfb, dfu
    print(city + ' preprocessing successful.')
    print()
    

Champaign...
Champaign preprocessing successful.

Cleveland...
Cleveland preprocessing successful.

Pittsburgh...
Pittsburgh preprocessing successful.

Toronto...
Toronto preprocessing successful.

Las_Vegas...
Las_Vegas preprocessing successful.

Full...
Full preprocessing successful.

Wall time: 4min 26s


In [2]:
%%time

# Model-definition pass: for every city, pickle the (unfitted) estimators, their
# display names and the per-model result paths under '<data_dir>results05/'.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    print(city + '...')
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)

    # models, model_names, test and datanames are index-aligned: entry k of each
    # list refers to the same estimator.
    models = [
        RS_sklearn(RidgeCV()),
        RS_sklearn(LogisticRegressionCV(class_weight='balanced')),
    ]
    model_names = ['Ridge regression', 'Logistic regression']
    # Per-model run flags. NOTE: this list is not pickled; the fitting cell
    # reads it from the kernel namespace.
    test = [True, True]
    datanames = [data_dir + 'results05/' + str(k) + '.pkl' for k, _ in enumerate(models)]

    # Persist the three lists so that later cells can reload them per city.
    for rel_path, payload in (
        ('results05/models.pkl', models),
        ('results05/model_names.pkl', model_names),
        ('results05/datanames.pkl', datanames),
    ):
        IO(data_dir + rel_path).to_pickle(payload)

Champaign...
Cleveland...
Pittsburgh...
Toronto...
Las_Vegas...
Full...
Wall time: 17 ms


In [None]:
%%time

# Fitting pass: for every city, fit each pickled estimator on the training
# split, attach its r2 score on the cv split as `cv_r2`, pickle the fitted
# estimator to its entry in `datanames`, and record one success flag per model
# in 'results05/is_successful.pkl'.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    print(city + '...')
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)

    is_successful = []

    datanames = IO(data_dir + 'results05/datanames.pkl').read_pickle()
    models = IO(data_dir + 'results05/models.pkl').read_pickle()
    model_names = IO(data_dir + 'results05/model_names.pkl').read_pickle()
    X_train, y_train, X_test, y_test, X_cv, y_cv = IO(data_dir + '05_data_split.pkl').read_pickle()

    # `test` is only bound by the model-definition cell and is never pickled.
    # On a fresh kernel it does not exist; previously the resulting NameError
    # was swallowed below and every model was reported as failed. Fall back to
    # running every model in that case.
    try:
        run_flags = list(test)
    except NameError:
        run_flags = [True] * len(models)

    for i, model in enumerate(models):
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running fit instead
        # of being logged as a model failure.
        try:
            print(model_names[i] + '...')
            if not run_flags[i]:
                print('Estimator not tested')
                is_successful.append(False)
                print()
                continue
            model.fit(X_train, y_train)
            print(model_names[i] + ' fitting successful.')
            model.cv_r2 = model.score(X_cv, y_cv, scoring='r2')
            print(model_names[i] + ' cv r2 calculation successful.')
            try:
                IO(datanames[i]).to_pickle(model)
                print('Saving to pickle successful.')
                saved = True
            except Exception:
                traceback.print_exc()
                print('Saving to pickle failed.')
                saved = False
            # Later cells load datanames[i] whenever the flag is True, so a
            # model whose pickle could not be written must not count as a success.
            is_successful.append(saved)
            print()
        except Exception:
            traceback.print_exc()
            print(model_names[i] + ' failed.')
            is_successful.append(False)
            print()

        # `del model` alone only unbinds the loop variable; the list would keep
        # the fitted estimator alive. Clear the list slot as well so the
        # estimator can be garbage-collected before the next one is fitted.
        models[i] = None
        del model

    IO(data_dir + 'results05/is_successful.pkl').to_pickle(is_successful)

Champaign...
Ridge regression...
Ridge regression fitting successful.
Ridge regression cv r2 calculation successful.
Saving to pickle successful.

Logistic regression...
Logistic regression fitting successful.
Logistic regression cv r2 calculation successful.
Saving to pickle successful.

Cleveland...
Ridge regression...
Ridge regression fitting successful.
Ridge regression cv r2 calculation successful.
Saving to pickle successful.

Logistic regression...
Logistic regression fitting successful.
Logistic regression cv r2 calculation successful.
Saving to pickle successful.

Pittsburgh...
Ridge regression...
Ridge regression fitting successful.
Ridge regression cv r2 calculation successful.
Saving to pickle successful.

Logistic regression...
Logistic regression fitting successful.
Logistic regression cv r2 calculation successful.
Saving to pickle successful.

Toronto...
Ridge regression...
Ridge regression fitting successful.
Ridge regression cv r2 calculation successful.
Saving to pick

In [None]:
%%time

# Evaluation pass: for every city, reload each successfully fitted model, run
# get_results on the train/test/cv splits and pickle the per-model results
# (None for models flagged as unsuccessful) to 'results05/results.pkl'.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    print(city + '...')
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)

    is_successful = IO(data_dir + 'results05/is_successful.pkl').read_pickle()
    datanames = IO(data_dir + 'results05/datanames.pkl').read_pickle()
    model_names = IO(data_dir + 'results05/model_names.pkl').read_pickle()
    X_train, y_train, X_test, y_test, X_cv, y_cv = IO(data_dir + '05_data_split.pkl').read_pickle()

    results = []
    for idx, succeeded in enumerate(is_successful):
        print(model_names[idx] + '...')
        if not succeeded:
            # Keep `results` index-aligned with the model lists.
            results.append(None)
            continue

        fitted = IO(datanames[idx]).read_pickle()
        model_results = get_results(
            fitted,
            X_train=X_train, y_train=y_train,
            X_test=X_test, y_test=y_test,
            X_cv=X_cv, y_cv=y_cv,
        )
        results.append(model_results)
        # Unbind the loaded model before the next one is read.
        del fitted

    print('Done.')
    IO(data_dir + 'results05/results.pkl').to_pickle(results)
    print()

In [None]:
%%time

# Summary pass: for every city, render a heading with the data-set sizes and
# then the summaries of all content-filtering models.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    print(city + '...')
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)
    model_names = IO(data_dir + 'results05/model_names.pkl').read_pickle()
    results = IO(data_dir + 'results05/results.pkl').read_pickle()
    is_successful = IO(data_dir + 'results05/is_successful.pkl').read_pickle()
    sizes = IO(data_dir + 'sizes.pkl').read_pickle()

    # The first three entries of `sizes` fill the review / restaurant / user
    # counts of the heading, in that order.
    n_reviews, n_restaurants, n_users = sizes[0], sizes[1], sizes[2]
    heading = '## {} <sup>({} reviews, {} restaurants, {} users)</sup>'.format(
        city, n_reviews, n_restaurants, n_users)

    display(Markdown(heading))
    display(Markdown('**Content filtering**'))
    show_summaries(model_names, results, is_successful)


In [None]:
%%time

# Detail pass: for every city, reload each successfully fitted model and show
# its detailed results (including the cv results) via show_results.
cities = ['Champaign', 'Cleveland', 'Pittsburgh', 'Toronto', 'Las_Vegas', 'Full']

for city in cities:
    display(Markdown('## ' + city))
    fig_dir = 'figs/modeling/{}/'.format(city)
    data_dir = 'data/{}/'.format(city)

    is_successful = IO(data_dir + 'results05/is_successful.pkl').read_pickle()
    datanames = IO(data_dir + 'results05/datanames.pkl').read_pickle()
    model_names = IO(data_dir + 'results05/model_names.pkl').read_pickle()
    results = IO(data_dir + 'results05/results.pkl').read_pickle()
    X_train, y_train, X_test, y_test, X_cv, y_cv = IO(data_dir + '05_data_split.pkl').read_pickle()

    for idx, succeeded in enumerate(is_successful):
        # Models flagged as unsuccessful are skipped.
        if not succeeded:
            continue

        fitted = IO(datanames[idx]).read_pickle()
        show_results(
            fitted, model_names[idx],
            X_train=X_train, y_train=y_train,
            X_test=X_test, y_test=y_test,
            results=results[idx], show_cv=True,
        )
        # Unbind the loaded model before the next one is read.
        del fitted