In [46]:
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import RidgeCV
from sklearn import preprocessing

from scipy import sparse

import pickle

from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
pd.set_option('display.width', 15000)
pd.set_option('display.max_columns', 100)
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set_context('poster')
%matplotlib inline

In [2]:
from surprise import Dataset, Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, \
SlopeOne, CoClustering, KNNBasic, KNNWithMeans, KNNBaseline

In [3]:
from recommender import plot_cm, print_results, IO
from recommender import ModeClassifier, BaselineMean, BaselineRegression, ALS1, ALS2, RS_surprise

In [4]:
%%time
# Load data

figsize = (7.5, 6)
fig_dir = 'figs/modeling/Champaign/'

data_dir = 'data/Champaign/'
dfb = pd.read_pickle(data_dir + 'business.pkl')
dfr = pd.read_pickle(data_dir + 'review.pkl')
dfu = pd.read_pickle(data_dir + 'user.pkl')
datar = pd.read_pickle(data_dir + 'data_review.pkl')

CPU times: user 39.9 ms, sys: 31.9 ms, total: 71.8 ms
Wall time: 131 ms


In [5]:
df_br = datar.merge(dfb, left_on="business_id", right_on="business_id")
df_br.head()

Unnamed: 0,user_id,business_id,stars_x,attributes.Ambience.divey,attributes.RestaurantsDelivery,attributes.DogsAllowed,postal_code,hours.Thursday,attributes.BestNights.sunday,attributes.BYOB,attributes.AgesAllowed,attributes.Music.video,hours.Friday,latitude,attributes.Alcohol,attributes.Ambience.classy,attributes.RestaurantsTableService,attributes.Ambience.touristy,attributes.RestaurantsCounterService,attributes.Corkage,attributes.RestaurantsGoodForGroups,categories,name,attributes.BusinessAcceptsBitcoin,attributes.HappyHour,attributes.WheelchairAccessible,attributes.Ambience.hipster,attributes.BusinessAcceptsCreditCards,is_open,attributes.DietaryRestrictions.vegetarian,attributes.Music.live,attributes.Music.background_music,neighborhood,attributes.BusinessParking.lot,attributes.Music.karaoke,review_count,attributes.GoodForMeal.breakfast,attributes.NoiseLevel,state,attributes.DriveThru,attributes.HasTV,attributes.GoodForMeal.dinner,attributes.BusinessParking.street,address,attributes.RestaurantsAttire,hours.Sunday,attributes.BestNights.tuesday,attributes.AcceptsInsurance,attributes.BestNights.wednesday,hours.Wednesday,attributes.Open24Hours,attributes.Ambience.trendy,attributes.CoatCheck,hours.Monday,city,attributes.Music.no_music,hours.Tuesday,stars_y,attributes.RestaurantsPriceRange2,attributes.Ambience.intimate,attributes.GoodForMeal.latenight,attributes.GoodForMeal.dessert,attributes.BusinessParking.validated,attributes.GoodForMeal.lunch,attributes.GoodForKids,attributes.DietaryRestrictions.soy-free,attributes.GoodForMeal.brunch,attributes.BusinessParking.valet,longitude,attributes.DietaryRestrictions.gluten-free,attributes.BYOBCorkage,attributes.BusinessParking.garage,attributes.BestNights.friday,hours.Saturday,attributes.Music.dj,attributes.BestNights.saturday,attributes.Ambience.casual,attributes.BestNights.thursday,attributes.BestNights.monday,attributes.DietaryRestrictions.kosher,attributes.WiFi,attributes.Smoking,attributes.DietaryRestrictions.halal,attributes.GoodForDancing,attributes.ByAppointmentOnly,attributes.Caters,attributes.RestaurantsReservations,attributes.DietaryRestrictions.dairy-free,attributes.DietaryRestrictions.vegan,attributes.Ambience.romantic,attributes.Music.jukebox,attributes.Ambience.upscale,attributes.RestaurantsTakeOut,attributes.BikeParking,attributes.OutdoorSeating
0,1305,44,4,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,,,,,Champaign,,,1.5,,,,,,,,,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,
1,1306,44,1,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,,,,,Champaign,,,1.5,,,,,,,,,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,
2,1307,44,1,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,,,,,Champaign,,,1.5,,,,,,,,,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,
3,1308,44,1,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,,,,,Champaign,,,1.5,,,,,,,,,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,
4,1309,44,1,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,,,,,Champaign,,,1.5,,,,,,,,,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,


In [6]:
df_ubr = df_br.merge(dfu, left_on="user_id", right_on="user_id")
df_ubr.head()

Unnamed: 0,user_id,business_id,stars_x,attributes.Ambience.divey,attributes.RestaurantsDelivery,attributes.DogsAllowed,postal_code,hours.Thursday,attributes.BestNights.sunday,attributes.BYOB,attributes.AgesAllowed,attributes.Music.video,hours.Friday,latitude,attributes.Alcohol,attributes.Ambience.classy,attributes.RestaurantsTableService,attributes.Ambience.touristy,attributes.RestaurantsCounterService,attributes.Corkage,attributes.RestaurantsGoodForGroups,categories,name_x,attributes.BusinessAcceptsBitcoin,attributes.HappyHour,attributes.WheelchairAccessible,attributes.Ambience.hipster,attributes.BusinessAcceptsCreditCards,is_open,attributes.DietaryRestrictions.vegetarian,attributes.Music.live,attributes.Music.background_music,neighborhood,attributes.BusinessParking.lot,attributes.Music.karaoke,review_count_x,attributes.GoodForMeal.breakfast,attributes.NoiseLevel,state,attributes.DriveThru,attributes.HasTV,attributes.GoodForMeal.dinner,attributes.BusinessParking.street,address,attributes.RestaurantsAttire,hours.Sunday,attributes.BestNights.tuesday,attributes.AcceptsInsurance,attributes.BestNights.wednesday,hours.Wednesday,...,attributes.GoodForMeal.brunch,attributes.BusinessParking.valet,longitude,attributes.DietaryRestrictions.gluten-free,attributes.BYOBCorkage,attributes.BusinessParking.garage,attributes.BestNights.friday,hours.Saturday,attributes.Music.dj,attributes.BestNights.saturday,attributes.Ambience.casual,attributes.BestNights.thursday,attributes.BestNights.monday,attributes.DietaryRestrictions.kosher,attributes.WiFi,attributes.Smoking,attributes.DietaryRestrictions.halal,attributes.GoodForDancing,attributes.ByAppointmentOnly,attributes.Caters,attributes.RestaurantsReservations,attributes.DietaryRestrictions.dairy-free,attributes.DietaryRestrictions.vegan,attributes.Ambience.romantic,attributes.Music.jukebox,attributes.Ambience.upscale,attributes.RestaurantsTakeOut,attributes.BikeParking,attributes.OutdoorSeating,yelping_since,useful,compliment_photos,compliment_list,compliment_funny,compliment_plain,review_count_y,elite,fans,compliment_note,funny,compliment_writer,compliment_cute,average_stars,compliment_more,friends,compliment_hot,cool,name_y,compliment_profile,compliment_cool
0,1305,44,4,,,,61820,,,,,,,40.112407,,,,,,,,"[u'Property Management', u'Real Estate', u'Apa...",Professional Property Management,,,,,False,1,,,,,,,10,,,IL,,,,,401 E Springfield Ave,,,,,,,...,,,-88.233285,,,,,,,,,,,,,,,,False,,,,,,,,,,,2014-08-13,63,0,0,0,1,189,[],3,0,11,0,0,3.84,0,"[u'Azi0eHGZaVIZCgDuCzRlGw', u'7bmSgEJk3B4Bkt0E...",0,4,Francisco,0,0
1,1305,25959,5,False,False,,61820,11:00-21:00,,,,,11:00-21:00,40.110409,none,False,False,False,,,False,"[u'Restaurants', u'Mexican']",Maize Mexican Grill,,,,False,True,1,,,,,True,,448,False,loud,IL,,False,True,False,60 E Green St,casual,11:00-21:00,,,,11:00-21:00,...,False,False,-88.238955,,,False,,11:00-21:00,,,True,,,,no,,,,,True,False,,,False,,False,True,True,False,2014-08-13,63,0,0,0,1,189,[],3,0,11,0,0,3.84,0,"[u'Azi0eHGZaVIZCgDuCzRlGw', u'7bmSgEJk3B4Bkt0E...",0,4,Francisco,0,0
2,1305,59088,4,False,False,,61820,17:00-21:30,,False,,,16:30-22:30,40.118531,beer_and_wine,False,True,False,,,True,"[u'Sushi Bars', u'Restaurants', u'Japanese']",Sushi Kame,,,True,False,True,1,,,,,False,,131,False,quiet,IL,,False,True,True,132 W Church St,casual,12:00-21:00,,,,17:00-21:30,...,False,False,-88.244803,,no,False,,12:00-22:30,,,True,,,,no,,,,,False,True,,,False,,False,True,True,False,2014-08-13,63,0,0,0,1,189,[],3,0,11,0,0,3.84,0,"[u'Azi0eHGZaVIZCgDuCzRlGw', u'7bmSgEJk3B4Bkt0E...",0,4,Francisco,0,0
3,1305,68537,1,,,,61820,,,,,,,40.108412,,,,,,,,"[u'General Dentistry', u'Health & Medical', u'...",Champaign Dental Group,,,,,True,1,,,,,,,37,,,IL,,,,,703 S Neil St,,,,True,,,...,,,-88.243987,,,,,,,,,,,,,,,,True,,,,,,,,,,,2014-08-13,63,0,0,0,1,189,[],3,0,11,0,0,3.84,0,"[u'Azi0eHGZaVIZCgDuCzRlGw', u'7bmSgEJk3B4Bkt0E...",0,4,Francisco,0,0
4,1305,72429,2,,,,61822,9:00-19:00,,,,,9:00-19:00,40.139273,,,,,,,,"[u'Automotive', u'Car Dealers', u'Auto Repair']",The Auto Mall of Champaign,,,,,True,1,,,,,,,10,,,IL,,,,,1912 Moreland Blvd,,,,,,9:00-19:00,...,,,-88.249587,,,,,9:00-18:00,,,,,,,,,,,,,,,,,,,,,,2014-08-13,63,0,0,0,1,189,[],3,0,11,0,0,3.84,0,"[u'Azi0eHGZaVIZCgDuCzRlGw', u'7bmSgEJk3B4Bkt0E...",0,4,Francisco,0,0


In [7]:
y = df_ubr['stars_x'].values

In [61]:
df_all = pd.DataFrame()
df_all['lasting'] = df_ubr['yelping_since'].apply(lambda x: int(x[0:4]) - 2005)
df_all['stars_y'] = df_ubr['stars_y'].apply(lambda x: x)
df_all['ambience_divey'] = df_ubr['attributes.Ambience.divey'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_casual'] = df_ubr['attributes.Ambience.casual'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_classy'] = df_ubr['attributes.Ambience.classy'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_hipster'] = df_ubr['attributes.Ambience.hipster'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_intimate'] = df_ubr['attributes.Ambience.intimate'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_romantic'] = df_ubr['attributes.Ambience.romantic'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_touristy'] = df_ubr['attributes.Ambience.touristy'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_trendy'] = df_ubr['attributes.Ambience.trendy'].apply(lambda x: 1 if x == True else 0)
df_all['ambience_upscale'] = df_ubr['attributes.Ambience.upscale'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_5'] = df_ubr['attributes.BestNights.friday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_1'] = df_ubr['attributes.BestNights.monday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_4'] = df_ubr['attributes.BestNights.thursday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_2'] = df_ubr['attributes.BestNights.tuesday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_3'] = df_ubr['attributes.BestNights.wednesday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_6'] = df_ubr['attributes.BestNights.saturday'].apply(lambda x: 1 if x == True else 0)
df_all['bestnight_7'] = df_ubr['attributes.BestNights.sunday'].apply(lambda x: 1 if x == True else 0)
df_all['diary-free'] = df_ubr['attributes.DietaryRestrictions.dairy-free'].apply(lambda x: 1 if x == True else 0)
df_all['halal'] = df_ubr['attributes.DietaryRestrictions.halal'].apply(lambda x: 1 if x == True else 0)
df_all['kosher'] = df_ubr['attributes.DietaryRestrictions.kosher'].apply(lambda x: 1 if x == True else 0)
df_all['soy-free'] = df_ubr['attributes.DietaryRestrictions.soy-free'].apply(lambda x: 1 if x == True else 0)
df_all['vegan'] = df_ubr['attributes.DietaryRestrictions.vegan'].apply(lambda x: 1 if x == True else 0)
df_all['vegetarian'] = df_ubr['attributes.DietaryRestrictions.vegetarian'].apply(lambda x: 1 if x == True else 0)
df_all['good_dance'] = df_ubr['attributes.GoodForDancing'].apply(lambda x: 1 if x == True else 0)
df_all['good_kids'] = df_ubr['attributes.GoodForKids'].apply(lambda x: 1 if x == True else 0)
df_all['good_lunch'] = df_ubr['attributes.GoodForMeal.lunch'].apply(lambda x: 1 if x == True else 0)
df_all['good_brunch'] = df_ubr['attributes.GoodForMeal.brunch'].apply(lambda x: 1 if x == True else 0)
df_all['good_dinner'] = df_ubr['attributes.GoodForMeal.dinner'].apply(lambda x: 1 if x == True else 0)
df_all['good_latenight'] = df_ubr['attributes.GoodForMeal.latenight'].apply(lambda x: 1 if x == True else 0)
df_all['good_group'] = df_ubr['attributes.RestaurantsGoodForGroups'].apply(lambda x: 1 if x == True else 0)
df_all['background'] = df_ubr['attributes.Music.background_music'].apply(lambda x: 1 if x == True else 0)
df_all['dj'] = df_ubr['attributes.Music.dj'].apply(lambda x: 1 if x == True else 0)
df_all['jukebox'] = df_ubr['attributes.Music.jukebox'].apply(lambda x: 1 if x == True else 0)
df_all['karaoke'] = df_ubr['attributes.Music.karaoke'].apply(lambda x: 1 if x == True else 0)
df_all['live'] = df_ubr['attributes.Music.live'].apply(lambda x: 1 if x == True else 0)
df_all['video'] = df_ubr['attributes.Music.video'].apply(lambda x: 1 if x == True else 0)

df_all['accept_insurance'] = df_ubr['attributes.AcceptsInsurance'].apply(lambda x: 1 if x == True else 0)
df_all['drivu_thri'] = df_ubr['attributes.DriveThru'].apply(lambda x: 1 if x == True else 0)
df_all['bike_parking'] = df_ubr['attributes.BikeParking'].apply(lambda x: 1 if x == pd.isnull(x) else 0)

df_all['age_alloed'] = df_ubr['attributes.AgesAllowed'].apply(lambda x: 2 if x == '18plus' else x)
df_all['age_alloed'] = df_ubr['attributes.AgesAllowed'].apply(lambda x: 2 if x == '19plus' else x)
df_all['age_alloed'] = df_ubr['attributes.AgesAllowed'].apply(lambda x: 1 if x == '21plus' else x)
df_all['age_alloed'] = df_ubr['attributes.AgesAllowed'].apply(lambda x: 1 if x == 'allages' else 0)
df_all['wifi'] = df_ubr['attributes.WiFi'].apply(lambda x: 1 if x == 'paid' else 0)
df_all['attire'] = df_ubr['attributes.RestaurantsAttire'].apply(lambda x: 1 if x == 'formal' else 0)
df_all['noise'] = df_ubr['attributes.NoiseLevel'].apply(lambda x: 1 if x == 'very_loud' else 0)

df_all['fans'] = df_ubr['fans'].apply(lambda x: x)
df_all['review_count'] = df_ubr['review_count_y'].apply(lambda x: x)
df_all['average_stars'] = df_ubr['average_stars'].apply(lambda x: x)
df_all['useful'] = df_ubr['useful'].apply(lambda x: x)
df_all['funny'] = df_ubr['funny'].apply(lambda x: x)
df_all['cool'] = df_ubr['cool'].apply(lambda x: x)

df_all.head()

Unnamed: 0,lasting,stars_y,ambience_divey,ambience_casual,ambience_classy,ambience_hipster,ambience_intimate,ambience_romantic,ambience_touristy,ambience_trendy,ambience_upscale,bestnight_5,bestnight_1,bestnight_4,bestnight_2,bestnight_3,bestnight_6,bestnight_7,diary-free,halal,kosher,soy-free,vegan,vegetarian,good_dance,good_kids,good_lunch,good_brunch,good_dinner,good_latenight,good_group,background,dj,jukebox,karaoke,live,video,accept_insurance,drivu_thri,bike_parking,age_alloed,wifi,attire,noise,fans,review_count,average_stars,useful,funny,cool
0,9,1.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,189,3.84,63,11,4
1,9,4.5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,189,3.84,63,11,4
2,9,3.5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,189,3.84,63,11,4
3,9,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,189,3.84,63,11,4
4,9,1.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,189,3.84,63,11,4


In [62]:
np.random.seed(6006)
msk = np.random.rand(len(df_all)) < 0.5 
X = df_all.iloc[:,:].values
X_train = X[msk]
y_train = y[msk]
X_test = X[~msk]
y_test = y[~msk]

In [63]:
std_scale = preprocessing.StandardScaler().fit(X_train)

X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

In [64]:
clf = RidgeCV(fit_intercept=True, normalize=True)
clf.fit(X_train, y_train)
print('Ridge Train Score', clf.score(X_train, y_train)) 
print('Ridge Test Score', clf.score(X_test, y_test))

Ridge Train Score 0.425350153343
Ridge Test Score 0.42856852666


In [65]:
import sklearn
from sklearn.metrics import mean_squared_error
sklearn.metrics.mean_squared_error(clf.predict(X_test), y_test)

1.1828155304936303