In [1]:
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import RidgeCV
from sklearn import preprocessing

from scipy import sparse

import pickle

from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
# Raise pandas' display limits (output width in characters, maximum number of
# columns shown) so that the wide merged frames below are shown with fewer cuts.
pd.set_option('display.width', 15000)
pd.set_option('display.max_columns', 100)
# Seaborn appearance: "whitegrid" style with the axes grid turned off, and the
# "poster" plotting context.
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set_context('poster')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from surprise import Dataset, Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, \
SlopeOne, CoClustering, KNNBasic, KNNWithMeans, KNNBaseline

In [3]:
from recommender import plot_cm, IO
from recommender import ModeClassifier, BaselineMean, BaselineRegression, ALS1, ALS2, RS_surprise

In [5]:
%%time
# Load data
# (%%time is a cell magic and has to remain the first line of this cell; it
# produces the "CPU times / Wall time" report shown under the cell.)

# Figure size and figure output directory; neither is used inside this cell.
figsize = (7.5, 6)
fig_dir = 'figs/EDA/Full/'

# All inputs are read from data_dir, a path relative to the working directory.
data_dir = 'data/Full/'
# dfb: business table, joined on `business_id` in a later cell.
dfb = pd.read_csv(data_dir + 'business.csv')
# dfr: review table; not referenced again in the cells visible here.
dfr = pd.read_csv(data_dir + 'review.csv')
# dfu: user table, joined on `user_id` in a later cell.
dfu = pd.read_csv(data_dir + 'user.csv')
# datar: the frame the later merges start from (carries user_id, business_id
# and the star rating that becomes `stars_x` after the merge).
datar = pd.read_csv(data_dir + 'data_review.csv')

CPU times: user 57 s, sys: 12.7 s, total: 1min 9s
Wall time: 1min 24s


In [6]:
# Attach the business columns to every review row (inner join on business_id).
# Both inputs have a `stars` column, so pandas suffixes them: `stars_x` comes
# from the left frame (datar) and `stars_y` from the business table.
df_br = pd.merge(left=datar, right=dfb, how="inner", on="business_id")
df_br.head()

Unnamed: 0,user_id,business_id,stars_x,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,hours.Sunday,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,address,hours.Tuesday,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,neighborhood,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.WiFi,hours.Saturday,attributes.GoodForDancing,attributes.AgesAllowed,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,state,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,categories,attributes.Ambience.classy,hours.Friday,attributes.NoiseLevel,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,city,attributes.OutdoorSeating,hours.Wednesday,attributes.HasTV,attributes.BestNights.monday,attributes.HappyHour,attributes.GoodForMeal.latenight,attributes.DietaryRestrictions.soy-free,attributes.DietaryRestrictions.vegan,attributes.Caters,hours.Thursday,attributes.RestaurantsGoodForGroups,attributes.RestaurantsDelivery,attributes.Smoking,attributes.BusinessAcceptsBitcoin,attributes.RestaurantsTakeOut,attributes.BestNights.wednesday,attributes.BusinessParking.valet,attributes.BestNights.friday,hours.Monday,attributes.BYOB,attributes.GoodForMeal.lunch,attributes.GoodForMeal.breakfast,name,longitude,attributes.Music.video,attributes.Ambience.touristy,attributes.RestaurantsTableService,attributes.RestaurantsCounterService,attributes.BYOBCorkage,attributes.Music.no_music,postal_code,attributes.RestaurantsAttire,attri
butes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCreditCards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,review_count,attributes.Alcohol,latitude
0,0,0,2,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',,b'10:00-21:00',,,,,,,,b'10:00-21:00',,,,,,,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716
1,1,0,2,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',,b'10:00-21:00',,,,,,,,b'10:00-21:00',,,,,,,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716
2,2,0,2,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',,b'10:00-21:00',,,,,,,,b'10:00-21:00',,,,,,,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716
3,3,0,3,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',,b'10:00-21:00',,,,,,,,b'10:00-21:00',,,,,,,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716
4,4,0,2,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',,b'10:00-21:00',,,,,,,,b'10:00-21:00',,,,,,,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716


In [7]:
# Attach the user columns to every review+business row (inner join on user_id).
# Column names shared by both sides get the default _x / _y suffixes
# (e.g. name_x / name_y, review_count_x / review_count_y in the output below).
df_ubr = pd.merge(left=df_br, right=dfu, how="inner", on="user_id")
df_ubr.head()

Unnamed: 0,user_id,business_id,stars_x,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,hours.Sunday,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,address,hours.Tuesday,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,neighborhood,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.WiFi,hours.Saturday,attributes.GoodForDancing,attributes.AgesAllowed,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,state,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,categories,attributes.Ambience.classy,hours.Friday,attributes.NoiseLevel,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,city,...,attributes.BusinessParking.valet,attributes.BestNights.friday,hours.Monday,attributes.BYOB,attributes.GoodForMeal.lunch,attributes.GoodForMeal.breakfast,name_x,longitude,attributes.Music.video,attributes.Ambience.touristy,attributes.RestaurantsTableService,attributes.RestaurantsCounterService,attributes.BYOBCorkage,attributes.Music.no_music,postal_code,attributes.RestaurantsAttire,attributes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCreditCards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,review_count_x,attributes.Alcohol,latitude,yelping_since,compliment_profile,name_y,compliment_cute,average_stars,compliment_ho
t,review_count_y,compliment_plain,funny,fans,compliment_note,elite,compliment_funny,compliment_list,compliment_more,useful,friends,compliment_writer,cool,compliment_cool,compliment_photos
0,0,0,2,,,False,,,b'11:00-18:00',,,,,,1,b'691 Richmond Rd',b'10:00-21:00',,True,,,,True,b'',,,2.0,,,b'10:00-21:00',,,True,,2.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,False,b'Richmond Heights',...,False,,b'10:00-21:00',,,,b'Richmond Town Square',-81.493116,,,,,,,b'44143',,,,,,,,,,,,17,,41.541716,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
1,0,1729,4,,,False,,,b'11:00-18:00',,,,,,1,b'189 Crocker Park Blvd',b'10:00-21:00',,False,,,,True,b'',,,4.0,,,b'10:00-21:00',,,True,,3.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,b'10:00-21:00',,,,,True,b'Westlake',...,False,,b'10:00-21:00',,,,b'Crocker Park',-81.951925,,,,,,,b'44145',,,,,,,,,,,,87,,41.459654,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
2,0,2452,2,False,,False,False,False,,,,False,,True,1,b'2100 Superior Viaduct',,,False,,,,True,b'West Bank',False,,4.0,,b'free',,,,,True,3.0,False,b'OH',False,,,,"['Restaurants', 'Italian']",True,,b'average',,,,False,b'Cleveland',...,True,,,,False,False,b'Luca Italian Cuisine',-81.704727,,False,True,,,,b'44113',b'dressy',False,,,False,False,,True,,False,,115,b'full_bar',41.494287,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
3,0,4054,2,,,False,,,,,,,,,1,b'2101 Richmond Rd',,,False,,,,,b'',,,2.5,,,,,,True,,3.0,False,b'OH',,,,,"['Shopping', 'Shopping Centers']",,,,,,,False,b'Beachwood',...,False,,,,,,b'La Place Center',-81.496796,,,,,,,b'44122',,,,,,,,,,,,4,,41.500423,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
4,0,11850,3,,,,,True,b'12:00-22:00',,,,,,1,b'13116 Shaker Sq',b'12:00-22:00',,,,,,,b'',,,3.5,,,b'12:00-22:00',,,,,,,b'OH',,,,,"['Arts & Entertainment', 'Cinema']",,b'12:00-22:00',,,,,,b'Cleveland',...,,,b'12:00-22:00',,,,b'Shaker Square Cinemas',-81.591387,,,,,,,b'44120',,,,,,,,,,,,21,,41.483219,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11


In [8]:
# Regression target: the `stars_x` column of the merged frame. It is captured
# here because `stars_x` is removed from df_ubr by the later column drop.
y = df_ubr.loc[:, 'stars_x'].values

In [9]:
# Categorical business attributes that are one-hot encoded with pd.get_dummies
# in a later cell.
col_dummy = [
    'attributes.Alcohol',
    'attributes.AgesAllowed',
    'attributes.NoiseLevel',
    'attributes.WiFi',
    'attributes.Smoking',
    'attributes.RestaurantsAttire',
]

In [10]:
# Business attribute columns that alter_boolean() recodes to 1/0 in a later cell.
# NOTE(review): alter_boolean maps every value that is not equal to True to 0,
# so a column listed here that actually holds text categories would become all
# zeros -- confirm each of these columns is genuinely True/False.
col_boolean = [
    'attributes.RestaurantsDelivery',
    'attributes.DogsAllowed',
    'attributes.BYOB',
    'attributes.RestaurantsTableService',
    'attributes.RestaurantsCounterService',
    'attributes.Corkage',
    'attributes.BusinessAcceptsBitcoin',
    'attributes.WheelchairAccessible',
    'attributes.BusinessAcceptsCreditCards',
    'attributes.BusinessParking.lot',
    'attributes.DriveThru',
    'attributes.HasTV',
    'attributes.BusinessParking.street',
    'attributes.AcceptsInsurance',
    'attributes.BusinessParking.valet',
    'attributes.BYOBCorkage',
    'attributes.BusinessParking.garage',
    'attributes.ByAppointmentOnly',
    'attributes.Caters',
    'attributes.RestaurantsReservations',
    'attributes.RestaurantsTakeOut',
    'attributes.BikeParking',
    'attributes.OutdoorSeating',
    'attributes.BusinessParking.validated',
]

In [11]:
# Replace every missing value in the merged frame with 0, in all columns
# (including the text-valued attribute columns, as the output below shows).
# The name df_ubr is rebound, so the un-filled frame is no longer available.
df_ubr = df_ubr.fillna(value=0)
df_ubr.head()

Unnamed: 0,user_id,business_id,stars_x,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,hours.Sunday,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,address,hours.Tuesday,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,neighborhood,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.WiFi,hours.Saturday,attributes.GoodForDancing,attributes.AgesAllowed,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,state,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,categories,attributes.Ambience.classy,hours.Friday,attributes.NoiseLevel,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,city,...,attributes.BusinessParking.valet,attributes.BestNights.friday,hours.Monday,attributes.BYOB,attributes.GoodForMeal.lunch,attributes.GoodForMeal.breakfast,name_x,longitude,attributes.Music.video,attributes.Ambience.touristy,attributes.RestaurantsTableService,attributes.RestaurantsCounterService,attributes.BYOBCorkage,attributes.Music.no_music,postal_code,attributes.RestaurantsAttire,attributes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCreditCards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,review_count_x,attributes.Alcohol,latitude,yelping_since,compliment_profile,name_y,compliment_cute,average_stars,compliment_ho
t,review_count_y,compliment_plain,funny,fans,compliment_note,elite,compliment_funny,compliment_list,compliment_more,useful,friends,compliment_writer,cool,compliment_cool,compliment_photos
0,0,0,2,0,0,False,0,0,b'11:00-18:00',0,0,0,0,0,1,b'691 Richmond Rd',b'10:00-21:00',0,True,0,0,0,True,b'',0,0,2.0,0,0,b'10:00-21:00',0,0,True,0,2.0,False,b'OH',0,0,0,0,"['Shopping', 'Shopping Centers']",0,b'10:00-21:00',0,0,0,0,False,b'Richmond Heights',...,False,0,b'10:00-21:00',0,0,0,b'Richmond Town Square',-81.493116,0,0,0,0,0,0,b'44143',0,0,0,0,0,0,0,0,0,0,0,17,0,41.541716,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
1,0,1729,4,0,0,False,0,0,b'11:00-18:00',0,0,0,0,0,1,b'189 Crocker Park Blvd',b'10:00-21:00',0,False,0,0,0,True,b'',0,0,4.0,0,0,b'10:00-21:00',0,0,True,0,3.0,False,b'OH',0,0,0,0,"['Shopping', 'Shopping Centers']",0,b'10:00-21:00',0,0,0,0,True,b'Westlake',...,False,0,b'10:00-21:00',0,0,0,b'Crocker Park',-81.951925,0,0,0,0,0,0,b'44145',0,0,0,0,0,0,0,0,0,0,0,87,0,41.459654,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
2,0,2452,2,False,0,False,False,False,0,0,0,False,0,True,1,b'2100 Superior Viaduct',0,0,False,0,0,0,True,b'West Bank',False,0,4.0,0,b'free',0,0,0,0,True,3.0,False,b'OH',False,0,0,0,"['Restaurants', 'Italian']",True,0,b'average',0,0,0,False,b'Cleveland',...,True,0,0,0,False,False,b'Luca Italian Cuisine',-81.704727,0,False,True,0,0,0,b'44113',b'dressy',False,0,0,False,False,0,True,0,False,0,115,b'full_bar',41.494287,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
3,0,4054,2,0,0,False,0,0,0,0,0,0,0,0,1,b'2101 Richmond Rd',0,0,False,0,0,0,0,b'',0,0,2.5,0,0,0,0,0,True,0,3.0,False,b'OH',0,0,0,0,"['Shopping', 'Shopping Centers']",0,0,0,0,0,0,False,b'Beachwood',...,False,0,0,0,0,0,b'La Place Center',-81.496796,0,0,0,0,0,0,b'44122',0,0,0,0,0,0,0,0,0,0,0,4,0,41.500423,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11
4,0,11850,3,0,0,0,0,True,b'12:00-22:00',0,0,0,0,0,1,b'13116 Shaker Sq',b'12:00-22:00',0,0,0,0,0,0,b'',0,0,3.5,0,0,b'12:00-22:00',0,0,0,0,0.0,0,b'OH',0,0,0,0,"['Arts & Entertainment', 'Cinema']",0,b'12:00-22:00',0,0,0,0,0,b'Cleveland',...,0,0,b'12:00-22:00',0,0,0,b'Shaker Square Cinemas',-81.591387,0,0,0,0,0,0,b'44120',0,0,0,0,0,0,0,0,0,0,0,21,0,41.483219,b'2013-10-04',0,b'Lina',1,3.88,15,144,52,95,22,18,"[2016, 2014, 2015, 2013]",49,0,6,239,"['KVtNIIBxn10GVjPjpzv-OA', 'glmU056KJUV2nXfysJ...",9,63,49,11


In [12]:
# Columns removed before modelling: identifiers, the target (stars_x), and
# location / name / opening-hours / list-valued columns.
col_drop = [
    'user_id', 'business_id', 'stars_x', 'postal_code', 'latitude', 'categories', 'name_x',
    'neighborhood', 'review_count_x', 'state', 'address',
    'hours.Sunday', 'hours.Monday', 'hours.Tuesday', 'hours.Wednesday',
    'hours.Thursday', 'hours.Friday', 'hours.Saturday',
    'longitude', 'elite', 'friends', 'name_y', 'city',
]

## uncertain: neighborhood, state

# Remove the col_drop columns. `axis` is passed by keyword: the positional
# form `drop(col_drop, 1)` is no longer accepted by newer pandas releases,
# while `axis=1` works on old and new versions alike.
# NOTE(review): 'Unnamed: 0' is not in col_drop, so it is still present in the
# frame displayed below and ends up in the feature matrix -- confirm that this
# is intended.
df_ubr = df_ubr.drop(col_drop, axis=1)
df_ubr.head()

Unnamed: 0,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.WiFi,attributes.GoodForDancing,attributes.AgesAllowed,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,attributes.Ambience.classy,attributes.NoiseLevel,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,attributes.OutdoorSeating,attributes.HasTV,attributes.BestNights.monday,attributes.HappyHour,attributes.GoodForMeal.latenight,attributes.DietaryRestrictions.soy-free,attributes.DietaryRestrictions.vegan,attributes.Caters,attributes.RestaurantsGoodForGroups,attributes.RestaurantsDelivery,attributes.Smoking,attributes.BusinessAcceptsBitcoin,attributes.RestaurantsTakeOut,attributes.BestNights.wednesday,attributes.BusinessParking.valet,attributes.BestNights.friday,attributes.BYOB,attributes.GoodForMeal.lunch,attributes.GoodForMeal.breakfast,attributes.Music.video,attributes.Ambience.touristy,attributes.RestaurantsTableService,attributes.RestaurantsCounterService,attributes.BYOBCorkage,attributes.Music.no_music,attributes.RestaurantsAttire,attributes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCredit
Cards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,attributes.Alcohol,yelping_since,compliment_profile,compliment_cute,average_stars,compliment_hot,review_count_y,compliment_plain,funny,fans,compliment_note,compliment_funny,compliment_list,compliment_more,useful,compliment_writer,cool,compliment_cool,compliment_photos
0,0,0,False,0,0,0,0,0,0,0,1,0,True,0,0,0,True,0,0,2.0,0,0,0,0,True,0,2.0,False,0,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11
1,0,0,False,0,0,0,0,0,0,0,1,0,False,0,0,0,True,0,0,4.0,0,0,0,0,True,0,3.0,False,0,0,0,0,0,0,0,0,0,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11
2,False,0,False,False,False,0,0,False,0,True,1,0,False,0,0,0,True,False,0,4.0,0,b'free',0,0,0,True,3.0,False,False,0,0,0,True,b'average',0,0,0,False,True,False,0,0,False,0,0,False,True,False,0,0,True,0,True,0,0,False,False,0,False,True,0,0,0,b'dressy',False,0,0,False,False,0,True,0,False,0,b'full_bar',b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11
3,0,0,False,0,0,0,0,0,0,0,1,0,False,0,0,0,0,0,0,2.5,0,0,0,0,True,0,3.0,False,0,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11
4,0,0,0,0,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3.5,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11


In [13]:
# def fill_0(df_ubr, columns):
#     for column in columns:
#         df_ubr[column] = df_ubr[column].fillna(0)

In [14]:
# columns = ['attributes.Alcohol', 'attributes.AgesAllowed']
# fill_0(df_ubr, columns)
# df_ubr.head()

In [15]:
# One-hot encode the categorical attribute columns named in col_dummy.
# drop_first=True omits the first level of each column; after the earlier
# fillna(0) the output below shows no dummy for the filled-in 0, so rows whose
# attribute was missing have all of that attribute's dummies equal to 0.
df_ubr = pd.get_dummies(data=df_ubr, columns=col_dummy, drop_first=True)
df_ubr.head()

Unnamed: 0,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.GoodForDancing,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,attributes.Ambience.classy,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,attributes.OutdoorSeating,attributes.HasTV,attributes.BestNights.monday,attributes.HappyHour,attributes.GoodForMeal.latenight,attributes.DietaryRestrictions.soy-free,attributes.DietaryRestrictions.vegan,attributes.Caters,attributes.RestaurantsGoodForGroups,attributes.RestaurantsDelivery,attributes.BusinessAcceptsBitcoin,attributes.RestaurantsTakeOut,attributes.BestNights.wednesday,attributes.BusinessParking.valet,attributes.BestNights.friday,...,attributes.BYOBCorkage,attributes.Music.no_music,attributes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCreditCards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,yelping_since,compliment_profile,compliment_cute,average_stars,compliment_hot,review_count_y,compliment_plain,funny,fans,compliment_note,compliment_funny,compliment_list,compliment_more,useful,complimen
t_writer,cool,compliment_cool,compliment_photos,attributes.Alcohol_b'beer_and_wine',attributes.Alcohol_b'full_bar',attributes.Alcohol_b'none',attributes.AgesAllowed_b'18plus',attributes.AgesAllowed_b'19plus',attributes.AgesAllowed_b'21plus',attributes.AgesAllowed_b'allages',attributes.NoiseLevel_b'average',attributes.NoiseLevel_b'loud',attributes.NoiseLevel_b'quiet',attributes.NoiseLevel_b'very_loud',attributes.WiFi_b'free',attributes.WiFi_b'no',attributes.WiFi_b'paid',attributes.Smoking_b'no',attributes.Smoking_b'outdoor',attributes.Smoking_b'yes',attributes.RestaurantsAttire_b'casual',attributes.RestaurantsAttire_b'dressy',attributes.RestaurantsAttire_b'formal'
0,0,0,False,0,0,0,0,0,0,0,1,0,True,0,0,0,True,0,0,2.0,0,0,True,0,2.0,False,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,False,0,0,0,0,0,0,0,1,0,False,0,0,0,True,0,0,4.0,0,0,True,0,3.0,False,0,0,0,0,0,0,0,0,True,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,False,0,False,False,False,0,0,False,0,True,1,0,False,0,0,0,True,False,0,4.0,0,0,0,True,3.0,False,False,0,0,0,True,0,0,0,False,True,False,0,0,False,0,0,False,True,False,0,True,0,True,0,...,0,0,False,0,0,False,False,0,True,0,False,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0
3,0,0,False,0,0,0,0,0,0,0,1,0,False,0,0,0,0,0,0,2.5,0,0,True,0,3.0,False,0,0,0,0,0,0,0,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,False,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3.5,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
def alter_boolean(df_ubr, col_boolean):
    """Recode the given columns of ``df_ubr`` to integer 1/0 flags, in place.

    Each cell becomes 1 when it compares equal to ``True`` (so ``True`` and the
    number 1 both map to 1) and 0 for anything else (``False``, 0, strings, ...).

    Parameters
    ----------
    df_ubr : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    col_boolean : iterable of str
        Names of the columns to recode. A name that is not a column of
        ``df_ubr`` raises ``KeyError``.

    Returns
    -------
    None
        The frame is changed in place; nothing is returned.
    """
    def _as_flag(cell):
        # Equality (not identity) on purpose: matches True, numpy bools and 1.
        if cell == True:
            return 1
        return 0

    for name in col_boolean:
        recoded = df_ubr[name].apply(_as_flag)
        df_ubr[name] = recoded

In [18]:
# Recode the col_boolean columns of df_ubr to 1/0 (alter_boolean works in
# place), then preview the result.
alter_boolean(df_ubr=df_ubr, col_boolean=col_boolean)
df_ubr.head()

Unnamed: 0,attributes.GoodForMeal.dessert,attributes.Music.jukebox,attributes.BusinessParking.street,attributes.Ambience.casual,attributes.GoodForKids,attributes.Music.karaoke,attributes.Music.live,attributes.Ambience.intimate,attributes.ByAppointmentOnly,attributes.GoodForMeal.dinner,is_open,attributes.AcceptsInsurance,attributes.BusinessParking.lot,attributes.BestNights.thursday,attributes.Open24Hours,attributes.DietaryRestrictions.gluten-free,attributes.BikeParking,attributes.Ambience.upscale,attributes.DietaryRestrictions.halal,stars_y,attributes.DriveThru,attributes.GoodForDancing,attributes.WheelchairAccessible,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.BusinessParking.validated,attributes.GoodForMeal.brunch,attributes.DietaryRestrictions.dairy-free,attributes.Music.background_music,attributes.CoatCheck,attributes.Ambience.classy,attributes.Corkage,attributes.BestNights.saturday,attributes.DogsAllowed,attributes.BusinessParking.garage,attributes.OutdoorSeating,attributes.HasTV,attributes.BestNights.monday,attributes.HappyHour,attributes.GoodForMeal.latenight,attributes.DietaryRestrictions.soy-free,attributes.DietaryRestrictions.vegan,attributes.Caters,attributes.RestaurantsGoodForGroups,attributes.RestaurantsDelivery,attributes.BusinessAcceptsBitcoin,attributes.RestaurantsTakeOut,attributes.BestNights.wednesday,attributes.BusinessParking.valet,attributes.BestNights.friday,...,attributes.BYOBCorkage,attributes.Music.no_music,attributes.Ambience.romantic,attributes.BestNights.sunday,attributes.Music.dj,attributes.Ambience.hipster,attributes.Ambience.divey,attributes.DietaryRestrictions.kosher,attributes.BusinessAcceptsCreditCards,attributes.BestNights.tuesday,attributes.Ambience.trendy,attributes.DietaryRestrictions.vegetarian,yelping_since,compliment_profile,compliment_cute,average_stars,compliment_hot,review_count_y,compliment_plain,funny,fans,compliment_note,compliment_funny,compliment_list,compliment_more,useful,complimen
t_writer,cool,compliment_cool,compliment_photos,attributes.Alcohol_b'beer_and_wine',attributes.Alcohol_b'full_bar',attributes.Alcohol_b'none',attributes.AgesAllowed_b'18plus',attributes.AgesAllowed_b'19plus',attributes.AgesAllowed_b'21plus',attributes.AgesAllowed_b'allages',attributes.NoiseLevel_b'average',attributes.NoiseLevel_b'loud',attributes.NoiseLevel_b'quiet',attributes.NoiseLevel_b'very_loud',attributes.WiFi_b'free',attributes.WiFi_b'no',attributes.WiFi_b'paid',attributes.Smoking_b'no',attributes.Smoking_b'outdoor',attributes.Smoking_b'yes',attributes.RestaurantsAttire_b'casual',attributes.RestaurantsAttire_b'dressy',attributes.RestaurantsAttire_b'formal'
0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,2.0,0,0,1,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,4.0,0,0,1,0,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,False,0,0,False,False,0,0,False,0,True,1,0,0,0,0,0,1,False,0,4.0,0,0,0,1,3.0,0,False,0,0,0,True,0,0,0,0,1,0,0,0,False,0,0,0,True,0,0,1,0,1,0,...,0,0,False,0,0,False,False,0,1,0,False,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2.5,0,0,1,0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3.5,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,b'2013-10-04',0,1,3.88,15,144,52,95,22,18,49,0,6,239,9,63,49,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
#df_all = pd.DataFrame()
# `yelping_since` holds text such as "b'2013-10-04'" (see the rows displayed
# above), so characters 2..5 are the four-digit year. Store it as the number of
# years after 2005.
# NOTE: this line is not idempotent -- re-running the cell after the column has
# become integers raises a TypeError (ints cannot be sliced).
df_ubr['yelping_since'] = df_ubr['yelping_since'].apply(lambda x: int(x[2:6]) - 2005)

# The former `df_ubr['stars_y'].apply(lambda x: x)` was an identity mapping and
# has been removed; `stars_y` is used unchanged.

# True/False/0 attribute columns that are not part of col_boolean. They get the
# same 1/0 recoding through alter_boolean() instead of one copy-pasted line per
# column. 'GoodForMeal.dessert' and 'GoodForMeal.breakfast' were previously
# skipped although they contain True/False values like their siblings.
# NOTE(review): attributes.Open24Hours, attributes.CoatCheck,
# attributes.HappyHour, attributes.Music.no_music and
# attributes.DietaryRestrictions.gluten-free are also left unconverted; only 0
# appears for them in the displayed rows, so confirm their contents before
# adding them here.
col_flag = [
    'attributes.Ambience.divey',
    'attributes.Ambience.casual',
    'attributes.Ambience.classy',
    'attributes.Ambience.hipster',
    'attributes.Ambience.intimate',
    'attributes.Ambience.romantic',
    'attributes.Ambience.touristy',
    'attributes.Ambience.trendy',
    'attributes.Ambience.upscale',
    'attributes.BestNights.friday',
    'attributes.BestNights.monday',
    'attributes.BestNights.thursday',
    'attributes.BestNights.tuesday',
    'attributes.BestNights.wednesday',
    'attributes.BestNights.saturday',
    'attributes.BestNights.sunday',
    'attributes.DietaryRestrictions.dairy-free',
    'attributes.DietaryRestrictions.halal',
    'attributes.DietaryRestrictions.kosher',
    'attributes.DietaryRestrictions.soy-free',
    'attributes.DietaryRestrictions.vegan',
    'attributes.DietaryRestrictions.vegetarian',
    'attributes.GoodForDancing',
    'attributes.GoodForKids',
    'attributes.GoodForMeal.lunch',
    'attributes.GoodForMeal.brunch',
    'attributes.GoodForMeal.dinner',
    'attributes.GoodForMeal.latenight',
    'attributes.GoodForMeal.dessert',
    'attributes.GoodForMeal.breakfast',
    'attributes.RestaurantsGoodForGroups',
    'attributes.Music.background_music',
    'attributes.Music.dj',
    'attributes.Music.jukebox',
    'attributes.Music.karaoke',
    'attributes.Music.live',
    'attributes.Music.video',
]
alter_boolean(df_ubr, col_flag)


In [20]:
# Reproducible random split: with the global NumPy seed fixed, draw one uniform
# number per row; rows with a draw below 0.5 form the training set, the rest
# the test set (so the two halves are only approximately equal in size).
np.random.seed(6006)
msk = np.random.rand(len(df_ubr)) < 0.5

# Every column still present in df_ubr is used as a feature.
X = df_ubr.values
X_train, y_train = X[msk], y[msk]
X_test, y_test = X[~msk], y[~msk]

In [21]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both splits.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split would scale data that is already scaled.
std_scale = preprocessing.StandardScaler()
std_scale.fit(X_train)

X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)



In [22]:
# Ridge regression with built-in cross-validation over the penalty strength.
#
# The original call passed `normalize=True`, an argument that scikit-learn
# deprecated in 1.0 and removed in 1.2 (it now raises a TypeError). X_train was
# standardised in the previous cell, so the old behaviour is kept by following
# scikit-learn's migration guidance for that argument: use standardised
# features and multiply the penalties by the number of training samples.
# (0.1, 1.0, 10.0) are RidgeCV's default alphas, i.e. the grid the original
# call searched.
n_train = X_train.shape[0]
clf = RidgeCV(alphas=np.array([0.1, 1.0, 10.0]) * n_train, fit_intercept=True)
clf.fit(X_train, y_train)
# .score() is the coefficient of determination (R^2) on the given data.
print('Ridge Train Score', clf.score(X_train, y_train))
print('Ridge Test Score', clf.score(X_test, y_test))

Ridge Train Score 0.414401997864
Ridge Test Score 0.415773403987


In [23]:
# Both imports are kept so that the names stay bound for any later cell;
# mean_squared_error is already imported in the first cell.
import sklearn
from sklearn.metrics import mean_squared_error
# Mean squared error of the ridge model on the test split. The arguments are in
# the documented (y_true, y_pred) order; they were swapped before, which gives
# the same number for MSE but reads wrongly and breaks for asymmetric metrics.
mean_squared_error(y_test, clf.predict(X_test))

1.1825931213672025