# Yelp Data Challenge - Restaurant Recommender

BitTiger DS501

Nov 2017

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [65]:
# Restaurant reviews from the last two years; the path is relative to the notebook.
reviews_csv_path = 'yelp_dataset_challenge_round10/last_2_years_restaurant_reviews.csv'
df = pd.read_csv(reviews_csv_path)

In [66]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,1,2016-05-17,0,0Qc1THNHSapDL7cv-ZzW5g,5,What can I say.. Wowzers! Probably one of the ...,0,4LxKRRIikhr65GfPDW626w
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,0,2017-01-20,0,L8lo5SKXfZRlbn1bpPiC9w,5,Went here for guys weekend. Unbelievable. Ravi...,0,nT8zgjoc-PbdBoQsFEXFLw
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,52,2016-09-25,30,6eUT3IwwWPP3CZkAhxqOIw,5,"One word my friends: tableside!!! Yes, tablesi...",56,7RlyCglsIzhBn081inwvcg
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,1,2017-02-12,0,3cnTdE45VrsS0o4cVhfGog,3,"Located inside my favorite hotel Venetian, Del...",1,rOIrilMC7VFwFVBeQNiKMw
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,0,2016-10-30,0,tYrSbjX3QgZGBZuQ3n8g6w,5,"After the most incredible service, delicious m...",2,PiWlV_UC_-SXqyxQM9fAtw


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [59]:
# The recommender only needs three columns: which business was rated,
# by which user, and with how many stars.
selected_features = [u'business_id', u'user_id', u'stars']
df_recommender = df.loc[:, selected_features]

In [60]:
df_recommender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325819 entries, 0 to 325818
Data columns (total 3 columns):
business_id    325819 non-null object
user_id        325819 non-null object
stars          325819 non-null int64
dtypes: int64(1), object(2)
memory usage: 7.5+ MB


#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyway?

**A**: Fall back to a content-based or popularity-based recommender.

In [61]:
# TODO: excluding low-activity users is not implemented yet.
# For now this cell only counts missing values in each selected column.
df_recommender.isnull().sum()

business_id    0
user_id        0
stars          0
dtype: int64

#### Create utility matrix from records

In [67]:
# Build the user x business utility matrix of star ratings.
# pivot_table aggregates with the mean by default, so several reviews by the
# same user for the same business collapse to their average rating.
# Pairs without a review are filled with 0, i.e. 0 means "not rated".
df_utility = df_recommender.pivot_table(
    values='stars',
    index='user_id',
    columns='business_id',
    fill_value=0,
)

In [13]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,-Bv-HHUs8aHzDrdWcZHn8w,-C8sSrFqaCxp51pyo-fQLQ,-CQokjildrY7UZezXCdEBw,...,zkhBU5qW_zCy0q4OEtIrsA,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
---udAKDsn0yQXmzbWQNSw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--0sXNBv6IizZXuV-nl0Aw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--2bpE5vyR-2hAP7sZZ4lA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--2vR0DIsmQ6WfcSzKWigw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_utility.shape

(155937, 4383)

In [17]:
# Exclude users with too few reviews from the item-item similarity recommender.
#
# The .loc[...] result has to be assigned back: the previous version computed
# the filtered frame and then discarded it, so the filter never took effect.
#
# MIN_REVIEWS_PER_USER = 1 keeps every user with at least one non-zero rating,
# which is exactly what the original mask (~(df_utility == 0).all(axis=1))
# selected. Raise it to drop low-activity users; note that this changes row
# positions, so any positional user index used later refers to a different user.
MIN_REVIEWS_PER_USER = 1

# Number of rated (non-zero) businesses per user.
reviews_per_user = (df_utility != 0).sum(axis=1)
df_utility = df_utility.loc[reviews_per_user >= MIN_REVIEWS_PER_USER]
df_utility.shape

(155937, 4383)

## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modifications to accommodate the dense NumPy array.

In [36]:
import scipy
from scipy.sparse import csr_matrix

# Store the mostly-zero utility matrix in compressed sparse row format
# (rows follow df_utility's users, columns follow its businesses).
utility_mat = csr_matrix(df_utility.values)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# Item-item similarity matrix: transpose so that each row is one business's
# vector of user ratings, then compare every business with every other one.
items_by_users = utility_mat.T
item_sim_mat = cosine_similarity(items_by_users)

In [38]:
# Neighborhoods: for every item, the indexes of its most similar items.
neighborhood_size = 75

# Each row holds item indexes ordered from least to most similar to that row's item.
least_to_most_sim_indexes = item_sim_mat.argsort(axis=1)

# The last `neighborhood_size` columns of each row are the closest items.
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [39]:
# Let's pick a lucky user.
# NOTE: this integer is used below as a row position in utility_mat,
# not as a Yelp user_id string.
user_id = 111

In [41]:
from time import time

n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()

# Pull this user's ratings out of the sparse matrix once, as a flat dense
# vector, so the loop can use plain NumPy indexing instead of slicing the
# sparse matrix on every iteration.
user_ratings = utility_mat[user_id].toarray().ravel()
items_rated_by_this_user = utility_mat[user_id].nonzero()[1]

# Predicted rating per item; items without any usable neighbor stay at 0.
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    # Neighbors of this item that the user has actually rated.
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    similarities = item_sim_mat[item_to_rate, relevant_items]
    total_similarity = similarities.sum()
    # Skip items with no rated neighbor (or only zero-similarity neighbors).
    # The unguarded division produced 0/0 = NaN plus a RuntimeWarning here and
    # relied on nan_to_num to turn it back into 0.
    if total_similarity != 0:
        # Similarity-weighted average of the user's ratings, as a plain scalar.
        out[item_to_rate] = np.dot(user_ratings[relevant_items],
                                   similarities) / total_similarity

# Defensive only: the guard above already keeps NaN out of `out`.
pred_ratings = np.nan_to_num(out)
print (pred_ratings)
print ("Execution time: %f seconds" % (time()-start_time))

  del sys.path[0]


[ 0.  0.  0. ...,  0.  0.  0.]
Execution time: 16.450273 seconds


In [42]:
# Recommend n restaurants
n = 10

# Item indexes sorted by predicted rating, lowest first.
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Items this user has already rated.
items_rated_by_this_user = utility_mat[user_id].nonzero()[1]

# Use a set for the membership test: `item not in <numpy array>` rescans the
# whole array for every candidate item.
rated_item_set = set(items_rated_by_this_user.tolist())

# Exclude the items the user has already rated, keeping the sort order.
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in rated_item_set]

# The list is in ascending order, so the strongest recommendation is the LAST entry.
unrated_items_by_pred_rating[-n:]

[800, 2218, 2308, 1298, 2566, 3959, 219, 3583, 72, 2854]

In [57]:
from numpy import array

# Translate the recommended column positions back into Yelp business ids:
# the columns of df_utility are the business ids, in matrix column order.
business_ids = df_utility.columns
index = [business_ids[item] for item in unrated_items_by_pred_rating[-n:]]
index

['AtD6B83S4Mbmq0t7iDnUVA',
 'VP10-f6y0I94fUh1xY2Ruw',
 'WasMhp4Me2pjbWMoKIWDpw',
 'I22wQGTkLFGn6jTOhwoecQ',
 'ZyOLevFrV7Vxi0OOS3lf-w',
 'taaYWCkUulZL56qZmS1VzQ',
 '2IvrdAb6zdxr3ZqplqJHbg',
 'nt2-Zk4FmGY2SYSDBI0gHw',
 '06MlxbtB4ZYeg_ri02RIAQ',
 'dYpnpMZowfGsCisHhABPZg']

In [64]:
# Look up a display name for each recommended business id, taking the name
# from the first review row of that business in the original dataframe.
business_name = [
    df.loc[df['business_id'] == business_id, 'name'].iloc[0]
    for business_id in index
]
business_name

['Veggie House',
 'Tropical Smoothie Cafe',
 'Tokyo Grill Express',
 "Pepe's Tacos",
 'Fork & Burger',
 'Jack In The Box',
 'Jose Cuervo Tequileria',
 'Durango Taco Shop',
 'El Pollo Loco',
 "Samurai Sam's"]

## 3. Matrix Factorization recommender

Take a look at Graphlab Create examples

## 1) Load your data in Dato's SFrame type.

In [3]:
import numpy as np
import graphlab;
import pandas as pd
import matplotlib.pyplot as plt

In [71]:
# Wrap the three rating columns in a GraphLab SFrame for the recommender below.
sf = graphlab.SFrame(df_recommender[['business_id', 'user_id', 'stars']])

## 2) Create a matrix factorization model.

In [72]:
# Fit a factorization recommender on the star ratings using the ALS solver,
# with users identified by 'user_id' and items by 'business_id'.
# NOTE(review): no random_seed is passed here (the regularization cell further
# down does pass one), so fitted values may differ between runs - confirm.
rec = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False)

## 3) Call the `predict` method on your input data to get the predicted rating of the user in the 1st row for the business in the 100th row.

In [80]:
sf[99]['business_id']

'--9e1ONYQuAa-CB_Rrw7Tw'

In [82]:
sf[0]['user_id']

'4LxKRRIikhr65GfPDW626w'

In [83]:
# Single (user, business) pair to score: the user of row 0 and the business of row 99 of sf.
one_datapoint_sf = graphlab.SFrame({'user_id': [sf[0]['user_id']], 'business_id': [sf[99]['business_id']]})

In [84]:
one_datapoint_sf

business_id,user_id
--9e1ONYQuAa-CB_Rrw7Tw,4LxKRRIikhr65GfPDW626w


In [85]:
print "rating:", rec.predict(one_datapoint_sf)[0]

rating: 4.97739696826


## 4) On the returned model object, call the list_fields method to see what kind of data is stored for your model

In [15]:
rec.list_fields()

['adagrad_momentum_weighting',
 'additional_iterations_if_unhealthy',
 'binary_target',
 'coefficients',
 'data_load_time',
 'init_random_sigma',
 'item_id',
 'item_side_data_column_names',
 'item_side_data_column_types',
 'linear_regularization',
 'max_iterations',
 'model_name',
 'nmf',
 'num_factors',
 'num_features',
 'num_item_side_features',
 'num_items',
 'num_observations',
 'num_tempering_iterations',
 'num_user_side_features',
 'num_users',
 'observation_data_column_names',
 'random_seed',
 'regularization',
 'regularization_type',
 'sgd_convergence_interval',
 'sgd_convergence_threshold',
 'sgd_max_trial_iterations',
 'sgd_sampling_block_size',
 'sgd_step_adjustment_interval',
 'sgd_step_size',
 'sgd_trial_sample_minimum_size',
 'sgd_trial_sample_proportion',
 'side_data_factorization',
 'solver',
 'step_size_decrease_rate',
 'target',
 'tempering_regularization_start_value',
 'track_exact_loss',
 'training_rmse',
 'training_stats',
 'training_time',
 'user_id',
 'user_side_

## 5) Inspect the output of `get('coefficients')` to see what information your model uses.

In [16]:
rec['coefficients'] 

{'business_id': Columns:
 	business_id	str
 	linear_terms	float
 	factors	array
 
 Rows: 4383
 
 Data:
 +------------------------+--------------+-------------------------------+
 |      business_id       | linear_terms |            factors            |
 +------------------------+--------------+-------------------------------+
 | --9e1ONYQuAa-CB_Rrw7Tw |     0.0      | [0.00316447019577, 0.20009... |
 | -3zffZUHoY8bQjGfPSoBKQ |     0.0      | [-0.451097548008, -0.40617... |
 | -8R_-EkGpUhBk55K9Dd4mg |     0.0      | [0.785061597824, 1.1239950... |
 | -9YyInW1wapzdNZrhQJ9dg |     0.0      | [0.385428905487, 0.7031893... |
 | -AD5PiuJHgdUcAK-Vxao2A |     0.0      | [0.835258662701, -0.677945... |
 | -Bf8BQ3yMk8U2f45r2DRKw |     0.0      | [-0.914222419262, -0.20376... |
 | -BmqghX1sv7sgsxOIS2yAg |     0.0      | [-0.343733400106, 0.421559... |
 | -Bv-HHUs8aHzDrdWcZHn8w |     0.0      | [0.237223729491, 0.3331679... |
 | -C8sSrFqaCxp51pyo-fQLQ |     0.0      | [-0.599766373634, -0.23369...

In [17]:
##check the dimensionality of restaurants and a user array in the coefficients.default latent features: 8
rest_sf = rec['coefficients']['business_id']
print len(rest_sf)
print len(rest_sf['factors'][0])
user_sf = rec['coefficients']['user_id']
print len(user_sf)
print len(user_sf['factors'][0])

4383
8
155937
8


## 6) Without using the `predict` method, compute the predicted rating of user 1 for restaurant 100

In [20]:
# Read the fitted global intercept and print it next to the mean star rating
# in sf so the two values can be compared.
intercept = rec['coefficients']['intercept']
print "intercept:", intercept
print "average:", np.average(sf['stars'])

 intercept: 3.80383587206
average: 3.80383587206


In [86]:
##compare with result in step 3, expectes value rating: 4.97739696826
rest_array = rest_sf[rest_sf['business_id'] == sf[99]['business_id']]['factors'][0]
user_array = user_sf[user_sf['user_id'] == sf[0]['user_id']]['factors'][0]
print "rating:", np.dot(rest_array, user_array) + intercept    

rating: 4.95552938322


In [21]:
#RMSE reported by the model diagnostics is correct
from sklearn.metrics import mean_squared_error

predictions = rec.predict(sf)
rmse = np.sqrt(mean_squared_error(sf['stars'], predictions))

print "graphlab's reported rmse:", rec['training_rmse']
print "calculated rmse:", rmse

graphlab's reported rmse: 0.211406392649
calculated rmse: 0.211406392649


In [23]:
pd.Series(sf['stars']).describe()

count    325819.000000
mean          3.803836
std           1.427228
min           1.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
dtype: float64

## 7) Regularization

In [24]:
random_seed = 0
rec2 = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False,
            regularization=0,
            random_seed=random_seed)
print "training rmse with regularization 0:", rec2['training_rmse']   # 0.725

regularization_param = 1e-4
rec3 = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False,
            regularization=regularization_param,
            random_seed=random_seed) 
print "training rmse with regularization %s:"%regularization_param, rec3['training_rmse']

training rmse with regularization 0: 0.209119928069


training rmse with regularization 0.0001: 1.38608454068


In [98]:
## Tune parameters to get the best model
# Split the ratings into 5 folds for cross-validation.
kfolds = graphlab.cross_validation.KFold(sf, 5)

# Arguments passed to every candidate model.
params = {
    'user_id': 'user_id',
    'item_id': 'business_id',
    'target': 'stars',
    'solver': 'als',
    'side_data_factorization': False,
}

paramsearch = graphlab.model_parameter_search.create(
    kfolds,
    graphlab.recommender.factorization_recommender.create,
    params)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Feb-04-2018-16-27-3200000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Feb-04-2018-16-27-3200000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Feb-04-2018-16-27-3200000' already exists. Renaming the job to 'Model-Parameter-Search-Feb-04-2018-16-27-3200000-2089d'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Feb-04-2018-16-27-3200000-2089d' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Feb-04-2018-16-27-3200000-2089d' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Feb-04-2018-16-27-3200001' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Feb-04-2018-16-27

In [108]:
paramsearch.get_status()

{'Canceled': 0, 'Completed': 50, 'Failed': 0, 'Pending': 0, 'Running': 0}

In [109]:
##print best model by different metrics
##runing for hours 
from pprint import pprint

print "best params by recall@5:"
pprint(paramsearch.get_best_params('mean_validation_recall@5'))

print "best params by precision@5:"
pprint(paramsearch.get_best_params('mean_validation_precision@5'))

print "best params by rmse:"
pprint(paramsearch.get_best_params('mean_validation_rmse'))

best params by recall@5:
{'item_id': 'business_id',
 'linear_regularization': 1e-09,
 'max_iterations': 25,
 'num_factors': 16,
 'regularization': 1e-07,
 'side_data_factorization': False,
 'solver': 'als',
 'target': 'stars',
 'user_id': 'user_id'}
best params by precision@5:
{'item_id': 'business_id',
 'linear_regularization': 1e-09,
 'max_iterations': 25,
 'num_factors': 16,
 'regularization': 1e-07,
 'side_data_factorization': False,
 'solver': 'als',
 'target': 'stars',
 'user_id': 'user_id'}
best params by rmse:
{'item_id': 'business_id',
 'linear_regularization': 1e-09,
 'max_iterations': 25,
 'num_factors': 8,
 'regularization': 1e-08,
 'side_data_factorization': False,
 'solver': 'als',
 'target': 'stars',
 'user_id': 'user_id'}


## 4. Other recommenders (optional)

What are other ways you can build a better recommender?

* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)
* Popularity-based
* Content-based
* Hybrid

1) Distance: if we can locate the user, we can calculate the distance to each business from its location

2) Based on categories/attributes in the business data: content-based

3) Based on the number of reviews, average stars, or number of 5-star ratings in the business data: popularity-based

4) Hybrid: define a penalty for each of the multiple methods and calculate the cumulative result