# Yelp Data Challenge - Restaurant Recommender

BitTiger DS501-1802

May 2018 

Yan Wei

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [2]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id,date_new
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ,2016-03-31
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,0,2aeNFntqY2QDZLADNo8iQQ,2015-06-29
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-03-16,0,UVUMu_bELdA56Ryfbur-DA,5,Every year a group of us (we had 6 this year) ...,1,gmPP4YFrgYsYQqPYokMgFA,2015-03-16
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-02-10,0,UxFpgng8dPMWOj99653k5Q,5,Truly Fantastic! Best Steak ever. Service was...,0,aVOGlN9fZ-BXcbtj6dbf0g,2016-02-10
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g,2017-02-14


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [10]:
recommender_df = df[['business_id', 'user_id', 'stars']]

In [11]:
recommender_df.head()

Unnamed: 0,business_id,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,oFyOUOeGTRZhFPF9uTqrTQ,5
1,--9e1ONYQuAa-CB_Rrw7Tw,2aeNFntqY2QDZLADNo8iQQ,4
2,--9e1ONYQuAa-CB_Rrw7Tw,gmPP4YFrgYsYQqPYokMgFA,5
3,--9e1ONYQuAa-CB_Rrw7Tw,aVOGlN9fZ-BXcbtj6dbf0g,5
4,--9e1ONYQuAa-CB_Rrw7Tw,KC8H7qTZVPIEnanw9fG43g,5


In [13]:
# Number of unique users.
len(recommender_df['user_id'].unique())

227241

In [14]:
# Number of unique business. 
len(recommender_df['business_id'].unique())

4832

In [15]:
recommender_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515752 entries, 0 to 515751
Data columns (total 3 columns):
business_id    515752 non-null object
user_id        515752 non-null object
stars          515752 non-null int64
dtypes: int64(1), object(2)
memory usage: 11.8+ MB


#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways?
**A**: Recommend items by popularity could be a solution.

In [16]:
user_counts = recommender_df['user_id'].value_counts()

In [28]:
# Number of users who comment less than 5 times. 
user_counts[user_counts < 5].sum()

302669

In [40]:
# Save active users' index to a list. 
user_active = user_counts[user_counts >= 5].index.tolist()

In [51]:
len(user_active)

19772

In [44]:
user_active[:5]

['bLbSNkLggFnqwNNzzq-Ijw',
 'JaqcCU3nxReTW2cBLHounA',
 'PKEzKWv_FktMm2mGPjwd0Q',
 'B1829_hxXSEpDPEDJtYeIw',
 'U4INQZOPSUaj8hMjLlZ3KA']

In [45]:
# Exclude users who comment less than 5 times, save data in 'recommend_active_df'.
recommend_active_df = recommender_df[recommender_df.user_id.isin(user_active)]

#### Create utility matrix from records

In [48]:
# To be implemented

df_utility = pd.pivot_table(data = recommend_active_df,
                            values = 'stars',
                            index = 'user_id',
                            columns = 'business_id',
                            fill_value = 0)

In [49]:
df_utility.shape

(19772, 4593)

In [50]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--LUapetRSkZpFZ2d-MXLQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--RlSfc-QmcHFGHyX6aVjA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--ZNfWKj1VyVElRx6-g1fg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
-00kdEIhCt-ODaV4BS-EAg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modification to accommodate the dense numpy array

In [53]:
# Calculate item-item similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(df_utility.T)

In [54]:
item_sim.shape

(4593, 4593)

In [55]:
# Calculate neighborhood
least_to_most_sim_indexes = np.argsort(item_sim, axis = 1)

In [96]:
neighborhood_size = 500 

neighborhood = least_to_most_sim_indexes[:, -neighborhood_size:]

In [68]:
neighborhood.shape

(4593, 200)

In [95]:
# Make rating prediction on a user

user_id = 520

df_utility.iloc[user_id, :].nonzero()

(array([ 266,  528, 1175, 1496, 1880, 1947, 1963, 2037, 3203, 3746, 3774,
        4459, 4584]),)

In [100]:
type(item_sim)

numpy.ndarray

In [116]:
n_users = df_utility.shape[0]
n_items = df_utility.shape[1]

item_rated_by_the_user = df_utility.iloc[user_id, :].nonzero()[0]

out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhood[item_to_rate],
                                    item_rated_by_the_user,
                                    assume_unique = True)
    out[item_to_rate] = (df_utility.as_matrix()[user_id, relevant_items] * \
                        item_sim[user_id, relevant_items]).sum() / \
                        item_sim[user_id, relevant_items].sum()
            
pred_ratings = np.nan_to_num(out)
print(pred_ratings)

  # This is added back by InteractiveShellApp.init_path()


[ 4.          3.37834083  3.         ...,  3.          4.          4.        ]


In [164]:
# Set final recommendations number equal to 10 for a user.
n = 10

# Sort item index by pred_ratings.
item_index_sorted = pred_ratings.argsort()[::-1]

# Item rated by the user. 
item_rated_by_the_user = df_utility.iloc[user_id, :].nonzero()[0]



In [165]:
# Make recommendation list of businesses that have not been rated by the user. 
item_to_recommend = []
for item in item_index_sorted:
    if item not in item_rated_by_the_user:
        item_to_recommend.append(item)
        if len(item_to_recommend) == n:
            break
print(item_to_recommend)

[1127, 928, 3497, 3056, 1787, 1031, 2689, 3483, 3345, 4441]


In [166]:
# Print out associated business_id
pd.DataFrame(recommender_df['business_id'][item_to_recommend], columns = ['business_id'], index = item_to_recommend)

Unnamed: 0,business_id
1127,-Bf8BQ3yMk8U2f45r2DRKw
928,-ADtl9bLp8wNqYX1k3KuxA
3497,-U7tvCtaraTQ9b0zBhpBMA
3056,-Qkx7W0itbAApcG5lJuMFQ
1787,-IWsoxH7mLJTTpU5MmWY4w
1031,-ADtl9bLp8wNqYX1k3KuxA
2689,-PGsEXB6DFTVKa1eDOlzWA
3483,-U7tvCtaraTQ9b0zBhpBMA
3345,-U7tvCtaraTQ9b0zBhpBMA
4441,-Ylpy3VyRWwubf9dysuwjQ


## 3. Matrix Factorization recommender

##### Compare two of the methods demoed in Practice Class: sklearn NMF, sklearn TruncatedSVD, or GraphLab
##### *Extra points for using GraphLab

##### Prepare rating matrix 

In [144]:
df_utility.shape

(19772, 4593)

In [154]:
ratings_mat = df_utility.as_matrix()

##### NMF

In [156]:

from sklearn.decomposition import NMF
n_components = 200

nmf = NMF(n_components = 200)
nmf.fit(ratings_mat)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=200, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [157]:
# V = W * H, get W, H and error.
W = nmf.transform(ratings_mat)
H = nmf.components_
err = nmf.reconstruction_err_

print(err)
print(W.shape, H.shape)

1508.40501437
(19772, 200) (200, 4593)


In [160]:
# Reconstruct
ratings_mat_fitted = W.dot(H)
errs = np.array((ratings_mat - ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat).flatten()).squeeze() > 0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()

print(mse)
print(average_abs_err)

10.1508932859
2.5717541979


In [167]:
# Get top n recommendations for the user i. 
user_id = 520
n = 10

pred_ratings_by_useri = ratings_mat_fitted[user_id, :]

# Sort item index by pred_ratings.
item_index_sorted = pred_ratings_by_useri.argsort()[::-1]

# Item rated by the user. 
item_rated_by_the_user = df_utility.iloc[user_id, :].nonzero()[0] ## Difference between 【0】 and 【1】

# Predicted ratings of items not rated by the user.
item_to_recommend_nmf = [item for item in item_index_sorted if item not in item_rated_by_the_user]

# Recommend n items for user i. 
print('Bussiness recommended to user_id = %s:' % user_id)
print(item_to_recommend_nmf[:n])

Bussiness recommended to user_id = 520:
[4237, 3738, 610, 2500, 1021, 1406, 4289, 629, 39, 4419]


In [198]:
# Check errors of ratings of items rated by the user.

ratings_true = df_utility.as_matrix()[user_id, item_rated_by_the_user]

ratings_pred = pred_ratings_by_useri[item_rated_by_the_user]

err_one_user = ratings_true - ratings_pred

print('Mean Error of user_id = %s:' % user_id, abs(err_one_user).mean())
print(pd.DataFrame({'true_rating': ratings_true, 'pred_rating': ratings_pred}))

Mean Error of user_id = 520: 3.00468401218
    pred_rating  true_rating
0      0.066278            5
1      0.078042            4
2      0.038350            3
3      0.115079            1
4      4.927687            5
5      4.049087            4
6      0.062212            2
7      0.109439            4
8      0.195866            5
9      0.097756            4
10     0.097341            4
11     0.062489            4
12     0.137656            4


##### UVD/SVD

In [199]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 200, n_iter = 7, random_state = 0)
svd.fit(ratings_mat)

TruncatedSVD(algorithm='randomized', n_components=200, n_iter=7,
       random_state=0, tol=0.0)

In [200]:
# X = U * V

V = svd.components_
U = svd.transform(ratings_mat)

print(U.shape, V.shape)

(19772, 200) (200, 4593)


In [201]:
# Reconstruct
ratings_mat_fitted = U.dot(V)

errs = np.array((ratings_mat - ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat).flatten()).squeeze() > 0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()

print(mse)
print(average_abs_err)

9.09550132729
2.4959239724


In [202]:
# Get top n recommendations for the user i. 
user_id = 520
n = 10

pred_ratings_by_useri = ratings_mat_fitted[user_id, :]

# Sort item index by pred_ratings.
item_index_sorted = pred_ratings_by_useri.argsort()[::-1]

# Item rated by the user. 
item_rated_by_the_user = df_utility.iloc[user_id, :].nonzero()[0]

# Predicted ratings of items not rated by the user.
item_to_recommend_uvd = [item for item in item_index_sorted if item not in item_rated_by_the_user]

# Recommend n items for user i. 
print('Bussiness recommended to user_id = %s:' % user_id)
print(item_to_recommend_uvd[:n])

Bussiness recommended to user_id = 520:
[4237, 3820, 1966, 2500, 4036, 3813, 2749, 3107, 2143, 493]


In [205]:
# Check errors of ratings of items rated by the user.

ratings_true = df_utility.as_matrix()[user_id, item_rated_by_the_user]

ratings_pred = pred_ratings_by_useri[item_rated_by_the_user]

err_one_user = ratings_true - ratings_pred

print('Mean Error of user_id = %s:' % user_id, abs(err_one_user).mean())
print(pd.DataFrame({'true_rating': ratings_true, 'pred_rating': ratings_pred}))

Mean Error of user_id = 520: 3.11071117174
    pred_rating  true_rating
0      0.094367            5
1      0.427972            4
2      0.124837            3
3      0.166637            1
4      2.013691            5
5      4.024185            4
6      0.157256            2
7      0.810088            4
8      0.148762            5
9      0.226940            4
10     0.112524            4
11     0.156473            4
12     0.145390            4


In [204]:
# Find out common business between top 10 recommendations of NMF and UVD methods.
top10_common_recommend = np.intersect1d(item_to_recommend_nmf[:n], item_to_recommend_uvd[:n])

array([2500, 4237])

## 4. Other recommenders (optional)

What are other ways you can build a better recommender?

* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)
* Popularity-based
* Content-based
* Hybrid