# Yelp Data Challenge - Restaurant Recommender

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [2]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [3]:
df.head(2)

Unnamed: 0,funny,user_id,name,index,text,avg_stars,business_id,stars,date,useful,type,review_id,categories,cool
0,0,0XVzm4kVIAaH4eQAxWbhvw,Delmonico Steakhouse,0,I mainly went for the ceasar salad prepared ta...,4.0,--9e1ONYQuAa-CB_Rrw7Tw,1,2015-06-26,0,review,nCqdz-NW64KazpxqnDr0sQ,"[Steakhouses, Restaurants, Cajun/Creole]",0
1,0,2aeNFntqY2QDZLADNo8iQQ,Delmonico Steakhouse,4,Nice atmosphere and wonderful service. I had t...,4.0,--9e1ONYQuAa-CB_Rrw7Tw,4,2015-06-29,0,review,iwx6s6yQxc7yjS7NFANZig,"[Steakhouses, Restaurants, Cajun/Creole]",0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347619 entries, 0 to 347618
Data columns (total 14 columns):
funny          347619 non-null int64
user_id        347619 non-null object
name           347619 non-null object
index          347619 non-null int64
text           347619 non-null object
avg_stars      347619 non-null float64
business_id    347619 non-null object
stars          347619 non-null int64
date           347619 non-null object
useful         347619 non-null int64
type           347619 non-null object
review_id      347619 non-null object
categories     347619 non-null object
cool           347619 non-null int64
dtypes: float64(1), int64(5), object(8)
memory usage: 37.1+ MB


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [5]:
# Get business_id, user_id, stars for recommender
rating_df = df[["user_id","business_id", "stars"]]

#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways? it's like cold start problem, we can use content-based, populairity recommendations

In [6]:
# To be implemented
num_user_rev = rating_df.groupby(by = "user_id").size()
print num_user_rev

user_id
---1lKK3aKOuomHnwAkAow     4
--0sXNBv6IizZXuV-nl0Aw     1
--2bpE5vyR-2hAP7sZZ4lA     1
--2vR0DIsmQ6WfcSzKWigw     2
--3WaS23LcIXtxyFULJHTA     3
--56mD0sm1eOogphi2FFLw     2
--6u02ZqjZRnwtX3t9bZtQ     1
--9yZb1OLNN18HyDXgZrJA     1
--CIuK7sUpaNzalLAlHJKA     1
--DxiDMQgN08E5gTM0aj7Q     1
--EeOyjMPIklMo8fN2GdWQ     1
--FL8jpOVyPYGpyRN007vg     1
--IFH_sbTkfXxbXO4nSEyQ     1
--J3HPoNe-IJ0xE10Z_sDg     1
--KC9gVPd0pTqvlV4AwN6g     1
--LUapetRSkZpFZ2d-MXLQ     8
--MO0Mi5MlB9A-59HYcteA     1
--PJ1FuEoTEo-3Cxf_izRg     2
--QdTWzjRUHa_OkQw-ug8g     1
--RlSfc-QmcHFGHyX6aVjA    10
--VFHSTk6Hd3U6D1jYPXAw     1
--W0Zo_aIlD-7JzfHI4IqQ     1
--XroDUidjD1PcmgahDk2w     1
--YMdHqsfs5ZJpAPtIuyKw     1
--ZNfWKj1VyVElRx6-g1fg    10
--_H9j6ggxvqhh9nPofZwg     1
--df41HLpRbMhMuViA9pFA     1
--fpTdHQOGWGbAjk9SUyeA     1
--i0PK1aTXScdV2UkNDkSQ     1
--kMhfqxhJ7sEDiRCSKO0A     1
                          ..
zzK4WWQhMbnuY77kG-45PA     1
zzKQjelI5GDwrWqDmz_CIQ     1
zzL4E_gdoSCtgNhW4BPY8w     1
zzMJLd

In [7]:
print num_user_rev.min()
print num_user_rev.max()
print num_user_rev.median()

1
581
1.0


In [8]:
num_user_rev.value_counts()

1      104501
2       25901
3       11431
4        5821
5        3542
6        2275
7        1545
8        1028
9         843
10        594
11        460
12        368
13        291
14        225
15        197
16        176
18        157
17        135
19         95
20         95
21         72
22         70
23         69
29         56
25         53
27         53
24         52
28         43
30         34
26         30
        ...  
100         1
109         1
102         1
231         1
104         1
111         1
113         1
132         1
114         1
115         1
119         1
221         1
121         1
91          1
77          1
57          1
63          1
581         1
72          1
160         1
158         1
79          1
215         1
81          1
153         1
83          1
84          1
179         1
148         1
123         1
dtype: int64

In [9]:
# filtering out users with less than 20 reviews
users = num_user_rev.index[(num_user_rev >= 20)]

In [10]:
fil_rating_df = rating_df[(rating_df.user_id.isin(users))]

In [11]:
fil_rating_df.groupby(by = 'user_id').size()

user_id
-2gOxVWcnBr5DclrrsWXCA    33
-4JDJeFS0YAYSiSvIshGLQ    24
-50XWnmQGqBgEI-9ANvLlg    74
-594af_E7Z9VVjQc9pJK3g    65
-5ye1ya0wRQhNRF9NfQ1fA    22
-C-l8EHSLXtZZVfUAUhsPA    20
-EJorVxe7h2GSxdiRyMmDA    52
-IM1qwYck6nqKizjCzR_ww    20
-KeEr7ZLjbOskY6GGE54hQ    38
-Ox7QPz0G56GOzT0ex-f5Q    22
-PAOMIe7lqq1x5GB1pcKMg    26
-SWRw9-1ARVKLAWE0Sxw1Q    38
-YLiMJ0xeWxqny9O9YKzoA    22
-YV1yESQXqR3vpIgBjKDsw    22
-Z0uXJn_uP3U0h-e31sTKw    21
-d1-LUHXVOw2t3bdJHg26Q    23
-gSz76_bKNJsKM2adr12yw    38
-gzqF2ucnXwHf64_k2Hoxg    22
-hYYjAXSAa657rY0ANtTGQ    22
-j-8EX-ebLXybLAdDQOGzw    22
-ki-qIZHP4zZ2D49-b8ZzA    25
-oA7Jp8N-3zKnNGRFb2-Yw    54
-ouzfV5nm0Fmv3JRLXegmQ    54
-uk4wYCSmjWz8vH9XkqroA    25
-xDW3gYiYaoeVASXywTPgw    71
048mif2uzPN800T90sAmyw    22
0BoO48jZw2kjJjwgwIjbLw    39
0FMte0z-repSVWSJ_BaQTg    28
0H-zIyvxf94D11173f63xg    20
0HuYmSeNfDwBJGHmTJa8rg    21
                          ..
yEmeRQb4WH6NN0IAaTX_lw    62
yHbrIShhhHkkl20_KGP4OA    22
yKAIxV3KCxBKK5ezsry_0Q    46
yLCsv3

In [12]:
# that's gonna be the dimention of our rating matrix below
print fil_rating_df.user_id.unique().shape
print fil_rating_df.business_id.unique().shape

(1118L,)
(3514L,)


#### Create utility matrix from records

In [13]:
# To be implemented
ratings_matrix = pd.pivot_table(data = fil_rating_df, index = "user_id", columns = "business_id", values = "stars")

In [14]:
ratings_matrix.head(3)

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,-Bv-HHUs8aHzDrdWcZHn8w,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zsT04OrUZ25WILxbp1S3XQ,zt9RLUIU32fZYOBh2L0NNQ,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-2gOxVWcnBr5DclrrsWXCA,,,,,,,,,,,...,,,,,,,,,,
-4JDJeFS0YAYSiSvIshGLQ,,,,,,,,,,,...,,,,,,,,,,
-50XWnmQGqBgEI-9ANvLlg,,,,,,,,,,,...,,,,,,,,,,


In [15]:
ratings_matrix[ratings_matrix.isnull()] = 0

In [16]:
ratings_matrix.head(2)

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,-Bv-HHUs8aHzDrdWcZHn8w,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zsT04OrUZ25WILxbp1S3XQ,zt9RLUIU32fZYOBh2L0NNQ,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-2gOxVWcnBr5DclrrsWXCA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4JDJeFS0YAYSiSvIshGLQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modification to accommodate the dense numpy array

In [17]:
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
ratings_llmatrix = lil_matrix(ratings_matrix)

In [19]:
# calculate similarity matrix
# and neighborhoods' index
def get_sim_nei(rating_matrix, num_nei):
    
    sim_matrix = cosine_similarity(rating_matrix.T)
    nei_index = sim_matrix.argsort()[:,-num_nei:]
    
    return sim_matrix, nei_index

In [20]:
# predict for one user
def pred(user_id,ratings_matrix, sim_matrix, nei_index):
    '''
    Input: user_id 
           ratings_matrix: users by items
           sim_matrix: item by item
           nei_index: item by neighborhoods
    Output: 
           prediction: 1-D array, non-zero elements represent estimated score for item_to_rate
    '''
    # find index of movies that have been rated by this user, in this case index also is movie id
    rated_item_index = ratings_matrix[user_id].nonzero()[1]
    # initialize a prediction array
    prediction = np.zeros(ratings_matrix.shape[1])
    # make prediction on movies
    for item_to_rate in range(ratings_matrix.shape[1]):
        # only predect items not rated by this user yet
        if item_to_rate not in rated_item_index:
            # find index of intersetion between rated movie by this user and this item's neighborhoods
            relevant_item_index = np.intersect1d(nei_index[item_to_rate],rated_item_index, assume_unique = True)
            # make predictions on this item for this user
            prediction[item_to_rate] = (ratings_matrix[user_id,relevant_item_index] * \
                                        sim_matrix[item_to_rate,relevant_item_index]) / \
                                        sim_matrix[item_to_rate,relevant_item_index].sum()
    return prediction

In [21]:
num_nei = 100
sim_matrix, nei_index = get_sim_nei(ratings_llmatrix, num_nei)

In [22]:
# name a user
user_id = np.random.randint(low = 0, high = ratings_llmatrix.shape[0])

In [23]:
prediction = pred(user_id, ratings_llmatrix, sim_matrix, nei_index)

In [24]:
rec_bus_index = prediction.argsort()[-5:]

In [25]:
user_name = ratings_matrix.index[user_id]
rec_bus_name = [ratings_matrix.columns[i] for i in rec_bus_index]

In [26]:
print "for user ", user_name
print "we recommend ", rec_bus_name

for user  lOiJBKYO4sVF9eAIj4233A
we recommend  ['R6jpwPBe9edy0P64uyaLKA', 'R7LyTeiOHLyTNkA8HssBRQ', 'RBVtuVodydOMkNr-RVTVYA', 'Qj1BoynaIVZiKEe8Peq3_g', 'UVqelOc_xJwy6lfnOhiCLQ']


## 3. Matrix Factorization recommender

Take a look at Graphlab Create examples

In [27]:
import graphlab

In [28]:
sf = graphlab.SFrame(rating_df)

This non-commercial license of GraphLab Create for academic use is assigned to yihaoson@usc.edu and will expire on June 29, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\songy\AppData\Local\Temp\graphlab_server_1499135466.log.0


In [29]:
sf.head(3)

user_id,business_id,stars
0XVzm4kVIAaH4eQAxWbhvw,--9e1ONYQuAa-CB_Rrw7Tw,1
2aeNFntqY2QDZLADNo8iQQ,--9e1ONYQuAa-CB_Rrw7Tw,4
WFhv5pMJRDPWSyLnKiWFXA,--9e1ONYQuAa-CB_Rrw7Tw,2


In [30]:
rec = graphlab.recommender.factorization_recommender.create(observation_data = sf, user_id = 'user_id', 
                                                      item_id = 'business_id', target = 'stars', 
                                                      side_data_factorization = False, solver = 'als')

In [31]:
# make prediction
user_index = np.random.randint(low = 0, high = ratings_llmatrix.shape[0])
bus_index = np.random.randint(low = 0, high = ratings_llmatrix.shape[1])

user_id =  ratings_matrix.index[user_index]
bus_id = ratings_matrix.columns[bus_index]

one_datapoint_sf = graphlab.SFrame({'user_id': [user_id], 'business_id': [bus_id]})

In [32]:
one_datapoint_sf

business_id,user_id
9sbXPpRC_wDhCsp7qG2PDw,nuMPCVF6YFEVgnysW7rZGQ


In [53]:
print "stars:", (5 if rec.predict(one_datapoint_sf)[0] > 5 else np.floor(rec.predict(one_datapoint_sf)[0]))

stars: 2.0


In [71]:
# I think may be we should use ranking factorize recommeder

## 4. Other recommenders (optional)

What are other ways you can build a better recommender?

* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)
* Popularity-based
* Content-based
* Hybrid

# let's make a simplest recommender -- Popularity-based Recommender

In [50]:
# let's choose the best 10 restaurants to recommend for any one
# restaurants that have highest average stars and most reviews are the best
top_res = rating_df.groupby(by = 'business_id').agg(
    {'stars': np.mean, 'user_id': np.size}).sort_values(
    by = ['stars', 'user_id'], ascending = False)

In [52]:
top_res.index[:10]

Index([u'9kPbAt95kECBGGi_e5yV0Q', u'WU0rArtd0GcTt8ndKzeKaw',
       u'prIta4agQiDpUQVY3MliFA', u'BjrKNWhtQkedHw8hP_0Bjg',
       u'-CQokjildrY7UZezXCdEBw', u'dtqT51H8Q8mIvrLylVuiZg',
       u'TRe0u0yJ4eeEaSzDKQmIDA', u'ylwIzIHOjk5ECMoD6Af-cg',
       u'4UvQ0BUbqlNPJbAggsi0eA', u'8hZjiPzJIojA1k7_W4hELA'],
      dtype='object', name=u'business_id')

In [54]:
# let's use graphlab to build a Popularity-based Recommender
# Create a model that makes recommendations using item popularity. 
# When no target column is provided, the popularity is determined by the number of observations involving each item. 
# When a target is provided, popularity is computed using the item’s mean target value. 
# When the target column contains ratings, 
# for example, the model computes the mean rating for each item and uses this to rank items for recommendations.

In [55]:
pop_rec = graphlab.recommender.popularity_recommender.create(sf, user_id = 'user_id', item_id = 'business_id', target = 'stars')

In [63]:
score = np.array(pop_rec.predict(sf[['user_id', 'business_id']]))

In [65]:
index = score.argsort()[::-1][:10]

In [69]:
print "we recommend ", [rating_df.ix[i,'business_id'] for i in index]

we recommend  ['xxaqaD9IVsqxiNF8FoHdYA', '5k8kyvvReTf1RMq4jpayQg', '5k8kyvvReTf1RMq4jpayQg', '5k8kyvvReTf1RMq4jpayQg', '5ic8m9PNixMA__h0G4WH5w', '5fSz08wFcBhCeCXh7sxG5A', '5fSz08wFcBhCeCXh7sxG5A', 'MD5aTUMKi-Qk0-jr3xRgng', 'MVjkFLHNJmbQ-ZTd3QY3nQ', 'MVjkFLHNJmbQ-ZTd3QY3nQ']


In [70]:
# that's quite a difference than ours
# but I beleive we are better.

# let's look at content-based recommender

In [73]:
# import business data
bus_df = pd.read_csv('data/yelp_academic_dataset_business.csv')

In [75]:
bus_df.head(3)

Unnamed: 0,neighborhood,business_id,hours,is_open,address,attributes,categories,city,review_count,name,longitude,state,stars,latitude,postal_code,type
0,,0DI8Dt2PJp07XkVvIElIcQ,"['Monday 11:0-21:0', 'Tuesday 11:0-21:0', 'Wed...",0,"227 E Baseline Rd, Ste J2","['BikeParking: True', 'BusinessAcceptsBitcoin:...","['Tobacco Shops', 'Nightlife', 'Vape Shops', '...",Tempe,17,Innovative Vapors,-111.936102,AZ,4.5,33.378214,85283,business
1,,LTlCaCGZE14GuaUXUGbamg,"['Monday 0:0-0:0', 'Tuesday 0:0-0:0', 'Wednesd...",1,495 S Grand Central Pkwy,"['BusinessAcceptsBitcoin: False', 'BusinessAcc...","['Caterers', 'Grocery', 'Food', 'Event Plannin...",Las Vegas,9,Cut and Taste,-115.159272,NV,5.0,36.192284,89106,business
2,Dufferin Grove,EDqCEAGXVGCH4FJXgqtjqg,"['Monday 11:0-2:0', 'Tuesday 11:0-2:0', 'Wedne...",1,979 Bloor Street W,"['Alcohol: none', ""Ambience: {'romantic': Fals...","['Restaurants', 'Pizza', 'Chicken Wings', 'Ita...",Toronto,7,Pizza Pizza,-79.429089,ON,2.5,43.661054,M6H 1L5,business


In [88]:
bus_sf = graphlab.SFrame(bus_df.dropna())

In [89]:
con_rec = graphlab.recommender.item_content_recommender.create(item_data = bus_sf, item_id = 'business_id'
                                                               , observation_data = sf, user_id = 'user_id', target = 'stars')

('Applying transform:\n', Class             : AutoVectorizer

Model Fields
------------
Features          : ['neighborhood', 'hours', 'is_open', 'address', 'attributes', 'categories', 'city', 'review_count', 'name', 'longitude', 'state', 'stars', 'latitude', 'postal_code', 'type']
Excluded Features : ['business_id']

Column        Type   Interpretation  Transforms                         Output Type
------------  -----  --------------  ---------------------------------  -----------
neighborhood  str    categorical     None                               str        
hours         str    long_text       2-Word NGram Counts -> TFIDF       dict       
is_open       int    categorical     astype(str)                        str        
address       str    short_text      3-Character NGram Counts -> TFIDF  dict       
attributes    str    long_text       2-Word NGram Counts -> TFIDF       dict       
categories    str    short_text      3-Character NGram Counts -> TFIDF  dict       
city     

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [90]:
con_rec.recommend()

user_id,business_id,score,rank
0XVzm4kVIAaH4eQAxWbhvw,YnmtUJGqQIQL9tz1MRyfqA,0.0194874048233,1
0XVzm4kVIAaH4eQAxWbhvw,f9sU31meK0bqAD7922sCog,0.0191907739639,2
0XVzm4kVIAaH4eQAxWbhvw,LFs5jyYdXlzi0SpAYi1eSA,0.0191749799252,3
0XVzm4kVIAaH4eQAxWbhvw,HGitzBs7x_fUvdTtrTacXg,0.0190156114101,4
0XVzm4kVIAaH4eQAxWbhvw,EAwh1OmG6t6p3nRaZOW_AA,0.0180125570297,5
0XVzm4kVIAaH4eQAxWbhvw,0N53m33GANYeHHl-s22d4Q,0.0179874742031,6
0XVzm4kVIAaH4eQAxWbhvw,ZjSzUWHtnpCfjsa7CksSOg,0.0178435564041,7
0XVzm4kVIAaH4eQAxWbhvw,kJl3l4fSa2spv_Mkhgogmg,0.0174511444569,8
0XVzm4kVIAaH4eQAxWbhvw,jlm7UtOXnZ8azTpkqqppJA,0.0169328868389,9
0XVzm4kVIAaH4eQAxWbhvw,uWECX6-Uq9n8v5ipk9R29A,0.0165793585777,10
