In [1]:
##### Variables #####
K_CLUSTERS_RESTAURANT = 20
K_CLUSTERS_USER = 10
REDUCED_D = 300
TERMS_PER_CLUSTER_RESTAURANT = 5

In [2]:
# import libraries
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
import logging
from time import time
import numpy as np
import sqlite3
import pandas as pd

%matplotlib notebook
from matplotlib import pyplot as plt

In [3]:
# connect to database
conn = sqlite3.connect("yelp2.db")

In [4]:
# create a new dataframe, df ('user_id', 'business_id', 'stars', 'text' columns from review table)
df = pd.read_sql_query("select text, user_id, business_id, stars from review;", conn)


In [5]:
# user_id with >= 50 reviews
user_id_valcounts = df['user_id'].value_counts()
user_id_valcounts

bLbSNkLggFnqwNNzzq-Ijw    602
PKEzKWv_FktMm2mGPjwd0Q    485
UYcmGbelzRa0Q6JqzLoguw    410
tH0uKD-vNwMoEc3Xk3Cbdg    329
C2C0GPKvzWWnP57Os9eQ0w    324
N3oNEwh0qgPqPP3Em6wJXw    316
n86B7IkbU20AkxlFX_5aew    313
U4INQZOPSUaj8hMjLlZ3KA    302
8DEyKVyplnOcSKx39vatbg    275
qewG3X2O4X6JKskxyyqFwQ    257
3nDUQBjKyVor5wV0reJChg    257
eZZyuJDouIg4p-GYB3PV_A    228
3nIuSCZk5f_2WWYMLN7h3w    217
TdYKJgSgY2GF_YJnwsi5yQ    216
YMgZqBUAddmFErxLtCfK_w    213
L8P5OWO1Jh4B2HLa1Fnbng    212
iSC96O2NjQc3JExGUHQG0Q    212
JaqcCU3nxReTW2cBLHounA    209
kmE8w5Y785eZmodsx0V6Ag    203
_VMGbmIeK71rQGwOBWt_Kg    201
y3FcL4bLy0eLlkb0SDPnBQ    200
oeAhRa8yFa9jtrhaHnOyxQ    189
B1829_hxXSEpDPEDJtYeIw    189
FIk4lQQu1eTe2EpzQ4xhBA    185
8OeTLey-p-WaL9ErNEci1Q    183
epkRB3J_oBEiyiwahvdmGg    183
ZsUSGU1-L1ImomLZjXxxTg    180
HJj82f-csBI7jjgenwqhvw    179
hZfzVrhsCQ9JDAb2jYoJNQ    179
48vRThjhuhiSQINQ2KV8Sw    178
                         ... 
YR8Jx240OaPMEWw7il01-w      1
4mP5AkBYhZ3teUkSG-Zkfw      1
Ge-DjZHgOd

In [6]:
# create a new dataframe, df2 (only include the data with user_id with >= 50 reviews)
top_users = user_id_valcounts[user_id_valcounts >= 50].index.tolist()
df2 = df[df['user_id'].isin(top_users)]
df2

Unnamed: 0,text,user_id,business_id,stars
5,I love love love Patisserie Manon!\n\nI was ex...,-594af_E7Z9VVjQc9pJK3g,A0X1baHPgw9IiBRivu0G9g,5.0
19,I had the pleasure of tasting a couple of thei...,TdYKJgSgY2GF_YJnwsi5yQ,A0X1baHPgw9IiBRivu0G9g,4.0
22,Meep Meep!!\n\nThat was my first thought when ...,y3FcL4bLy0eLlkb0SDPnBQ,A0X1baHPgw9IiBRivu0G9g,5.0
23,I came here on a saturday afternoon and got al...,rHgT6lBbZ2PaHt0UWfTMtw,A0X1baHPgw9IiBRivu0G9g,5.0
28,Love this place. The cakes and pastries are s...,FIk4lQQu1eTe2EpzQ4xhBA,A0X1baHPgw9IiBRivu0G9g,5.0
31,I stopped by on their last day before their Su...,k4M43lXJuQMpQW65DTqzIQ,A0X1baHPgw9IiBRivu0G9g,4.0
34,I really love their cassoulet and have had it ...,7HOTOydGS7wJYaJqNoDfeg,A0X1baHPgw9IiBRivu0G9g,4.0
38,Their macarons are soo unique compared to othe...,5OllWsrKJsYo3XQK6siRKA,A0X1baHPgw9IiBRivu0G9g,5.0
42,"I loved this place. I had the turkey, ham and ...",_i3IZYU28R3xgV4hxkmKjw,A0X1baHPgw9IiBRivu0G9g,5.0
47,"Cake, macaroons, and more oh my! This place is...",fxrQSMhHlBv6eJVRdchG9Q,A0X1baHPgw9IiBRivu0G9g,5.0


In [7]:
df2.head()

Unnamed: 0,text,user_id,business_id,stars
5,I love love love Patisserie Manon!\n\nI was ex...,-594af_E7Z9VVjQc9pJK3g,A0X1baHPgw9IiBRivu0G9g,5.0
19,I had the pleasure of tasting a couple of thei...,TdYKJgSgY2GF_YJnwsi5yQ,A0X1baHPgw9IiBRivu0G9g,4.0
22,Meep Meep!!\n\nThat was my first thought when ...,y3FcL4bLy0eLlkb0SDPnBQ,A0X1baHPgw9IiBRivu0G9g,5.0
23,I came here on a saturday afternoon and got al...,rHgT6lBbZ2PaHt0UWfTMtw,A0X1baHPgw9IiBRivu0G9g,5.0
28,Love this place. The cakes and pastries are s...,FIk4lQQu1eTe2EpzQ4xhBA,A0X1baHPgw9IiBRivu0G9g,5.0


In [8]:
dict_business= {}
for index, row in df2.iterrows():
    business = str(row['business_id'])
    user = str(row ['user_id'])
    review = str(row['text']).replace('\n','')
    if business in dict_business:
        dict_business[business] += review
    else:
        dict_business[business] = review	
        
#dict_business.items()

rawdata_busi = [ [k2,v2] for k2, v2 in dict_business.items() ]


In [9]:
print('number of unique users: ', len(top_users))
unique_business = list(set(df2['business_id']))
print('number of unique restaurants: ', len(unique_business))

number of unique users:  558
number of unique restaurants:  1620


In [10]:
# we are now left with 49,346 review texts data
len(df2)

49346

In [11]:
from stop_words import get_stop_words
en_stop = get_stop_words('en')

In [12]:
data_b=[]

for entry in rawdata_busi:
    nonstop = []
    for word in entry[1].split():
        if word not in en_stop:
            nonstop.append(word)
    stopresult = ' '.join(nonstop)
    entry[1] = stopresult
    data_b.append(stopresult)

In [13]:
# extract features from review texts
MAX_FEATURES = 100000

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.7, max_features=MAX_FEATURES,
                             min_df=100, stop_words='english',
                             use_idf=True)

# X is a sparse matrix of size 49346 * 4039
X = vectorizer.fit_transform(data_b)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

Extracting features from the training dataset using a sparse vectorizer
done in 4.851238s
n_samples: 1620, n_features: 3186


In [14]:
len(data_b)

1620

In [15]:
X.shape

(1620, 3186)

In [16]:
# reduce dimensionality from 4039 to 200
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(REDUCED_D)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_X = lsa.fit_transform(X)
lsa_X
print("done in %fs" % (time() - t0))
print("(Reduced) n_samples: %d, n_features: %d" % lsa_X.shape)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

Performing dimensionality reduction using LSA
done in 4.426687s
(Reduced) n_samples: 1620, n_features: 300
Explained variance of the SVD step: 78%


In [17]:
# train a k means classifier with K_CLUSTERS_RESTAURANT
from operator import itemgetter
#K_CLUSTERS_RESTAURANT = sorted(accuracy_rc, key=itemgetter(1))[-1][0] # k with best score
km = MiniBatchKMeans(n_clusters=K_CLUSTERS_RESTAURANT, init='k-means++', n_init=5,
                     init_size=100000, batch_size=1000, verbose=True)
km.fit(lsa_X)

Init 1/5 with method: k-means++
Inertia for init 1/5: 898.607737
Init 2/5 with method: k-means++
Inertia for init 2/5: 914.954502
Init 3/5 with method: k-means++
Inertia for init 3/5: 899.928533
Init 4/5 with method: k-means++
Inertia for init 4/5: 909.310215
Init 5/5 with method: k-means++
Inertia for init 5/5: 905.608499
Minibatch iteration 1/200: mean batch inertia: 0.557139, ewa inertia: 0.557139 
Minibatch iteration 2/200: mean batch inertia: 0.556282, ewa inertia: 0.556282 
Minibatch iteration 3/200: mean batch inertia: 0.548330, ewa inertia: 0.548330 
Minibatch iteration 4/200: mean batch inertia: 0.562108, ewa inertia: 0.562108 
Minibatch iteration 5/200: mean batch inertia: 0.552741, ewa inertia: 0.552741 
Minibatch iteration 6/200: mean batch inertia: 0.551367, ewa inertia: 0.551367 
Minibatch iteration 7/200: mean batch inertia: 0.543624, ewa inertia: 0.543624 
Minibatch iteration 8/200: mean batch inertia: 0.543977, ewa inertia: 0.543977 
Minibatch iteration 9/200: mean bat

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=100000, max_iter=100, max_no_improvement=10,
        n_clusters=20, n_init=5, random_state=None,
        reassignment_ratio=0.01, tol=0.0, verbose=True)

In [18]:
# result
print("Top %d terms per cluster:" %TERMS_PER_CLUSTER_RESTAURANT)
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(K_CLUSTERS_RESTAURANT):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :TERMS_PER_CLUSTER_RESTAURANT]:
        print(' %s' % terms[ind], end='')
    print()

Top 5 terms per cluster:
Cluster 0: thai pad curry rice spicy
Cluster 1: italian pasta bread wine dish
Cluster 2: sushi roll rolls ayce nigiri
Cluster 3: pizza crust pizzas pepperoni toppings
Cluster 4: buffet buffets station legs crab
Cluster 5: mexican salsa burrito chips beans
Cluster 6: wings wing boneless buffalo fries
Cluster 7: chinese rice noodles soup boba
Cluster 8: burger fries burgers bacon shake
Cluster 9: breakfast eggs pancakes toast hash
Cluster 10: beer bar beers pub hour
Cluster 11: tacos taco asada salsa carne
Cluster 12: pita hummus greek mediterranean lamb
Cluster 13: sandwich coffee sandwiches bread deli
Cluster 14: fries casino dog burger bar
Cluster 15: steak steakhouse filet lobster steaks
Cluster 16: pho vietnamese broth rolls pork
Cluster 17: bbq korean ribs brisket pork
Cluster 18: shrimp rice ramen crab pasta
Cluster 19: smoothie healthy juice wrap protein


In [19]:
from collections import Counter
clusters = km.labels_.tolist()
print(Counter(clusters))

Counter({18: 177, 1: 159, 14: 133, 2: 113, 3: 108, 9: 103, 7: 90, 15: 84, 8: 78, 10: 77, 5: 72, 13: 68, 17: 66, 4: 61, 0: 57, 12: 48, 11: 45, 16: 30, 19: 30, 6: 21})


In [20]:
# add a column 'cluster' to the existing dataframe 
clusters = km.labels_.tolist()
print(len(clusters))
df2.head()

1620


Unnamed: 0,text,user_id,business_id,stars
5,I love love love Patisserie Manon!\n\nI was ex...,-594af_E7Z9VVjQc9pJK3g,A0X1baHPgw9IiBRivu0G9g,5.0
19,I had the pleasure of tasting a couple of thei...,TdYKJgSgY2GF_YJnwsi5yQ,A0X1baHPgw9IiBRivu0G9g,4.0
22,Meep Meep!!\n\nThat was my first thought when ...,y3FcL4bLy0eLlkb0SDPnBQ,A0X1baHPgw9IiBRivu0G9g,5.0
23,I came here on a saturday afternoon and got al...,rHgT6lBbZ2PaHt0UWfTMtw,A0X1baHPgw9IiBRivu0G9g,5.0
28,Love this place. The cakes and pastries are s...,FIk4lQQu1eTe2EpzQ4xhBA,A0X1baHPgw9IiBRivu0G9g,5.0


In [21]:
df2.insert(len(df2.columns),'cluster',0)
df2.head(1000)

Unnamed: 0,text,user_id,business_id,stars,cluster
5,I love love love Patisserie Manon!\n\nI was ex...,-594af_E7Z9VVjQc9pJK3g,A0X1baHPgw9IiBRivu0G9g,5.0,0
19,I had the pleasure of tasting a couple of thei...,TdYKJgSgY2GF_YJnwsi5yQ,A0X1baHPgw9IiBRivu0G9g,4.0,0
22,Meep Meep!!\n\nThat was my first thought when ...,y3FcL4bLy0eLlkb0SDPnBQ,A0X1baHPgw9IiBRivu0G9g,5.0,0
23,I came here on a saturday afternoon and got al...,rHgT6lBbZ2PaHt0UWfTMtw,A0X1baHPgw9IiBRivu0G9g,5.0,0
28,Love this place. The cakes and pastries are s...,FIk4lQQu1eTe2EpzQ4xhBA,A0X1baHPgw9IiBRivu0G9g,5.0,0
31,I stopped by on their last day before their Su...,k4M43lXJuQMpQW65DTqzIQ,A0X1baHPgw9IiBRivu0G9g,4.0,0
34,I really love their cassoulet and have had it ...,7HOTOydGS7wJYaJqNoDfeg,A0X1baHPgw9IiBRivu0G9g,4.0,0
38,Their macarons are soo unique compared to othe...,5OllWsrKJsYo3XQK6siRKA,A0X1baHPgw9IiBRivu0G9g,5.0,0
42,"I loved this place. I had the turkey, ham and ...",_i3IZYU28R3xgV4hxkmKjw,A0X1baHPgw9IiBRivu0G9g,5.0,0
47,"Cake, macaroons, and more oh my! This place is...",fxrQSMhHlBv6eJVRdchG9Q,A0X1baHPgw9IiBRivu0G9g,5.0,0


In [22]:
for index, row  in df2.iterrows():
    bid = str(row['business_id'])
    ind = -1
    for i in range(0, len(rawdata_busi)):
        if(bid == str(rawdata_busi[i][0])):
            ind = i
    df2.set_value(index, 'cluster', clusters[ind])


In [23]:
df2.head(1000)

Unnamed: 0,text,user_id,business_id,stars,cluster
5,I love love love Patisserie Manon!\n\nI was ex...,-594af_E7Z9VVjQc9pJK3g,A0X1baHPgw9IiBRivu0G9g,5.0,13
19,I had the pleasure of tasting a couple of thei...,TdYKJgSgY2GF_YJnwsi5yQ,A0X1baHPgw9IiBRivu0G9g,4.0,13
22,Meep Meep!!\n\nThat was my first thought when ...,y3FcL4bLy0eLlkb0SDPnBQ,A0X1baHPgw9IiBRivu0G9g,5.0,13
23,I came here on a saturday afternoon and got al...,rHgT6lBbZ2PaHt0UWfTMtw,A0X1baHPgw9IiBRivu0G9g,5.0,13
28,Love this place. The cakes and pastries are s...,FIk4lQQu1eTe2EpzQ4xhBA,A0X1baHPgw9IiBRivu0G9g,5.0,13
31,I stopped by on their last day before their Su...,k4M43lXJuQMpQW65DTqzIQ,A0X1baHPgw9IiBRivu0G9g,4.0,13
34,I really love their cassoulet and have had it ...,7HOTOydGS7wJYaJqNoDfeg,A0X1baHPgw9IiBRivu0G9g,4.0,13
38,Their macarons are soo unique compared to othe...,5OllWsrKJsYo3XQK6siRKA,A0X1baHPgw9IiBRivu0G9g,5.0,13
42,"I loved this place. I had the turkey, ham and ...",_i3IZYU28R3xgV4hxkmKjw,A0X1baHPgw9IiBRivu0G9g,5.0,13
47,"Cake, macaroons, and more oh my! This place is...",fxrQSMhHlBv6eJVRdchG9Q,A0X1baHPgw9IiBRivu0G9g,5.0,13


In [24]:
# size of each cluster
df2['cluster'].value_counts()

18    5325
1     5035
2     3605
9     3415
14    3050
15    2796
3     2792
10    2738
8     2697
17    2591
13    2475
7     2396
0     1876
5     1846
4     1747
11    1424
12    1192
16    1049
19     791
6      506
Name: cluster, dtype: int64

In [25]:
business_cluster_mostcount = df2['cluster'].groupby(df2['business_id']).agg(lambda x:x.value_counts().index[0])
len(business_cluster_mostcount)

1620

In [26]:
label = business_cluster_mostcount.tolist()
len(label)

1620

In [27]:
business_cluster = {'business_id': unique_business, 'cluster': label}
df_business_cluster = pd.DataFrame(business_cluster, columns = {'business_id', 'cluster'})
df_business_cluster.head(10)

Unnamed: 0,cluster,business_id
0,15,a192hdM0_UVCYLwPJv1Qwg
1,1,dVhGY-mNwTWQzK01Zxuclw
2,4,9_OPjioLbD9Q_zAeHfv6kw
3,10,DLJCwHGtbAV31UQ1lnIjlw
4,7,xiAiahsdBfTxYCFfvZmMfQ
5,10,DbEszO3wk1xVmN3pCPob2g
6,7,dQ4pxV6AxBaD3X4gssg5zg
7,16,tJ9-u9MfpVbX4X2miIJ71w
8,14,x71EvWJDVKZfM-OiYDWZ6A
9,15,ZCQa7CJxZ-53Zxd_pobWug


In [28]:
df_business_cluster['cluster'].value_counts()

18    177
1     159
14    133
2     113
3     108
9     103
7      90
15     84
8      78
10     77
5      72
13     68
17     66
4      61
0      57
12     48
11     45
16     30
19     30
6      21
Name: cluster, dtype: int64

In [29]:
##### BASELINE for RC and UC #####
rc_score = []
uc_score = []


from pandas import merge
for i in range(1):
    from random import randint
    restaurant_cluster_random = []
    user_cluster_random = []
    for i in range(len(unique_business)):
        restaurant_cluster_random.append(randint(0,K_CLUSTERS_RESTAURANT))
    
    #for i in range(len(top_users)):
    #    user_cluster_random.append(randint(0,K_CLUSTERS_USER))

    temp_business = df_business_cluster
    #temp_user = df_user_cluster
    temp_business['restaurant_cluster_random'] = restaurant_cluster_random
    #temp_user['user_cluster_random'] = user_cluster_random
    final = merge(df2, temp_business[['business_id', 'restaurant_cluster_random']],on='business_id')
    #final = merge(df2, temp_user[['user_id', 'user_cluster_random']],on='user_id')

    #Restuarant Cluster Evaluation Metric (baseline)
    mse_restaurant = [0] * K_CLUSTERS_RESTAURANT;

    for restaurant in unique_business:
        for cluster in range(K_CLUSTERS_RESTAURANT):
            ratings = final[final['business_id']==restaurant][final['restaurant_cluster_random']==cluster]['stars']
            if len(ratings) > 1:
                mean = sum(ratings) / len(ratings)
                mse = sum((ratings - mean) ** 2)
                mse_restaurant[cluster] += mse

    # User Cluster Evaluation Metric (baseline)
    #mse_user = [0] * K_CLUSTERS_USER;

    #for restaurant in unique_business:
     #   for cluster in range(K_CLUSTERS_USER):
      #      ratings = final[final['business_id']==restaurant][final['user_cluster_random']==cluster]['stars']
       #     if len(ratings) > 1:
        #        mean = sum(ratings) / len(ratings)
         #       mse = sum((ratings - mean) ** 2)
          #      mse_user[cluster] += mse

    rc_score.append(sum(mse_restaurant))
    #uc_score.append(sum(mse_user))
    
baseline_rc = sum(rc_score)/len(rc_score)
#baseline_uc = sum(uc_score)/len(uc_score)
print(baseline_rc)
#print(baseline_uc)



41585.3222055


In [30]:
##### Current Score for RC and UC #####
mse_restaurant = [0] * K_CLUSTERS_RESTAURANT;

for restaurant in unique_business:
    for cluster in range(K_CLUSTERS_RESTAURANT):
        ratings = df2[df2['business_id']==restaurant][df2['cluster']==cluster]['stars']
        if len(ratings) > 1:
            mean = sum(ratings) / len(ratings)
            mse = sum((ratings - mean) ** 2)
            mse_restaurant[cluster] += mse


#mse_user = [0] * K_CLUSTERS_USER;

#for restaurant in unique_business:
 #   for cluster in range(K_CLUSTERS_USER):
  #      ratings = df2[df2['business_id']==restaurant][df2['cluster']==cluster]['stars']
   #     
    #    if len(ratings) > 1:
     #       mean = sum(ratings) / len(ratings)
      #      mse = sum((ratings - mean) ** 2)
       #     mse_user[cluster] += mse

            
current_rc = sum(mse_restaurant)
#current_uc = sum(mse_user)
print(current_rc)
#print(current_uc)

  


43934.6818751
