In [1]:
import json
import pandas as pd
import itertools
import csv
import pickle
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import PCA



In [2]:
file_path = '/Users/xzhou/github/project_files/yelp/yelp_{}.csv'

business_file = file_path.format('business')
review_file = file_path.format('review')


In [3]:
df_business = pd.read_csv(business_file)

In [4]:
df_business.head()

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...


In [5]:
df_business.shape

(174567, 13)

In [6]:
df_business.groupby('categories')  \
           .business_id.count()  \
           .reset_index()  \
           .sort_values('business_id',ascending=False)

Unnamed: 0,categories,business_id
61899,Restaurants;Pizza,990
54518,Pizza;Restaurants,987
28347,Food;Coffee & Tea,978
48634,Nail Salons;Beauty & Spas,936
16697,Coffee & Tea;Food,929
10115,Beauty & Spas;Nail Salons,909
47384,Mexican;Restaurants,906
61417,Restaurants;Mexican,864
59299,Restaurants;Chinese,857
9645,Beauty & Spas;Hair Salons,839


In [7]:
df_business.groupby('city')  \
           .business_id.count()  \
           .reset_index()  \
           .sort_values('business_id',ascending=False)

Unnamed: 0,city,business_id
424,Las Vegas,26775
713,Phoenix,17213
983,Toronto,17206
143,Charlotte,8553
871,Scottsdale,8228
733,Pittsburgh,6355
532,Mesa,5760
573,Montréal,5709
340,Henderson,4465
969,Tempe,4263


In [8]:
reviews = pd.read_csv(review_file, iterator=True, chunksize=1000000)

In [9]:
df_review = pd.concat(reviews, ignore_index=True)

In [10]:
df_review.shape

(5261668, 9)

In [11]:
df_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [12]:
left = df_review
right = df_business
df_review_business = pd.merge(left, right, how='left', on='business_id')

In [13]:
df_review_business = df_review_business.rename(index=str, 
                                               columns={"stars_x": "indi_customer_rating",
                                                        "stars_y": "avg_rating",
                                                        "text": "review"})


In [14]:
df_review_business.shape

(5261668, 21)

In [15]:
df_review_business.columns

Index(['review_id', 'user_id', 'business_id', 'indi_customer_review', 'date',
       'text', 'useful', 'funny', 'cool', 'name', 'neighborhood', 'address',
       'city', 'state', 'postal_code', 'latitude', 'longitude', 'avg_review',
       'review_count', 'is_open', 'categories'],
      dtype='object')

In [16]:
# Drop the columns that will no longer be used

columns_to_drop = ['review_id', 'user_id', 'business_id', 'neighborhood', 'address', 'useful', 'funny', 'cool']
df_review_business = df_review_business.drop(columns_to_drop, axis=1)

In [17]:
# Focus on Restaurant Business

df_review_business = df_review_business[df_review_business['categories'].str
                                        .contains('Restaurants')]

In [18]:
# Focus on Las Vegas restaurant

df_review_business = df_review_business[df_review_business['city'] == 'Las Vegas']

In [19]:
# Focus on top 1000 categories

n = 1000
top_n = df_review_business.categories.value_counts().index[:n]
print(top_n)

df_review_business = df_review_business[df_review_business['categories'].isin(top_n)]
df_review_business.shape

Index(['Mexican;Restaurants', 'Pizza;Restaurants', 'Buffets;Restaurants',
       'Restaurants;Mexican',
       'French;Steakhouses;Restaurants;Breakfast & Brunch',
       'Sandwiches;Buffets;Restaurants;Breakfast & Brunch;Food',
       'Restaurants;Italian', 'Italian;Restaurants', 'Restaurants;Steakhouses',
       'Buffets;Breakfast & Brunch;Restaurants',
       ...
       'Fast Food;Restaurants;American (Traditional)',
       'American (New);Restaurants;Pubs;Nightlife;Sports Bars;Bars;Food;Pizza;Breweries',
       'Pizza;Specialty Food;Desserts;Restaurants;Food;Pasta Shops;Sandwiches',
       'African;Food;Specialty Food;American (New);Restaurants',
       'Breakfast & Brunch;Restaurants;American (Traditional);Seafood',
       'Restaurants;Chinese;Bubble Tea;Food',
       'American (Traditional);Tapas/Small Plates;Nightlife;Desserts;Wine Bars;Cocktail Bars;Bars;Food;Coffee & Tea;American (New);Tapas Bars;Restaurants',
       'Tex-Mex;Nightlife;Mexican;Caterers;Event Planning & Service

(767545, 13)

In [20]:
# Focus on business that is open and have meanintful number of reviews

df_review_business = df_review_business[df_review_business['is_open']==1]
df_review_business = df_review_business[df_review_business['review_count']>=5]

In [21]:
df_review_business.shape

(677003, 13)

In [22]:
df_review_business.groupby('categories')  \
                  .indi_customer_review.count()  \
                  .reset_index()  \
                  .sort_values('indi_customer_review',ascending=False)  \

Unnamed: 0,categories,indi_customer_review
526,Pizza;Restaurants,10334
457,Mexican;Restaurants,9637
188,Buffets;Restaurants,8075
361,French;Steakhouses;Restaurants;Breakfast & Brunch,7362
757,Sandwiches;Buffets;Restaurants;Breakfast & Bru...,7006
673,Restaurants;Mexican,6515
186,Buffets;Breakfast & Brunch;Restaurants,6259
398,Italian;Restaurants,6157
715,Restaurants;Steakhouses,5739
24,American (New);Restaurants;Breakfast & Brunch,5445


In [23]:
df_review_business.isnull().sum()

indi_customer_review    0
date                    0
text                    0
name                    0
city                    0
state                   0
postal_code             5
latitude                0
longitude               0
avg_review              0
review_count            0
is_open                 0
categories              0
dtype: int64

In [26]:
df_review_business.dropna(subset=['postal_code'], axis=0, inplace=True)

In [27]:
df_review_business.isnull().sum()

indi_customer_review    0
date                    0
text                    0
name                    0
city                    0
state                   0
postal_code             0
latitude                0
longitude               0
avg_review              0
review_count            0
is_open                 0
categories              0
dtype: int64

In [29]:
pkl_path = '/Users/xzhou/github/project_files/yelp/yelp_restaurant_reviews_clean1.pkl'

with open(pkl_path, 'wb') as pklfile:
    df_review_business.to_pickle(pklfile)

In [30]:
df_review_business['review_words'] = df_review_business['text'].apply(nltk.word_tokenize)

In [31]:
df_review_business['word_count'] = df_review_business['review_words'].apply(len)

In [32]:
df_review_business.head()

Unnamed: 0,indi_customer_review,date,text,name,city,state,postal_code,latitude,longitude,avg_review,review_count,is_open,categories,review_words,word_count
123,5,2012-08-23,We got recommendations for this place from my ...,"""Eiffel Tower""",Las Vegas,NV,89109,36.112477,-115.172189,4.0,1549,1,Steakhouses;Nightlife;Bars;French;Restaurants;...,"[We, got, recommendations, for, this, place, f...",217
181,3,2013-02-06,I've eaten here numerous times and am still am...,"""Roberto's Taco Shop""",Las Vegas,NV,89117,36.158426,-115.292048,3.0,58,1,Fast Food;Restaurants;Mexican,"[I, 've, eaten, here, numerous, times, and, am...",89
187,5,2014-04-27,I was looking for a nice place to take the fam...,"""Carve""",Las Vegas,NV,89122,36.107901,-115.056372,4.0,92,1,Steakhouses;Restaurants,"[I, was, looking, for, a, nice, place, to, tak...",231
194,2,2013-04-01,I live very near this place and have been curi...,"""Bachi Burger""",Las Vegas,NV,89117,36.14691,-115.300777,4.0,1068,1,Asian Fusion;Burgers;American (New);Bars;Wine ...,"[I, live, very, near, this, place, and, have, ...",248
195,2,2013-01-23,I live very close to this place and when I nee...,"""Jack In the Box""",Las Vegas,NV,89117,36.146731,-115.298293,2.5,35,1,Burgers;Fast Food;Restaurants,"[I, live, very, close, to, this, place, and, w...",111


In [36]:
# Drill down to comments that have meaningful length

df_review_business = df_review_business [df_review_business['word_count']>=10]

In [37]:
df_review_business.shape

(676130, 15)

In [12]:
# Noted duplicates in reviews and need to be dropped

df_review_business.review.value_counts()

The best burger I have ever had.  I have been to In-N-Out burgers all over the place.  They are all excellent.  The burgers are so well seasoned, the burger is fresh and the fries top it off perfectly.  I think they don't need salt added, but some people think it helps.  They are much less expensive than my second favorite place and so much better.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [13]:
df_review_business = df.drop_duplicates(['review'], keep='first')

In [14]:
df_review_business.shape

(676003, 15)

In [15]:
pkl_path = '/Users/xzhou/github/project_files/yelp/yelp_lv_restaurant_reviews.pkl'

try:
    with open(pkl_path, 'rb') as pkl_file:
        df_review_business = pd.read_pickle(pkl_file)
except:
    with open(pkl_path, 'wb') as pkl_file:
        df_review_business.to_pickle(pkl_file)