The purpose of this notebook is to prepare the Yelp dataset for a classification task and for a demonstration of text analytics.

In [34]:
# import libraries
import ast
import os

import glob
import numpy as np
import pandas as pd

In [2]:
# The raw Yelp CSV exports are stored in this directory. The path is built with
# os.path.join so no hand-written backslash appears in a string literal (the
# old literal only worked because "\Y" happens not to be an escape sequence).
yelp_dir = os.path.join('data', 'Yelp')
# glob.glob() returns matches in arbitrary order; sorting makes the list (and
# any positional indexing into it) deterministic across machines and runs.
filenames = sorted(glob.glob(os.path.join(yelp_dir, 'yelp_academic_dataset*.csv')))
print(filenames)

['data\\Yelp\\yelp_academic_dataset_business.csv', 'data\\Yelp\\yelp_academic_dataset_review.csv']


In [3]:
# Load only the business dataset and the review dataset as DataFrames.
# Each file is selected by its name rather than by its position in `filenames`,
# so a different glob order or an additional yelp_academic_dataset_*.csv file
# in the directory cannot silently swap or replace the two frames.
business_path = [f for f in filenames if f.endswith('_business.csv')][0]
review_path = [f for f in filenames if f.endswith('_review.csv')][0]

# low_memory=False makes pandas read each file in one pass before inferring
# column dtypes, instead of guessing chunk by chunk.
# NOTE(review): the recorded output of this cell shows the tail of a warning,
# presumably pandas' DtypeWarning about mixed-type columns -- confirm.
business = pd.read_csv(business_path, low_memory=False)
reviews = pd.read_csv(review_path, low_memory=False)

# Preview both DataFrames
print(business.head())
print(reviews.head())

  interactivity=interactivity, compiler=compiler, result=result)


  attributes.Ambience.divey attributes.Dietary Restrictions.vegan  \
0                     False                                   NaN   
1                       NaN                                   NaN   
2                       NaN                                   NaN   
3                      True                                   NaN   
4                       NaN                                   NaN   

  attributes.Happy Hour hours.Thursday.open attributes.Order at Counter  \
0                   NaN               11:00                         NaN   
1                  True                 NaN                         NaN   
2                   NaN                 NaN                         NaN   
3                 False               10:00                         NaN   
4                   NaN               11:00                         NaN   

  attributes.Hair Types Specialized In.africanamerican  \
0                                                NaN     
1                 

Merge the review data with the business data on 'business_id' using a left (outer) join, so that every review is kept.

In [4]:
# Left join on the shared key: each review row gains the columns of its business.
reviews_merge = reviews.merge(business, on=['business_id'], how='left')

# Display every column name of the merged frame.
reviews_merge.columns.values

array(['user_id', 'review_id', 'text', 'votes.cool', 'business_id',
       'votes.funny', 'stars_x', 'date', 'type_x', 'votes.useful',
       'attributes.Ambience.divey',
       'attributes.Dietary Restrictions.vegan', 'attributes.Happy Hour',
       'hours.Thursday.open', 'attributes.Order at Counter',
       'attributes.Hair Types Specialized In.africanamerican',
       'attributes.Hair Types Specialized In.kids', 'attributes.BYOB',
       'hours.Friday.open', 'attributes.Good For.latenight',
       'attributes.Outdoor Seating', 'attributes.Alcohol',
       'attributes.Ambience.classy', 'attributes.By Appointment Only',
       'attributes.Parking.lot', 'attributes.Ambience.touristy',
       'attributes.Corkage', 'hours.Tuesday.open',
       'attributes.Good For.brunch', 'categories',
       'attributes.Waiter Service', 'hours.Monday.open', 'name',
       'attributes.Parking.street', 'attributes.Ambience.hipster',
       'attributes.BYOB/Corkage',
       'attributes.Hair Types Special

In [5]:
# The only columns kept from the merged frame: the business categories,
# the review text and the star rating ('stars_x' in the merged frame).
colnames = [
    'categories',
    'text',
    'stars_x',
]

output_df = reviews_merge[colnames]
print(output_df.head())

                     categories  \
0  ['Fast Food', 'Restaurants']   
1  ['Fast Food', 'Restaurants']   
2  ['Fast Food', 'Restaurants']   
3  ['Fast Food', 'Restaurants']   
4  ['Fast Food', 'Restaurants']   

                                                text  stars_x  
0  Mr Hoagie is an institution. Walking in, it do...        4  
1  Excellent food. Superb customer service. I mis...        5  
2  Yes this place is a little out dated and not o...        5  
3  PROS: Italian hoagie was delicious.  Friendly ...        3  
4  First the only reason this place could possibl...        2  


In [6]:
# Rename the 'stars_x' column to 'stars'.
output_df = output_df.rename(columns={'stars_x': 'stars'})

# Show the (rows, columns) shape, i.e. how many reviews there are.
print(output_df.shape)

(2685066, 3)
                     categories  \
0  ['Fast Food', 'Restaurants']   
1  ['Fast Food', 'Restaurants']   
2  ['Fast Food', 'Restaurants']   
3  ['Fast Food', 'Restaurants']   
4  ['Fast Food', 'Restaurants']   

                                                text  stars  
0  Mr Hoagie is an institution. Walking in, it do...      4  
1  Excellent food. Superb customer service. I mis...      5  
2  Yes this place is a little out dated and not o...      5  
3  PROS: Italian hoagie was delicious.  Friendly ...      3  
4  First the only reason this place could possibl...      2  


The `categories` value of each business is actually a string that has a _list_ structure.

In [7]:
# Count the distinct raw values of the 'categories' column in the dataset.
n_unique_categories = len(output_df['categories'].unique())
print('{0} {1}'.format(n_unique_categories, 'unique categories'))

11001 unique categories


Because some reviews can be labeled with multiple categories, review categorization is hard. Hence, in the next code cells, I find the number of categories each review belongs to, and then filter the data to contain only reviews with exactly 1 business category.

In [8]:
# convert the categories from string to a list and find the number of categories associated with that review.
def len_in_pseudo_list(string):
    """Return the number of items in the list literal stored in `string`.

    `string` is expected to be the text form of a Python list, for example
    "['Fast Food', 'Restaurants']". It is parsed with ast.literal_eval, which
    only evaluates literals and never executes arbitrary code.

    Values that are not text (e.g. None or a missing value such as NaN) and
    blank strings are treated as "no categories" and yield 0 instead of
    raising.

    Raises:
        ValueError, SyntaxError: if a non-blank string is not a valid literal.
        TypeError: if the literal has no length (e.g. a bare number).
    """
    # (str, type(u'')) covers str and unicode on Python 2 and str on Python 3.
    if not isinstance(string, (str, type(u''))) or not string.strip():
        return 0
    return len(ast.literal_eval(string))

# Store, for every review, how many categories its business carries
# in a new 'num_cat' column.
category_counts = output_df['categories'].apply(len_in_pseudo_list)
output_df['num_cat'] = category_counts
output_df.head()

Unnamed: 0,categories,text,stars,num_cat
0,"['Fast Food', 'Restaurants']","Mr Hoagie is an institution. Walking in, it do...",4,2
1,"['Fast Food', 'Restaurants']",Excellent food. Superb customer service. I mis...,5,2
2,"['Fast Food', 'Restaurants']",Yes this place is a little out dated and not o...,5,2
3,"['Fast Food', 'Restaurants']",PROS: Italian hoagie was delicious. Friendly ...,3,2
4,"['Fast Food', 'Restaurants']",First the only reason this place could possibl...,2,2


In [22]:
# Keep only the reviews whose business is associated with exactly one category.
# .copy() yields an independent frame: a later cell overwrites the
# 'categories' column of `only_1_cat`, and doing that on a bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
only_1_cat = output_df[output_df['num_cat'] == 1].copy()

# Size of the resulting frame as (rows, columns).
print(only_1_cat.shape)

# Number of distinct categories remaining after the filter.
print(len(only_1_cat['categories'].unique()))

(7704, 4)
21


The number of unique categories has been reduced to 21.

How many reviews are there in each category?

In [23]:
# Group the single-category reviews by category and count the rows per group.
cat_grp = only_1_cat.groupby(['categories'])
num_reviews_per_cat = cat_grp.size()
# Show the counts ordered from the most-reviewed category to the least.
print(num_reviews_per_cat.sort_values(ascending=False))

categories
['Restaurants']                     1919
['Local Flavor']                    1398
['Arts & Entertainment']             670
['Public Services & Government']     666
['Nightlife']                        526
['Shopping']                         510
['Food']                             336
['Active Life']                      299
['Local Services']                   271
['Professional Services']            213
['Health & Medical']                 139
['Beauty & Spas']                    137
['Hotels & Travel']                  136
['Event Planning & Services']         99
['Automotive']                        91
['Religious Organizations']           78
['Home Services']                     76
['Pets']                              65
['Financial Services']                43
['Education']                         17
['Mass Media']                        15
dtype: int64


In [24]:
# Extract the categories of the business with 1 category.
def cat_string(string):
    """Return the first category named by `string`.

    `string` is normally the text form of a Python list, e.g. "['Nightlife']",
    in which case its first element ('Nightlife') is returned.

    If `string` does not parse as a list or tuple literal -- typically because
    it already is a bare category name such as 'Nightlife' -- it is returned
    unchanged. This makes the function idempotent, so the cell that applies it
    to a column can be re-run without raising.

    Raises:
        IndexError: if `string` is an empty list literal ("[]").
    """
    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        # Not a Python literal at all: treat it as an already-extracted name.
        return string
    if isinstance(parsed, (list, tuple)):
        return parsed[0]
    # A literal that is not a sequence (e.g. 'None', '123'): leave it as is.
    return string
# Replace each stringified one-item list with the bare category name.
# Work on an explicit copy and assign with ['categories'] = ...: setting an
# attribute on a boolean-mask selection is a chained assignment, which pandas
# flags with SettingWithCopyWarning.
only_1_cat = only_1_cat.copy()
only_1_cat['categories'] = only_1_cat['categories'].apply(cat_string)
print(only_1_cat.head())

# Which categories have more than 600 reviews?
categories_by_ct = num_reviews_per_cat.sort_values(ascending=False)
# The index labels of the counts are still list literals stored as strings.
wanted_cat = categories_by_ct[categories_by_ct.values > 600].index
# Extract the category name from each label. list(...) materialises the
# result: on Python 3 a bare map() is a one-shot iterator, which repeated
# `x in wanted_cat` membership tests would exhaust after the first row.
wanted_cat = list(map(cat_string, wanted_cat))
print(wanted_cat)

   categories                                               text  stars  \
7   Nightlife  All the food is great here. But the best thing...      5   
8   Nightlife  We checked this place out this past Monday for...      3   
9   Nightlife  Wing sauce is like water. Pretty much a lot of...      1   
10  Nightlife  Cold cheap beer. Good bar food. Good service. ...      4   
11  Nightlife  Possibly the most overhyped establishment in A...      2   

    num_cat  
7         1  
8         1  
9         1  
10        1  
11        1  
['Restaurants', 'Local Flavor', 'Arts & Entertainment', 'Public Services & Government']


In [29]:
# Keep only the reviews whose category is one of the wanted categories
# (those with > 600 reviews).
# Series.isin() is the vectorised membership test; it replaces a Python-level
# lambda called once per row. list() guards against `wanted_cat` being a
# one-shot iterator (what map() returns on Python 3).
is_wanted = only_1_cat['categories'].isin(list(wanted_cat))
filtered_df = only_1_cat[is_wanted]
# Remove the helper 'num_cat' column; it is not part of the exported data.
filtered_df = filtered_df.drop('num_cat', axis=1)
# Output file, written next to the raw data.
output_filename = os.path.join(yelp_dir, 'review_with_single_cat.csv')
# Write the frame as CSV without the index column.
filtered_df.to_csv(output_filename, index=False)