In [1]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import tqdm

## Import Yelp Data

In [2]:
# Selects which review file the loading cell reads: True -> 'review_short.json',
# False -> 'review.json'.
local = True

In [3]:
# Shared filename prefix for the Yelp academic dataset JSON files.
data_path = "../yelp_dataset/yelp_academic_dataset_"

business_fname = data_path + 'business.json'
# The `local` flag switches between the short review file and the full one.
review_fname = data_path + ('review_short.json' if local else 'review.json')

# Both files are line-delimited JSON (one record per line), hence lines=True.
biz_df = pd.read_json(business_fname, lines=True)
print("business json attributes: ", biz_df.columns.tolist())

rev_df = pd.read_json(review_fname, lines=True)
print("review json attributes: ", rev_df.columns.tolist())

business json attributes:  ['address', 'attributes', 'business_id', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'postal_code', 'review_count', 'stars', 'state']
review json attributes:  ['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text', 'useful', 'user_id']


## Find Number of Empty Entries

In [4]:
# Per-column count of missing values for each loaded frame
# (isna() is the same check as isnull(); sum() defaults to column-wise).
print("business empty entries: ", biz_df.isna().sum())
print("review empty entries: ", rev_df.isna().sum())

# The checkin / photo / tip / user frames are not loaded in the cells above,
# so the equivalent checks remain disabled:
# print("checkin empty entries: ", ci_df.isna().sum(axis=0))
# print("photo empty entries: ", ph_df.isna().sum(axis=0))
# print("tip empty entries: ", tip_df.isna().sum(axis=0))
# print("user empty entries: ", use_df.isna().sum(axis=0))

business empty entries:  address             0
attributes      29045
business_id         0
categories        524
city                0
hours           44843
is_open             0
latitude            0
longitude           0
name                0
postal_code         0
review_count        0
stars               0
state               0
dtype: int64
review empty entries:  business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64


## Sizes

In [5]:
# (rows, columns) of each loaded frame.
biz_shape, rev_shape = biz_df.shape, rev_df.shape
print("business size: ", biz_shape)
print("review size: ", rev_shape)

# The checkin / photo / tip / user frames are not loaded in the cells above,
# so their size reports remain disabled:
# print("checkin size: ", ci_df.shape)
# print("photo size: ", ph_df.shape)
# print("tip size: ", tip_df.shape)
# print("user size: ", use_df.shape)

business size:  (209393, 14)
review size:  (100, 9)


## Join Reviews with Businesses

In [6]:
# Attach each review's business record by matching on 'business_id'
# (DataFrame.join defaults to a left join, so every review row is kept).
# Column names present in both frames (e.g. 'stars') are disambiguated with the
# '_review' / '_biz' suffixes.
review_business_join_df = rev_df.join(
    biz_df.set_index('business_id'),
    on='business_id',
    lsuffix='_review',
    rsuffix="_biz",
)

# to_csv does not create missing parent directories; make sure ./output exists
# so the export does not fail on a fresh checkout.
os.makedirs("./output", exist_ok=True)
review_business_join_df.to_csv("./output/review_business_join.csv", index=False)

In [7]:
# Display the joined frame's column labels to confirm the '_review' / '_biz'
# suffixes were applied to the overlapping 'stars' column.
review_business_join_df.columns.values

array(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars_review',
       'text', 'useful', 'user_id', 'address', 'attributes', 'categories',
       'city', 'hours', 'is_open', 'latitude', 'longitude', 'name',
       'postal_code', 'review_count', 'stars_biz', 'state'], dtype=object)

In [8]:
# Keep only reviews whose business 'categories' string contains "restaurant"
# (case-insensitive match).
# na=False makes rows with missing categories count as non-matches. Without it
# str.contains returns NaN for those rows and boolean indexing raises
# "Cannot mask with non-boolean array containing NA / NaN values".
is_restaurant = review_business_join_df['categories'].str.contains(
    "Restaurant", flags=re.IGNORECASE, regex=True, na=False
)
review_business_restaurant_df = review_business_join_df[is_restaurant]
review_business_restaurant_df.to_csv("./output/review_business_restaurant.csv", index=False)

In [9]:
# Columns retained in the trimmed restaurant-review export.
key_attributes = [
    "review_id",
    "business_id",
    "date",
    "categories",
    "name",
    "postal_code",
    "state",
    "text",
    "user_id",
    "stars_review",
    "stars_biz",
]

# DataFrame.filter keeps the listed labels that exist in the frame, in list
# order; a label that is absent is skipped rather than raising.
review_business_restaurant_key_attr_df = review_business_restaurant_df.filter(items=key_attributes)
review_business_restaurant_key_attr_df.to_csv(
    "./output/review_business_restaurant_key_attributes.csv",
    index=False,
)