In [1]:
# LOADING THE YELP DATA SETS AND FILTERING THE FILES TO JUST BUSINESSES IN OHIO
# File is broken out as some of the steps take a long time to process.
# Filters three files: (1) Businesses, (2) Reviews & (3) Users

In [2]:
# Reference url 
# https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88

In [3]:
import pandas as pd

In [4]:
# BUSINESS 

In [5]:
# Location of the Yelp business dump; lines=True parses it as newline-delimited JSON.
business_json_path = './business.json'
business = pd.read_json(path_or_buf=business_json_path, lines=True)

In [6]:
# (rows, columns) of the full business table
business.shape

(192609, 14)

In [7]:
# Peek at the first row to see which columns are available
business.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",


In [8]:
# Number of businesses per state code
business.state.value_counts()

AZ     56686
NV     36312
ON     33412
NC     14720
OH     14697
PA     11216
QC      9219
AB      8012
WI      5154
IL      1932
SC      1162
NY        22
CA        19
TX         6
FL         4
XGM        4
CT         3
WA         3
AL         3
NE         2
GA         2
XWY        2
VA         2
AK         2
VT         2
DUR        1
AR         1
XGL        1
BAS        1
NJ         1
UT         1
BC         1
NM         1
CON        1
DOW        1
TN         1
Name: state, dtype: int64

In [12]:
# Keep only the rows whose state code is 'OH', then report (rows, columns)
business_oh = business.loc[business['state'] == 'OH']
business_oh.shape

(14697, 14)

In [13]:
# Business IDs of the Ohio businesses, taken from business_oh (the state == 'OH' filter).
# The list is used later to select the reviews that belong to these businesses.
oh_b_list = business_oh['business_id'].tolist()
len(oh_b_list)

14697

In [14]:
# Count the missing values in each column
business_oh.isna().sum()

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      2094
categories        32
hours           3552
dtype: int64

In [18]:
# If every business_id is unique, the distinct count equals the number of rows
business_oh['business_id'].nunique()

14697

In [22]:
# Number of distinct postal codes among the Ohio businesses
business_oh['postal_code'].nunique()

139

In [23]:
# Export the Ohio businesses to CSV; index=False leaves the row index out of the file
business_oh.to_csv('./business_oh.csv', index=False, header=True)

In [25]:
# REVIEWS

In [26]:
# Read the Yelp reviews 500,000 rows at a time, then combine the chunks

review_json_path = './review.json'

# With chunksize set, read_json yields one DataFrame per chunk; collect them all.
appended_data = list(pd.read_json(review_json_path, lines=True, chunksize=500000))
reviews = pd.concat(appended_data)
reviews.shape

(6685900, 9)

In [28]:
# Check the shape
reviews.shape

(6685900, 9)

In [29]:
# Keep only the reviews whose business_id appears in the list of Ohio business IDs
reviews_oh = reviews.loc[reviews['business_id'].isin(oh_b_list)]

In [30]:
# (rows, columns) of the filtered review table
reviews_oh.shape

(321345, 9)

In [31]:
# Peek at the first filtered review
reviews_oh.head(1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
8,qrffudO73zsslZbe8B9D3Q,sG_h0dIzTKWa3Q6fmb4u-g,b2jN2mm9Wf3RcrZCgfo1cg,2,1,0,0,I was really looking forward to visiting after...,2015-01-18 14:04:18


In [32]:
# Export the filtered reviews to CSV; index=False leaves the row index out of the file
reviews_oh.to_csv('./reviews_oh.csv', index=False, header=True)

In [52]:
import numpy as np

In [56]:
# Distinct user IDs that appear in the filtered reviews
user_oh_list = reviews_oh['user_id'].unique().tolist()
len(user_oh_list)

92622

In [33]:
# USERS

In [34]:
# Extract the user data, reading 500,000 rows at a time
user_json_path = './user.json'

# With chunksize set, read_json yields one DataFrame per chunk; collect them all.
appended_data = list(pd.read_json(user_json_path, lines=True, chunksize=500000))
users = pd.concat(appended_data)
users.shape

(1637138, 22)

In [36]:
# (rows, columns) of the full user table
users.shape

(1637138, 22)

In [37]:
# Peek at the first user record
users.head(1)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,l6BmjZMeQD3rDxWUbiAiow,Rashmi,95,2013-10-08 23:11:33,84,17,25,201520162017,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",5,...,0,0,0,0,1,1,1,1,2,0


In [57]:
# Keep only the users whose user_id appears in the filtered reviews
users_oh = users.loc[users['user_id'].isin(user_oh_list)]

In [58]:
# Check the shape: (rows, columns) of the filtered user table
users_oh.shape

(92622, 22)

In [59]:
# Check the head: first row of the filtered user table
users_oh.head(1)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,l6BmjZMeQD3rDxWUbiAiow,Rashmi,95,2013-10-08 23:11:33,84,17,25,201520162017,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",5,...,0,0,0,0,1,1,1,1,2,0


In [60]:
# Export the filtered users to CSV; index=False leaves the row index out of the file
users_oh.to_csv('./users_oh.csv', index=False, header=True)