In [1]:
import pandas as pd

### Converting .json to .csv

* Converts business.json, reviews.json, and users.json to .csv files
* Creates subsets for only data related to restaurants 
    * We look for the business_ids which have "restaurant" as an attribute and use them to index the business, reviews, and user datasets

### Business.json

In [4]:
## We can just read the business data as it because it is not large enough to require using chunks 
business_data = pd.read_json('./yelp_academic_dataset_business.json', lines=True)
print(business_data.shape)

#business_data.to_csv('./business_data.csv', index=False)

(150346, 14)


In [5]:
## checking to make sure data saved properly 
business_data = pd.read_csv('./business_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: './business_data.csv'

In [6]:
business_data.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
## Finding which businesses have "restaurant" as an attribute and creating another dataframe with just those businesses
restaurants = business_data[business_data.categories.fillna('-').str.lower().str.contains('restaurant')]
restaurant_ids = restaurants['business_id'].tolist() # list of restaurant ids 
#restaurants.shape
#restaurants.to_csv('./restaurants_business_data.csv', index=False)

In [None]:
restaurants.head()

### Reviews.json

In [None]:
## Iterates through the review.json in chunks of 50,000 lines and appends the chunks to one dataframe 

chunk_size=50000
reviews = pd.DataFrame() 

for i, chunk in enumerate(pd.read_json('./yelp_academic_dataset_review.json', lines=True, chunksize=chunk_size)):
    print('******* {} ****** '.format(i))
    reviews = pd.concat([reviews, chunk], axis=0, join="outer")

reviews.to_csv('./reviews_data.csv', index=False)

In [None]:
reviews.head(5)

In [None]:
## creates a dataframe with the reviews only for restaurants 
restaurants_subset = reviews[reviews['business_id'].isin(restaurant_ids)]
#len(restaurants_subset)
#restaurants_subset.to_csv('./restaurant_reviews_data.csv', index=False)
restaurants_subset.head(5)

In [18]:
restaurants_subset = pd.read_csv('restaurant_reviews_data.csv')
user_ids = restaurants_subset['user_id']

### Users.json

In [None]:
## Iterates through the user.json in chunks of 50,000 lines and appends the chunks to one dataframe 

chunk_size=50000
users = pd.DataFrame() 

for i, chunk in enumerate(pd.read_json('./yelp_academic_dataset_user.json', lines=True, chunksize=chunk_size)):
    print('******* {} ****** '.format(i))
    #print(chunk)
    users = pd.concat([users, chunk], axis=0, join="outer")
    #print(users)

#users.to_csv('./users_data.csv')

In [22]:
## finds which users have rated restaurants and creates a separate dataframe for them 
users_subset = users[users['user_id'].isin(user_ids)]
#users_subset.to_csv('./restaurant_users_data.csv')