In [1]:
import pandas as pd
import json

# Parse the JSON-lines business export (one record per line) into a DataFrame.
df = pd.read_json(path_or_buf='./datasets/business_dataset_new_orleans.json', lines=True)

# Preview the first five rows as plain text.
print(df.head())

              business_id                      name           address  \
0  M0XSSHqrASOnhgbWDJIpQA            Herb Import Co      712 Adams St   
1  w_AMNoI1iG9eay7ncmc67w                 River 127  100 Iberville St   
2  uczmbBk5O3tYhGue13dCDg  New Orleans Spirit Tours   723 St Peter St   
3  YNjyv0gfOr2g8lbmUpTnKg               Copper Vine   1001 Poydras St   
4  J_ksUDPpzPwfTGtI4zTRnQ            Riverview Room    600 Decatur St   

          city state postal_code   latitude  longitude  stars  review_count  \
0  New Orleans    LA       70118  29.941468 -90.129953    4.0             5   
1  New Orleans    LA       70130  29.951359 -90.064672    3.0            12   
2  New Orleans    LA       70130  29.958431 -90.065173    4.0            38   
3  New Orleans    LA       70112  29.950647 -90.074427    4.5           350   
4  New Orleans    LA       70130  29.955925 -90.062962    4.5             7   

   is_open                                         attributes  \
0        1  {'Busines

In [2]:
# Tally how many businesses each city contributes (most frequent first).
city_counts = df.loc[:, 'city'].value_counts()

# Show the ten cities with the most businesses.
print(city_counts.iloc[:10])

city
New Orleans    6209
Name: count, dtype: int64


In [3]:
# Export only the businesses located in Reno to a JSON-lines file.
reno_business = df[df['city'] == 'Reno']

# `df` may hold a single-city extract (it is loaded from the New Orleans file),
# in which case this selection is empty. Skip the export then, so an existing
# Reno dataset is not overwritten with an empty file.
if reno_business.empty:
    print("No businesses found for city 'Reno'; skipping export.")
else:
    reno_business.to_json('./datasets/business_dataset_reno.json', orient='records', lines=True)

In [4]:
# Collect the New Orleans business records from the JSON-lines file.
# Each line is parsed on its own; the result is a plain list of dicts.
file_name = "./datasets/business_dataset_new_orleans.json"

with open(file_name, 'r') as f:
    parsed_records = (json.loads(raw_line.strip()) for raw_line in f)
    new_orleans_business = [
        record for record in parsed_records if record['city'] == 'New Orleans'
    ]


In [5]:
# Export only the businesses located in Nashville to a JSON-lines file.
nashville_business = df[df['city'] == 'Nashville']

# `df` may hold a single-city extract (it is loaded from the New Orleans file),
# in which case this selection is empty. Skip the export then, so an existing
# Nashville dataset is not overwritten with an empty file.
if nashville_business.empty:
    print("No businesses found for city 'Nashville'; skipping export.")
else:
    nashville_business.to_json('./datasets/business_dataset_nashville.json', orient='records', lines=True)

### User Filtering

In [6]:
def read_chunks(file, cols, chunk_size=500000):
    """Load selected columns of a JSON-lines dataset, reading it chunk by chunk.

    Parameters
    ----------
    file : str
        Dataset name without extension; the data is read from
        ``original_datasets/{file}.json``.
    cols : list of str
        Columns to keep from every chunk.
    chunk_size : int, default 500000
        Number of lines parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        The kept columns of all chunks stacked row-wise with a fresh
        0..n-1 index. If the file contains no lines, an empty frame with
        columns ``cols`` is returned.

    Raises
    ------
    KeyError
        If a chunk does not contain one of the requested columns.
    """
    # Use the reader as a context manager so the file handle is released even
    # when column selection fails part-way through the file.
    with pd.read_json(
        path_or_buf=f'original_datasets/{file}.json', chunksize=chunk_size, lines=True
    ) as reader:
        chunk_list = [chunk[cols] for chunk in reader]

    # pd.concat refuses an empty list; an empty file yields no chunks at all.
    if not chunk_list:
        return pd.DataFrame(columns=cols)

    return pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [7]:
# Load the user-profile columns from the full user dataset.
users = read_chunks(
    'yelp_academic_dataset_user',
    [
        'user_id', 'name', 'review_count', 'yelping_since',
        'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars',
        'compliment_hot', 'compliment_more', 'compliment_profile',
        'compliment_cute', 'compliment_list', 'compliment_note',
        'compliment_plain', 'compliment_cool', 'compliment_funny',
        'compliment_writer', 'compliment_photos',
    ],
)
# Preview the first five users.
print(users.head())

In [None]:
# Load the review columns from the full review dataset.
reviews = read_chunks(
    'yelp_academic_dataset_review',
    [
        'review_id', 'user_id', 'business_id',
        'stars', 'useful', 'funny', 'cool',
        'text', 'date',
    ],
)
# Preview the first five reviews.
print(reviews.head())

In [None]:
# Keep only the reviews written for businesses in New Orleans and export them.
# `new_orleans_business` is built as a list of record dicts, which cannot be
# indexed by column name; lifting it into a DataFrame first works for a list of
# records and for a DataFrame alike.
new_orleans_business_ids = pd.DataFrame(
    new_orleans_business, columns=['business_id']
)['business_id']

new_orleans_reviews = reviews[reviews['business_id'].isin(new_orleans_business_ids)]
new_orleans_reviews.to_json('./datasets/review_dataset_new_orleans.json', orient='records', lines=True)

In [None]:
# Keep the users who authored at least one New Orleans review, then export
# them as a JSON-lines file.
new_orleans_users = users.loc[
    users['user_id'].isin(new_orleans_reviews['user_id'])
]
new_orleans_users.to_json(
    './datasets/user_dataset_new_orleans.json',
    orient='records',
    lines=True,
)