In [1]:
import pandas as pd
import numpy as np

In [9]:
### Data Cleaning Methods

def load(json, chunk_size):
    """
    Load a large line-delimited JSON source into a single dataframe.

    The source is parsed chunk_size rows at a time, but every chunk is kept,
    so the returned dataframe holds all rows of the source in memory.

    Parameters
    ----------
    json : anything accepted by pd.read_json (e.g. a path or file-like object)
        Source containing one JSON record per line.
    chunk_size : int
        Number of lines parsed per chunk.

    Returns
    -------
    pd.DataFrame
        Every row of the source, with the row labels produced by the chunked
        reader. An empty dataframe is returned if the reader yields no chunks.
    """
    # Gather the chunks and concatenate once. Calling pd.concat inside the
    # loop copies all previously loaded rows again on every iteration.
    with pd.read_json(json, lines=True, chunksize=chunk_size) as reader:
        chunks = list(reader)

    # pd.concat raises ValueError on an empty list; keep returning an empty
    # dataframe for a source that produces no chunks.
    if not chunks:
        return pd.DataFrame()

    return pd.concat(chunks)
    

def filter_business_data(business_dataset, chunk_size, min_reviews=30):
    '''
    Input Yelp business data and chunk_size, output all restaurants with >= min_reviews reviews.

    Parameters
    ----------
    business_dataset : source forwarded to load()
        Line-delimited JSON business data.
    chunk_size : int
        Number of rows load() parses at a time.
    min_reviews : int, optional
        Inclusive lower bound on "review_count". Defaults to 30.

    Returns
    -------
    pd.DataFrame
        Rows whose "categories" text contains "restaurants" (case-insensitive)
        and whose "review_count" is at least min_reviews. Original row labels
        are kept.
    '''
    # Read yelp business data into dataframe
    businesses = load(business_dataset, chunk_size)

    # Substring match on the category text, ignoring case. na=False makes
    # rows with missing categories count as "not a restaurant".
    is_restaurant = businesses["categories"].str.contains("restaurants", case=False, na=False)
    # Inclusive threshold: a business with exactly min_reviews reviews is kept.
    has_enough_reviews = businesses["review_count"] >= min_reviews

    return businesses[is_restaurant & has_enough_reviews]


def filter_review_data(review_dataset, reviewed_restaurants, chunk_size):
    """
    Input Yelp review data and chunk size (number of reviews to read into memory at one time).
    Output all reviews about restaurants returned from filter_business_data

    Each chunk is filtered as soon as it is read, so only the matching reviews
    are accumulated instead of the whole review file.

    Parameters
    ----------
    review_dataset : anything accepted by pd.read_json (e.g. a path or file-like object)
        Line-delimited JSON review data with a 'business_id' field.
    reviewed_restaurants : pd.DataFrame
        Frame with a 'business_id' column listing the businesses to keep.
    chunk_size : int
        Number of reviews parsed per chunk.

    Returns
    -------
    pd.DataFrame
        Reviews whose 'business_id' appears in reviewed_restaurants, with the
        row labels produced by the chunked reader. An empty dataframe is
        returned if the reader yields no chunks.
    """
    # Loop-invariant: compute the set of wanted ids once, not per chunk.
    restaurant_ids = reviewed_restaurants['business_id'].unique()

    matching_chunks = []
    with pd.read_json(review_dataset, lines=True, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Select only reviews of businesses in our filtered dataset.
            # Chunks without matches are kept too (as empty frames) so their
            # columns still take part in the final concatenation.
            matching_chunks.append(chunk[chunk['business_id'].isin(restaurant_ids)])

    # pd.concat raises ValueError on an empty list.
    if not matching_chunks:
        return pd.DataFrame()

    return pd.concat(matching_chunks)


def filter_user_data(users_dataset, filtered_reviews, chunk_size):
    """
    Input Yelp user data, filtered review data, and chunk size (number of users to read into memory at a time).
    Output all users who have written reviews about restaurants returned from filter_business_data

    Each chunk is filtered as soon as it is read, so only the matching users
    are accumulated instead of the whole user file.

    Parameters
    ----------
    users_dataset : anything accepted by pd.read_json (e.g. a path or file-like object)
        Line-delimited JSON user data with a 'user_id' field.
    filtered_reviews : pd.DataFrame
        Frame with a 'user_id' column listing the review authors to keep.
    chunk_size : int
        Number of users parsed per chunk.

    Returns
    -------
    pd.DataFrame
        Users whose 'user_id' appears in filtered_reviews, with the row labels
        produced by the chunked reader. An empty dataframe is returned if the
        reader yields no chunks.
    """
    # Loop-invariant: reviews repeat authors, so deduplicate the ids once.
    reviewer_ids = filtered_reviews['user_id'].unique()

    matching_chunks = []
    with pd.read_json(users_dataset, lines=True, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Select only users who have written the reviews in our filtered
            # dataset. Chunks without matches are kept (as empty frames) so
            # their columns still take part in the final concatenation.
            matching_chunks.append(chunk[chunk['user_id'].isin(reviewer_ids)])

    # pd.concat raises ValueError on an empty list.
    if not matching_chunks:
        return pd.DataFrame()

    return pd.concat(matching_chunks)

In [5]:
## Smoke test: build the filtered restaurant dataframe and inspect its schema
chunk_size = 100_000  # rows parsed per chunk
restaurants_df = filter_business_data(
    "data/yelp_academic_dataset_business.json",
    chunk_size,
)
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27893 entries, 3 to 150336
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   27893 non-null  object 
 1   name          27893 non-null  object 
 2   address       27893 non-null  object 
 3   city          27893 non-null  object 
 4   state         27893 non-null  object 
 5   postal_code   27893 non-null  object 
 6   latitude      27893 non-null  float64
 7   longitude     27893 non-null  float64
 8   stars         27893 non-null  float64
 9   review_count  27893 non-null  int64  
 10  is_open       27893 non-null  int64  
 11  attributes    27887 non-null  object 
 12  categories    27893 non-null  object 
 13  hours         26537 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 3.2+ MB


In [8]:
## Smoke test: keep only reviews of the filtered restaurants and inspect the schema
chunk_size = 100_000  # rows parsed per chunk
reviews_df = filter_review_data(
    "data/yelp_academic_dataset_review.json",
    restaurants_df,
    chunk_size,
)
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4371248 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 333.5+ MB


In [10]:
## Testing users df
chunk_size = 100000
# Keep the users in their own variable so reviews_df still holds the filtered
# reviews; this also makes the cell safe to re-run.
users_df = filter_user_data("data/yelp_academic_dataset_user.json", reviews_df, chunk_size)
users_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1379968 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   user_id             1379968 non-null  object 
 1   name                1379968 non-null  object 
 2   review_count        1379968 non-null  int64  
 3   yelping_since       1379968 non-null  object 
 4   useful              1379968 non-null  int64  
 5   funny               1379968 non-null  int64  
 6   cool                1379968 non-null  int64  
 7   elite               1379968 non-null  object 
 8   friends             1379968 non-null  object 
 9   fans                1379968 non-null  int64  
 10  average_stars       1379968 non-null  float64
 11  compliment_hot      1379968 non-null  int64  
 12  compliment_more     1379968 non-null  int64  
 13  compliment_profile  1379968 non-null  int64  
 14  compliment_cute     1379968 non-null  int64  
 15  compliment_list     