In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.express as px

In [23]:
### Data Cleaning Methods

def filter_business_data(business_dataset, min_reviews=30):
    '''
    Input Yelp business data, output all restaurants with >= min_reviews reviews.

    Parameters
    ----------
    business_dataset : path or file-like
        Line-delimited JSON (one business record per line), passed straight
        to ``pd.read_json(..., lines=True)``. Must provide "categories" and
        "review_count" fields.
    min_reviews : int, default 30
        Inclusive lower bound on "review_count".

    Returns
    -------
    pandas.DataFrame
        Rows whose "categories" text contains "restaurants" (case-insensitive)
        and whose "review_count" is at least ``min_reviews``. The original row
        index is preserved.
    '''

    # Read yelp business data into dataframe
    unfiltered_business_df = pd.read_json(business_dataset, lines=True)

    # Select all businesses whose categories mention "restaurants".
    # na=False makes rows with a missing category evaluate to False instead
    # of NaN, so they are dropped rather than breaking the boolean mask.
    is_restaurant = unfiltered_business_df["categories"].str.contains(
        "restaurants", case=False, na=False
    )
    restaurants_df = unfiltered_business_df[is_restaurant]

    # Select all restaurants with >= min_reviews reviews (inclusive bound)
    reviewed_restaurants = restaurants_df[restaurants_df["review_count"] >= min_reviews]

    return reviewed_restaurants

def filter_review_data(review_dataset, filtered_restaurants, chunk_size):
    """
    Input Yelp review data and chunk size (number of reviews to read into memory at one time).
    Output all reviews about restaurants returned from filter_business_data

    Parameters
    ----------
    review_dataset : path or file-like
        Line-delimited JSON (one review per line), passed to
        ``pd.read_json(..., lines=True, chunksize=chunk_size)``. Each record
        must provide a "business_id" field.
    filtered_restaurants : pandas.DataFrame
        Frame with a "business_id" column; only reviews whose "business_id"
        appears in that column are kept.
    chunk_size : int
        Number of review records parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        The matching reviews, keeping the row index assigned by the reader.
        An empty DataFrame is returned when the reader yields no chunks.
    """

    restaurant_ids = filtered_restaurants['business_id']

    # Read yelp review data into dataframe, one chunk at a time.
    # Each chunk is filtered before it is kept, so only matching reviews are
    # held in memory, and the pieces are concatenated once at the end instead
    # of re-copying a growing frame on every iteration.
    filtered_chunks = []
    with pd.read_json(review_dataset, lines=True, chunksize=chunk_size) as review_reader:
        for chunk in review_reader:
            # Select only reviews of businesses in our filtered dataset
            reviewed_restaurant_mask = chunk['business_id'].isin(restaurant_ids)
            filtered_chunks.append(chunk[reviewed_restaurant_mask])

    # pd.concat raises on an empty list, so handle "no chunks" explicitly.
    if not filtered_chunks:
        return pd.DataFrame()

    filtered_reviews = pd.concat(filtered_chunks)

    return filtered_reviews



In [22]:
## Testing restaurant df
# Build the filtered restaurant frame from the Yelp business file and print
# its column / non-null / dtype summary as a quick sanity check.
# The result is kept in `restaurants_df` because the review test cell passes
# it to filter_review_data.
restaurants_df = filter_business_data("data/yelp_academic_dataset_business.json")
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27893 entries, 3 to 150336
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   27893 non-null  object 
 1   name          27893 non-null  object 
 2   address       27893 non-null  object 
 3   city          27893 non-null  object 
 4   state         27893 non-null  object 
 5   postal_code   27893 non-null  object 
 6   latitude      27893 non-null  float64
 7   longitude     27893 non-null  float64
 8   stars         27893 non-null  float64
 9   review_count  27893 non-null  int64  
 10  is_open       27893 non-null  int64  
 11  attributes    27887 non-null  object 
 12  categories    27893 non-null  object 
 13  hours         26537 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 3.2+ MB


In [24]:
## Testing review df
# chunk_size is the number of reviews read into memory at one time.
chunk_size = 100000
# Keep only the reviews whose business appears in `restaurants_df`, then
# print the column / dtype summary of the result as a sanity check.
reviews_df = filter_review_data("data/yelp_academic_dataset_review.json", restaurants_df, chunk_size)
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4371248 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 333.5+ MB


In [4]:
# Read yelp user data into dataframe
# The file is parsed in chunks of 100000 records (the same chunk size used for
# the review data). The chunks are collected in a list and concatenated once,
# which avoids re-copying a growing frame on every iteration.
userChunks = []

# The reader is used as a context manager so the file handle is closed
# as soon as the last chunk has been read.
with pd.read_json("data/yelp_academic_dataset_user.json", lines=True, chunksize=100000) as userReviewReader:
    for chunk in userReviewReader:
        userChunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame.
unfilteredUsers = pd.concat(userChunks) if userChunks else pd.DataFrame()

In [5]:
# Examine user data
# Print the column / dtype summary and memory usage of the loaded user frame.
unfilteredUsers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB


In [18]:
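# Examine review data
# NOTE: `unfilteredReviews` is a local variable inside filter_review_data, so
# the call below is disabled. The output shown under this cell is identical to
# the reviews_df.info() output in the "Testing review df" cell.
# unfilteredReviews.info()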



<class 'pandas.core.frame.DataFrame'>
Index: 4371248 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 333.5+ MB
