In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
### Data Cleaning Methods

def filter_business_data(business_dataset, min_reviews=30):
    '''
    Load Yelp business data and keep only sufficiently reviewed restaurants.

    Parameters
    ----------
    business_dataset : str, path-like or file-like
        Line-delimited JSON (one business object per line). Each record must
        provide the "categories" and "review_count" fields.
    min_reviews : int, default 30
        Inclusive lower bound on "review_count"; a business with exactly
        ``min_reviews`` reviews is kept.

    Returns
    -------
    pandas.DataFrame
        Every business whose "categories" text contains "restaurants"
        (case-insensitive) and whose "review_count" is >= ``min_reviews``.
        Rows keep the index they had in the unfiltered data.
    '''
    # Read yelp business data into dataframe
    unfiltered_business_df = pd.read_json(business_dataset, lines=True)
    # na=False: a business with no categories at all is treated as a non-restaurant
    is_restaurant = unfiltered_business_df["categories"].str.contains("restaurants", case=False, na=False)
    # Inclusive threshold (>=), matching the documented contract
    has_enough_reviews = unfiltered_business_df["review_count"] >= min_reviews

    return unfiltered_business_df[is_restaurant & has_enough_reviews]


def filter_review_data(review_dataset, reviewed_restaurants, chunk_size):
    """
    Stream Yelp review data and keep only reviews of the given restaurants.

    The file is read ``chunk_size`` rows at a time and each chunk is filtered
    before it is kept, so only the matching reviews (plus one chunk) are held
    in memory rather than the whole review file.

    Parameters
    ----------
    review_dataset : str, path-like or file-like
        Line-delimited JSON of reviews; each record must have "business_id".
    reviewed_restaurants : pandas.DataFrame
        Frame with a "business_id" column (e.g. the output of
        filter_business_data) naming the businesses whose reviews are wanted.
    chunk_size : int
        Number of reviews to read into memory at one time.

    Returns
    -------
    pandas.DataFrame
        All reviews whose "business_id" appears in ``reviewed_restaurants``,
        in file order, keeping each row's position in the file as its index.
        Empty if nothing matches or the file contains no rows.
    """
    wanted_business_ids = reviewed_restaurants['business_id'].unique()

    matching_chunks = []
    empty_template = None  # first empty filtered chunk; preserves the columns
    # The context manager closes the underlying file even if parsing fails midway
    with pd.read_json(review_dataset, lines=True, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Select only reviews of businesses in our filtered dataset
            matches = chunk[chunk['business_id'].isin(wanted_business_ids)]
            if matches.empty:
                if empty_template is None:
                    empty_template = matches
            else:
                matching_chunks.append(matches)

    if matching_chunks:
        # Concatenate once: concatenating inside the loop re-copies all
        # previously read rows on every iteration
        return pd.concat(matching_chunks)
    if empty_template is not None:
        return empty_template
    return pd.DataFrame()


def filter_user_data(users_dataset, filtered_reviews, chunk_size):
    """
    Stream Yelp user data and keep only the authors of the given reviews.

    The file is read ``chunk_size`` rows at a time and each chunk is filtered
    before it is kept, so only the matching users (plus one chunk) are held
    in memory rather than the whole user file.

    Parameters
    ----------
    users_dataset : str, path-like or file-like
        Line-delimited JSON of users; each record must have "user_id".
    filtered_reviews : pandas.DataFrame
        Frame with a "user_id" column (e.g. the output of filter_review_data)
        naming the review authors to keep.
    chunk_size : int
        Number of users to read into memory at a time.

    Returns
    -------
    pandas.DataFrame
        All users whose "user_id" appears in ``filtered_reviews``, in file
        order, keeping each row's position in the file as its index.
        Empty if nothing matches or the file contains no rows.
    """
    # A user usually writes several reviews; de-duplicate the ids once up front
    wanted_user_ids = filtered_reviews['user_id'].unique()

    matching_chunks = []
    empty_template = None  # first empty filtered chunk; preserves the columns
    # The context manager closes the underlying file even if parsing fails midway
    with pd.read_json(users_dataset, lines=True, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Select only users who have written the reviews in our filtered dataset
            matches = chunk[chunk['user_id'].isin(wanted_user_ids)]
            if matches.empty:
                if empty_template is None:
                    empty_template = matches
            else:
                matching_chunks.append(matches)

    if matching_chunks:
        # Concatenate once: concatenating inside the loop re-copies all
        # previously read rows on every iteration
        return pd.concat(matching_chunks)
    if empty_template is not None:
        return empty_template
    return pd.DataFrame()

In [4]:
## Smoke test: load the business file and keep well-reviewed restaurants
restaurants_df = filter_business_data(
    business_dataset="data/yelp_academic_dataset_business.json",
)
# Summarise columns, dtypes and null counts of the filtered frame
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27893 entries, 3 to 150336
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   27893 non-null  object 
 1   name          27893 non-null  object 
 2   address       27893 non-null  object 
 3   city          27893 non-null  object 
 4   state         27893 non-null  object 
 5   postal_code   27893 non-null  object 
 6   latitude      27893 non-null  float64
 7   longitude     27893 non-null  float64
 8   stars         27893 non-null  float64
 9   review_count  27893 non-null  int64  
 10  is_open       27893 non-null  int64  
 11  attributes    27887 non-null  object 
 12  categories    27893 non-null  object 
 13  hours         26537 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 3.2+ MB


In [7]:
## Smoke test: stream the review file and keep reviews of the filtered restaurants
chunk_size = 100000  # reviews read into memory per chunk
reviews_df = filter_review_data(
    review_dataset="data/yelp_academic_dataset_review.json",
    reviewed_restaurants=restaurants_df,
    chunk_size=chunk_size,
)
# Preview the first few filtered reviews
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31


In [6]:
## Testing users df
chunk_size = 100000  # users read into memory per chunk
# Store the user table under its own name. Assigning it to `reviews_df`
# would discard the filtered reviews and make this cell non-idempotent:
# a re-run would filter the user file against the user table itself.
users_df = filter_user_data("data/yelp_academic_dataset_user.json", reviews_df, chunk_size)
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB


In [13]:
# Examine user data
# NOTE(review): `unfilteredUsers` is not assigned in any cell visible in this
# notebook; confirm it exists on a fresh kernel (Restart & Run All) or point
# this preview at the frame that actually holds the user data.
# The previous dangling `reviews_df = ` line was removed: an assignment with
# no right-hand side is a SyntaxError that prevents the cell from running.
unfilteredUsers.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0
