In [1]:
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('rmfake/reviews_without_fake.csv')

In [3]:
df.business_id.value_counts().describe()

count    31418.000000
mean        22.184894
std         54.295772
min          1.000000
25%          4.000000
50%          8.000000
75%         20.000000
max       2862.000000
Name: business_id, dtype: float64

In [4]:
tmp = df[['business_id', 'pred_stars']].groupby('business_id').count().reset_index()
tmp['stars'] = df[['business_id', 'stars']].groupby('business_id').mean().values
tmp['sentiment_polarity'] = df[['business_id', 'sentiment_polarity']].groupby('business_id').mean().values
tmp['sentiment_subjectivity'] = df[['business_id', 'sentiment_subjectivity']].groupby('business_id').mean().values
tmp.columns = ['business_id', 'count', 'stars', 'sentiment_polarity', 'sentiment_subjectivity']
tmp = tmp[tmp['count'] > 10]
tmp

Unnamed: 0,business_id,count,stars,sentiment_polarity,sentiment_subjectivity
2,--ZVrH2X2QXBFdCilbirsw,26,4.769231,0.313075,0.532050
4,--epgcb7xHGuJ-4PUeSLAw,17,2.882353,0.149582,0.522521
5,--sXnWH9Xm6_NvIjyuA99w,15,3.666667,0.309164,0.531157
7,-0FX23yAacC4bbLaGPvyxw,39,3.358974,0.255454,0.602752
10,-0TffRSXXIlBYVbb5AwfTg,458,4.497817,0.349077,0.632810
...,...,...,...,...,...
31402,zyeDagSsnfdVUWIvt8WiqA,15,2.333333,0.049929,0.546744
31409,zz-fcqurtm77bZ_rVvo2Lw,17,4.647059,0.323524,0.542740
31410,zz3E7kmJI2r2JseE6LAnrw,32,3.625000,0.175517,0.467161
31413,zzbpcMZXHoZxUr9JZdH6wg,14,1.571429,0.045335,0.401101


In [5]:
# reviews per week
df['week'] = pd.DatetimeIndex(df['date']).week
tt = df.groupby(['business_id', 'week']).count().reset_index()[['business_id', 'review_id']]
tt = tt.groupby('business_id').mean().reset_index()
tt.columns = ['business_id', 'reviews_per_week']
tmp = tmp.merge(tt, on = 'business_id', how='left')
# df.drop(columns='week', inplace=True)

In [6]:
# oldest review year
oldest_review = df[['business_id', 'year']].groupby('business_id').min().reset_index()
oldest_review.columns = ['business_id', 'oldest_review_year']
tmp = tmp.merge(oldest_review, on = 'business_id', how='left')

In [7]:
# std of stars
std_of_stars = df[['business_id', 'stars']].groupby('business_id').std().reset_index()
std_of_stars.columns = ['business_id', 'std_of_stars']
tmp = tmp.merge(std_of_stars, on = 'business_id', how='left')

In [8]:
# median of stars
med_of_stars = df[['business_id', 'stars']].groupby('business_id').median().reset_index()
med_of_stars.columns = ['business_id', 'med_of_stars']
tmp = tmp.merge(med_of_stars, on = 'business_id', how='left')

In [9]:
tmp

Unnamed: 0,business_id,count,stars,sentiment_polarity,sentiment_subjectivity,reviews_per_week,oldest_review_year,std_of_stars,med_of_stars
0,--ZVrH2X2QXBFdCilbirsw,26,4.769231,0.313075,0.532050,1.181818,2007,0.651625,5.0
1,--epgcb7xHGuJ-4PUeSLAw,17,2.882353,0.149582,0.522521,1.214286,2012,1.932691,3.0
2,--sXnWH9Xm6_NvIjyuA99w,15,3.666667,0.309164,0.531157,1.071429,2011,1.799471,5.0
3,-0FX23yAacC4bbLaGPvyxw,39,3.358974,0.255454,0.602752,1.695652,2018,1.769436,3.0
4,-0TffRSXXIlBYVbb5AwfTg,458,4.497817,0.349077,0.632810,8.807692,2013,1.058980,5.0
...,...,...,...,...,...,...,...,...,...
13047,zyeDagSsnfdVUWIvt8WiqA,15,2.333333,0.049929,0.546744,1.250000,2008,1.632993,1.0
13048,zz-fcqurtm77bZ_rVvo2Lw,17,4.647059,0.323524,0.542740,1.214286,2012,0.785905,5.0
13049,zz3E7kmJI2r2JseE6LAnrw,32,3.625000,0.175517,0.467161,1.600000,2007,1.288911,3.0
13050,zzbpcMZXHoZxUr9JZdH6wg,14,1.571429,0.045335,0.401101,1.076923,2014,1.222500,1.0


In [10]:
data_file = open("data/yelp_academic_dataset_business.json", encoding = "utf8")
business_data = []
for line in data_file:
    business_data.append(json.loads(line))

business_df = pd.DataFrame(business_data)

data_file.close()

In [11]:
business_df = business_df[business_df['state']=='PA']

In [12]:
business_df.drop(columns=['address', 'state', 'stars', 'review_count'], inplace=True)

In [16]:
business_df = pd.merge(business_df, tmp, on='business_id')

In [18]:
# convert dictionary in the column 'attributes' and 'hours' to labels
business_df = pd.concat([business_df, business_df["attributes"].apply(pd.Series), business_df["hours"].apply(pd.Series)], axis=1)
business_df.drop(columns=['attributes', 'hours'], inplace=True)

In [19]:
for i in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    business_df[f'{i}'] = business_df[f'{i}'].apply(lambda x: 1 if type(x) == str else 0)

In [20]:
# getting just restaurants
business_df = business_df[business_df['categories'].str.contains('Restaurant.*')==True]

In [21]:
# business_df['categories'] = business_df['categories'].str.split(', ')
# get dummies from categories
df_categories_dummies = pd.Series(business_df['categories']).str.get_dummies(',')

business_df = pd.concat([business_df, df_categories_dummies], axis=1)
business_df.drop(columns='categories', inplace=True)

In [22]:
business_df.head()

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,is_open,count,stars,sentiment_polarity,...,Vegetarian,Venues & Event Spaces,Vietnamese,Waffles,Wedding Planning,Wheel & Rim Repair,Wine Bars,Wineries,Wraps,Yoga
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,19107,39.955505,-75.155564,1,28,3.785714,0.238894,...,0,0,0,0,0,0,0,0,0,0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,Philadelphia,19106,39.953949,-75.143226,1,86,4.302326,0.34371,...,0,0,0,0,0,0,0,0,0,0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,Philadelphia,19147,39.943223,-75.162568,1,98,4.489796,0.29876,...,0,0,0,0,0,0,0,0,0,0
3,QdN72BWoyFypdGJhhI5r7g,Bar One,Philadelphia,19147,39.939825,-75.157447,0,33,4.090909,0.277549,...,0,0,0,0,0,0,0,0,0,0
4,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,Philadelphia,19127,40.022466,-75.218314,0,30,2.4,0.008336,...,0,0,0,0,0,0,0,0,0,0


In [24]:
business_df.to_csv('step2_data/combined.csv')