In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Read the not-recommended and the regular review files into separate frames.
not_recommended_reviews = pd.read_csv('data/not_reviews_NLP.csv')
regular_reviews = pd.read_csv('data/reg_reviews_NLP.csv')

# Combine the datasets

A new feature, 'regular', is added to distinguish regular reviews from not recommended reviews:

- 1 represents regular reviews
- 0 represents not recommended reviews

In [3]:
# Label each source so the two kinds of review stay distinguishable once they
# are stacked: 1 = regular review, 0 = not recommended review.
regular_reviews['regular'] = 1
not_recommended_reviews['regular'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack the frames. ignore_index=True gives
# the combined frame a fresh 0..n-1 index.
reviews = pd.concat(
    [regular_reviews, not_recommended_reviews],
    ignore_index=True,
)

# axis=1 drops every *column* that holds at least one missing value. That also
# removes columns present in only one of the two files, because concat fills
# them with NaN for the rows of the other file.
reviews = reviews.dropna(axis=1)

## <span style="color:blue"> Data cleaning</span>

- removed rows with missing values from original files
- dropped unnecessary columns for further analysis
- converted data types; 'has_photo' (boolean to int)
- converted sentiment frequency count into frequency ratio
- replaced NaN from sentiment ratio with 0

# Replace boolean to numerical value for 'has_photo'

In [4]:
# Cast only the 'has_photo' column to int; all other columns keep their dtype.
reviews = reviews.astype({'has_photo': int})

# Delete the 'Sfreq5' and 'review' columns

In [5]:
# Remove both columns in a single call; raises KeyError if either is missing,
# just as deleting them one by one would.
reviews = reviews.drop(columns=['Sfreq5', 'review'])

# Remove Sfreq0-4

In [6]:
# Columns Sfreq0..Sfreq4 are removed together; rebinding `reviews` instead of
# mutating it in place gives the same resulting frame.
sfreq_columns = ['Sfreq0', 'Sfreq1', 'Sfreq2', 'Sfreq3', 'Sfreq4']
reviews = reviews.drop(columns=sfreq_columns)

# Remove NaNs
Missing values are indicated with -1

In [7]:
# -1 is the missing-value sentinel: turn it into NaN everywhere, then drop
# every row that has at least one NaN. Row counts are reported before/after.
rows_before = len(reviews)
print('len before:', rows_before)

with_nan = reviews.replace(-1, np.nan)
reviews = with_nan.dropna()

rows_after = len(reviews)
print('len after :', rows_after)

len before: 257587
len after : 252508


## <span style="color:blue"> New Features</span>
- 'extream' : 
 - 1 for a rating of either 1 or 5,   
 - 0 for all other rating values
- 'storeReviewTot' : total number of sampled reviews for each restaurant
- 'storeReviewRatio' : for each restaurant, the share of its sampled reviews that are regular, i.e. the per-restaurant mean of 'regular' (stored in the 'regularRatio' column)

### 'extream' feature where rating == 1|5

In [8]:
# Flag ratings at either end of the scale (1 or 5) with 1; all others get 0.
is_extreme_rating = reviews['rating'].isin([1, 5])
reviews['extream'] = is_extreme_rating.astype(int)

# Spot-check a few rows (label-based slice, both ends inclusive).
reviews.loc[45:50, ['rating', 'extream']]

Unnamed: 0,rating,extream
45,5.0,1
46,4.0,0
47,4.0,0
48,5.0,1
49,5.0,1
50,1.0,1


### 'storeReviewTot' feature: the total number of reviews is counted for each restaurant ID

In [9]:
# Restaurant table, keyed by the values of its 'Unnamed: 0' column.
restaurants = pd.read_csv('data/restaurants.csv')
restaurants = restaurants.set_index('Unnamed: 0')

# Re-key the reviews by restaurant id and left-join the restaurant-level
# columns onto them. Only column names that collide with a review column get
# the '_restaurant' suffix (the restaurant 'rating' when reviews has 'rating').
restaurant_info = restaurants[['n_reviews', 'rating']]
reviews_by_restaurant = reviews.set_index('restaurant_id')
reviews = reviews_by_restaurant.join(restaurant_info, rsuffix='_restaurant')

# Ratio of regular reviews per restaurant

In [10]:
# Per-restaurant share of regular reviews: 'regular' is 1/0, so its mean within
# a restaurant is the fraction of that restaurant's reviews that are regular.
# transform('mean') broadcasts each group's value back onto its review rows,
# so no helper frame or merge is needed. The earlier round trip through a
# 'restaurant_id' column fails on current pandas, which rejects a merge key
# that is both an index level and a column label.
reviews['regularRatio'] = reviews.groupby(level=0)['regular'].transform('mean')

# Reproduce the frame the outer merge used to return: rows ordered by
# restaurant id (an outer merge sorts on its key), the restaurant id itself
# discarded, and a fresh 0..n-1 index. mergesort is stable, so reviews of the
# same restaurant keep their relative order.
reviews = reviews.sort_index(kind='mergesort').reset_index(drop=True)

# Write result

In [11]:
# Write the final table to disk; index=False leaves the row index out of the file.
reviews.to_csv(path_or_buf='data/reviews.csv', index=False)