### Import Necessary Packages

In [15]:
import pandas as pd
import pandasql as ps
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import json


from pandas.io.json import json_normalize
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re 

%matplotlib inline
plt.style.use('fivethirtyeight')
plt.style.use('bmh')

### Load Datasets

In [32]:
# Read the three input CSVs (in this order) into features_df, reviews_df
# and sentiment_df respectively.
features_df, reviews_df, sentiment_df = map(
    pd.read_csv,
    (
        "datasets/odyssey_restaurants.csv",
        "datasets/odyssey_sentiment.csv",
        "datasets/odyssey_sentiment_grouped.csv",
    ),
)

### Manipulate Datasets Accordingly

In [33]:
## Clean Out Data

# Remove the 'Unnamed: 0' column (presumably an index column saved by an
# earlier export -- confirm). drop(..., errors='ignore') is used instead of
# `del` so that re-running this cell does not raise KeyError.
features_df = features_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Discard rows whose 'stars' value is the literal text 'stars' (presumably
# header lines repeated inside the file -- confirm). The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not write into a slice of the original (avoids SettingWithCopyWarning).
reviews_df = reviews_df[reviews_df['stars'] != 'stars'].copy()

# Normalise ratings that appear either as strings or as ints to ints 1-5.
# Any value not listed in the mapping becomes NaN.
reviews_df['stars'] = reviews_df['stars'].map(
    {'5': 5, '4': 4, '3': 3, '2': 2, '1': 1, 5: 5, 4: 4, 3: 3, 2: 2, 1: 1}
)

In [18]:
## Pivot Sentiment Data

# Go from long format (one row per business_id/label pair, count in 'size')
# to wide format (one row per business_id, one column per label).
sentiment_wide = sentiment_df.pivot(index='business_id', columns='label', values='size')

# Clear the 'label' name that pivot leaves on the column axis, then turn the
# business_id index back into an ordinary column.
sentiment_wide.columns.name = None
sentiment_df = sentiment_wide.reset_index()

# Positional rename: this relies on the pivot producing exactly three label
# columns, in this order.
sentiment_df.columns = [
    'business_id',
    'negative_reviews',
    'neutral_reviews',
    'positive_reviews',
]

In [19]:
## Average out Actual Star-Ratings per business

# Keep only the key and the rating, then collapse to one row per business
# holding the mean of its 'stars' values. as_index=False keeps business_id
# as a regular column in the result.
star_ratings = reviews_df.loc[:, ['business_id', 'stars']]
reviews_df = star_ratings.groupby('business_id', as_index=False).mean()

In [34]:
## Drop categorised Star Ratings

# Remove the existing 'stars' column from the features; a 'stars' column is
# brought back in later when reviews_df is merged on business_id.
features_df = features_df.drop('stars', axis=1)

In [35]:
## Dummy Variables for City_Cuisine

# Build a combined "<city>_<cuisine>" key per row; cuisine is passed through
# str() first so non-string values can be concatenated.
city_cuisine = features_df['city'] + "_" + features_df['cuisine'].map(str)
features_df['cc'] = city_cuisine

# One indicator column per distinct key, named 'cc_<city>_<cuisine>', appended
# to the right of the existing feature columns.
cc_dummies = pd.get_dummies(city_cuisine, prefix='cc')
features_df = pd.concat([features_df, cc_dummies], axis=1)

In [36]:
## Merge new columns into Features Dataset

# Left-join the per-business sentiment counts and the per-business average
# star rating onto the features, keeping every feature row.
#
# validate='many_to_one' makes pandas check that business_id is unique in the
# right-hand frame and raise MergeError otherwise. Without it, a right-hand
# frame with repeated business_id values (e.g. sentiment_df / reviews_df that
# were reloaded but not pivoted / averaged) would silently duplicate feature
# rows.
features_df = features_df.merge(
    sentiment_df, how='left', on='business_id', validate='many_to_one'
)
features_df = features_df.merge(
    reviews_df, how='left', on='business_id', validate='many_to_one'
)

### Data Checking

In [37]:
# Spot check: display the merged feature row(s) for one business_id.
features_df.loc[features_df['business_id'] == 'jg37O7ANF7hqggS6bxUpcQ']

Unnamed: 0,address,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,...,cc_Las Vegas_american,cc_Las Vegas_chinese,cc_Las Vegas_italian,cc_Toronto_american,cc_Toronto_chinese,cc_Toronto_italian,negative_reviews,neutral_reviews,positive_reviews,stars
4555,6630 4 St NE,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,False,,...,0,0,0,0,0,0,5.0,9.0,31.0,3.711111


In [38]:
# Spot check: display the reviews_df row(s) for the same business_id, to
# compare its 'stars' value with the one shown in the merged features.
reviews_df.loc[reviews_df['business_id'] == 'jg37O7ANF7hqggS6bxUpcQ']

Unnamed: 0,business_id,stars
3217,jg37O7ANF7hqggS6bxUpcQ,3.711111


### Save Updated Business Features Dataset

In [39]:
# Write the combined features table to disk; index=False leaves the row
# index out of the file.
features_df.to_csv(
    path_or_buf='datasets/odyssey_features.csv',
    index=False,
)