### Import Necessary Packages

In [40]:
import pandas as pd
import pandasql as ps
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import json


from pandas.io.json import json_normalize
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re 

%matplotlib inline
# Apply both stylesheets in order; 'bmh' comes last, so it overrides
# 'fivethirtyeight' wherever the two define the same setting.
plt.style.use(['fivethirtyeight', 'bmh'])

### Load Datasets

In [41]:
# Read the three input tables from the local datasets/ directory.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference can leave a single column holding
# a mix of strings and numbers (reported as a DtypeWarning), which is why the
# cleaning step further down has to cope with both '5' and 5 in 'stars'.
features_df = pd.read_csv("datasets/odyssey_restaurants.csv", low_memory=False)
reviews_df = pd.read_csv("datasets/odyssey_sentiment.csv", low_memory=False)
sentiment_df = pd.read_csv("datasets/odyssey_sentiment_grouped.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


### Manipulate Datasets Accordingly

In [42]:
## Clean Out Data

# Drop the 'Unnamed: 0' column (presumably an index column left over from an
# earlier CSV export -- confirm). errors='ignore' keeps this cell re-runnable:
# a second run no longer fails with KeyError once the column is gone.
features_df = features_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Discard rows whose 'stars' value is the literal text 'stars' (these look like
# repeated header lines inside the file -- confirm against the raw CSV).
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
reviews_df = reviews_df[reviews_df['stars'] != 'stars'].copy()

# Normalise ratings to the integers 1-5 whether they arrive as text or as
# numbers; any other value maps to NaN.
star_lookup = {rating: rating for rating in range(1, 6)}
star_lookup.update({str(rating): rating for rating in range(1, 6)})
reviews_df['stars'] = reviews_df['stars'].map(star_lookup)

In [43]:
## Pivot Sentiment Data

# Reshape from one row per (business_id, label) to one row per business_id,
# with each label's 'size' value in its own column; then clear the column-axis
# name and turn business_id back into an ordinary column.
sentiment_df = (
    sentiment_df
    .pivot(index='business_id', columns='label', values='size')
    .rename_axis(None, axis=1)
    .reset_index()
)

# NOTE(review): this rename is positional -- it assumes the pivot produced
# exactly three label columns, in negative / neutral / positive order. Confirm
# against the label values in the input file.
sentiment_df.columns = [
    'business_id',
    'negative_reviews',
    'neutral_reviews',
    'positive_reviews',
]

In [44]:
## Average out Actual Star-Ratings per business

# Keep only the key and the rating, then collapse to a single mean rating per
# business_id. as_index=False leaves business_id as a regular column.
reviews_df = (
    reviews_df[['business_id', 'stars']]
    .groupby('business_id', as_index=False)
    .mean()
)

In [45]:
## Drop categorised Star Ratings

# Remove the 'stars' column that came with the restaurant features; the
# per-business average computed from the reviews is merged in under the same
# name later on.
features_df = features_df.drop('stars', axis=1)

In [47]:
## Dummy Variables for City_Cuisine_PriceRange

# Build a combined "<city>_<cuisine>_<price range>" key for every restaurant.
# The price range is converted with str(), so a missing value ends up as the
# text "nan" in the key.
price_range_text = features_df['RestaurantsPriceRange2'].map(str)
features_df['cat'] = features_df['city'] + "_" + features_df['cuisine'] + "_" + price_range_text

# One-hot encode the combined key and append the indicator columns
# (named "cat_<key>") to the right of the existing features.
cat_dummies = pd.get_dummies(features_df['cat'], prefix='cat')
features_df = pd.concat([features_df, cat_dummies], axis=1)

In [48]:
## Merge new columns into Features Dataset

# Two left joins on business_id: first the per-label review counts, then the
# averaged star rating. Left joins keep every row of features_df even when a
# business has no match in the other table.
features_df = (
    features_df
    .merge(sentiment_df, how='left', on='business_id')
    .merge(reviews_df, how='left', on='business_id')
)

### Data Checking

In [49]:
# Spot-check one business in the merged feature table.
features_df.loc[features_df['business_id'] == 'jg37O7ANF7hqggS6bxUpcQ']

Unnamed: 0,address,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,...,cat_Toronto_chinese_nan,cat_Toronto_italian_1.0,cat_Toronto_italian_2.0,cat_Toronto_italian_3.0,cat_Toronto_italian_4.0,cat_Toronto_italian_nan,negative_reviews,neutral_reviews,positive_reviews,stars
4555,6630 4 St NE,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,False,,...,0,0,0,0,0,0,5.0,9.0,31.0,3.711111


In [50]:
# Same business in the averaged-reviews table, to compare its 'stars' value
# with the one shown in the merged feature table.
reviews_df.loc[reviews_df['business_id'] == 'jg37O7ANF7hqggS6bxUpcQ']

Unnamed: 0,business_id,stars
3217,jg37O7ANF7hqggS6bxUpcQ,3.711111


In [51]:
# Show every column name in the merged feature table.
features_df.columns.tolist()

['address',
 'AcceptsInsurance',
 'AgesAllowed',
 'Alcohol',
 'Ambience',
 'BYOB',
 'BYOBCorkage',
 'BestNights',
 'BikeParking',
 'BusinessAcceptsBitcoin',
 'BusinessAcceptsCreditCards',
 'BusinessParking',
 'ByAppointmentOnly',
 'Caters',
 'CoatCheck',
 'Corkage',
 'DietaryRestrictions',
 'DogsAllowed',
 'DriveThru',
 'GoodForDancing',
 'GoodForKids',
 'GoodForMeal',
 'HairSpecializesIn',
 'HappyHour',
 'HasTV',
 'Music',
 'NoiseLevel',
 'Open24Hours',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsCounterService',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'Smoking',
 'WheelchairAccessible',
 'WiFi',
 'business_id',
 'categories',
 'city',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'state',
 'cuisine',
 'cat',
 'cat_Calgar

### Save Updated Business Features Dataset

In [52]:
# Write the merged feature table to disk; index=False leaves the row index out
# of the file.
features_df.to_csv('datasets/odyssey_features.csv', index=False)