# Data Wrangling

In [None]:
import pandas as pd
import numpy as np

## 1. Datasets Loading

In [None]:
def load_data(filename, platform):
    """Read a CSV file and tag every row with the platform it came from.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.
        platform: Label stored in the new ``platform`` column.

    Returns:
        The loaded DataFrame with a constant ``platform`` column added
        (an existing ``platform`` column is overwritten).
    """
    data = pd.read_csv(filename)
    # A scalar is broadcast to every row, so no throwaway list is needed.
    data['platform'] = platform
    return data

# Location/business tables, read as-is (no platform column is added here).
tplaces, gplaces, yplaces = (
    pd.read_csv(path)
    for path in ('SF_places.csv', 'new_google_places.csv', 'yelp_business.csv')
)

# Review tables, each tagged with the platform label passed to load_data.
treviews, greviews, yreviews = (
    load_data(path, platform=source)
    for path, source in (
        ('tripadvisor_reviews.csv', 'TripAdvisor'),
        ('new_google_reviews.csv', 'Google'),
        ('yelp_reviews.csv', 'Yelp'),
    )
)

## 2. Examination

### 2.1 TripAdvisor & Google Data

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" after each summary.
treviews.info()
print()  # blank line between the two summaries
greviews.info()

### 2.2 Yelp Data

In [None]:
# Print a summary of the Yelp reviews table (columns, non-null counts, dtypes).
yreviews.info()

In [None]:
# Print a summary of the Yelp business table (columns, non-null counts, dtypes).
yplaces.info()

#### 2.2.1 Merge business and reviews datasets

In [None]:
# Attach each Yelp review to its business record: 'business_id' on the review
# side is matched against 'id' in the business table. The join type is the
# pandas default (inner), so rows without a match on either side are dropped.
ymerge = yreviews.merge(yplaces, left_on='business_id', right_on='id')
ymerge.head()

In [None]:
# Columns kept from the merged Yelp table. The '_x' suffix is what pandas
# appends to a left-frame (reviews) column whose name also exists on the right.
y_keep_columns = ['platform', 'text', 'rating_x', 'time_created', 'name', 'coordinates']

# Select those columns and rename them to the names used for the combined table.
yreviews_new = ymerge[y_keep_columns].rename(
    columns={
        'text': 'review',
        'rating_x': 'rating',
        'time_created': 'date',
        'name': 'attraction',
    }
)
yreviews_new.head()

## 3. Merging

In [None]:
# Combine the three review tables into one frame. With no `on=` given, merge()
# joins on every column the two frames share; 'platform' is one of them and
# holds a different label in each table, so rows are not expected to pair up
# and the outer join keeps all rows from both sides.
# NOTE(review): pd.concat would express this stacking more directly - confirm
# that row order does not matter before switching.
merge1 = treviews.merge(greviews, how='outer')
all_reviews = merge1.merge(yreviews_new, how='outer')
all_reviews.head(3)

In [None]:
# Build one date label per review from whichever source column a row has,
# in this order of preference:
#   1. 'month' + 'year'  -> parsed with '%b %Y'
#   2. 'time'            -> used as-is
#   3. 'date'            -> parsed as a timestamp
# Parsed values are rendered as '%b, %Y' (e.g. 'Jan, 2020').

# After the outer merges 'year' may be a NaN-padded float column, and str()
# of such a value is '2019.0', which '%b %Y' rejects. Going through nullable
# integers yields '2019' instead.
year_text = (
    pd.to_numeric(all_reviews['year'], errors='coerce')
    .round()
    .astype('Int64')
    .astype(str)
)
month_year = all_reviews['month'] + ' ' + year_text

# Missing or unparseable month/year values must stay null at this point.
# Filling them with a placeholder such as 'Jan 1900' would leave the column
# fully populated, and the combine_first() calls below would then never fall
# back to 'time' or 'date'.
all_reviews['time2'] = pd.to_datetime(
    month_year, format='%b %Y', errors='coerce'
).dt.strftime('%b, %Y')
all_reviews['time_merged'] = all_reviews['time2'].combine_first(all_reviews['time'])

# Rows that still have no label fall back to the parsed 'date' column.
all_reviews['time3'] = pd.to_datetime(all_reviews['date']).dt.strftime('%b, %Y')
all_reviews['time_merged2'] = all_reviews['time_merged'].combine_first(all_reviews['time3'])

In [None]:
# Columns of the final combined review table, in output order.
all_keep_columns = ['platform', 'attraction', 'review', 'rating', 'datecolumn', 'coordinates']

# Expose the consolidated date as 'datecolumn', keep only the final columns,
# and treat the literal 'Jan, 1900' label as a missing date.
all_reviews = (
    all_reviews
    .rename(columns={'time_merged2': 'datecolumn'})[all_keep_columns]
    .assign(datecolumn=lambda frame: frame['datecolumn'].replace('Jan, 1900', np.nan))
)
all_reviews.head()