###### 0. Loading libraries

In [1]:
%matplotlib notebook
import numbers
import os
import re
from string import punctuation

import numpy as np
import pandas as pd
import requests
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from IPython.core.display import HTML
from IPython.display import Image

###### 1. Setting global variables

In [2]:
# --- Pipeline switches -------------------------------------------------------
# CRAWLER_SWITCH, READ_SWITCH and FIT_SWITCH are not referenced by the cells
# visible in this part of the notebook.
CRAWLER_SWITCH = False
# True: read/write the CSV files on S3; False: read them from DATA_FOLDER.
S3_IO = False

# --- S3 access ---------------------------------------------------------------
# Credentials come from the standard AWS environment variables so real keys
# never have to be typed into (and saved with) the notebook. The fallbacks are
# the placeholder values that were previously hard-coded here.
ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID', "")
SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', "/")
BUCKET_NAME = 'parks101'

READ_SWITCH = True
# Together with S3_IO, gates the upload of the raw CSV files to S3.
WRITE_SWITCH = False
FIT_SWITCH = True

# --- Local storage -----------------------------------------------------------
# Set PARKS_DATA_FOLDER to run on another machine; the default is the
# original author's local directory.
DATA_FOLDER = os.environ.get('PARKS_DATA_FOLDER',
                             '/Users/Wei.Zhao/Documents/Python code/tripadvisor/')
# Kept under DATA_FOLDER so that relocating the data also relocates the models.
MODEL_FOLDER = os.path.join(DATA_FOLDER, 'models/')

# --- File names (relative to DATA_FOLDER, or used as S3 key names) -----------
PARK_INFO_FILE = 'parks_info.csv'
REVIEW_RAW_FILE = 'reviews_raw.csv'
CLEAN_PARK_INFO_FILE = 'parks_info_clean.csv'
CLEAN_REVIEW_FILE = 'reviews_clean.csv'

###### 2. Read Data

In [3]:
# Load the park table and the raw review table, either from S3 or from the
# local DATA_FOLDER, depending on S3_IO.

# Column names applied to the raw review file when it is read from local disk.
review_df_columns = ['index', 'comments', 'date', 'park_id', 'stars', 'title', 'reviewer_level', 'reviewer']
# NOTE(review): this dtype map is defined but never passed to read_csv in this
# cell, so it has no effect here.
review_df_dtypes = {'index': np.int32, 
                    'comments': str, 
                    'date': str, 
                    'park_id': np.int32, 
                    'stars': np.int32, 
                    'title': str, 
                    'reviewer_level': np.int32, 
                    'reviewer': str}
if S3_IO:
    s3_handle = S3Connection(ACCESS_KEY, SECRET_KEY)
    bucket_handle = s3_handle.get_bucket(BUCKET_NAME)
    # A single Key object is reused: it is pointed at each file in turn,
    # opened, handed to read_csv as a file-like object, then closed.
    file_handle = Key(bucket_handle)
    file_handle.key = PARK_INFO_FILE
    file_handle.open()
    parks_info_df = pd.read_csv(file_handle, index_col=0)
    file_handle.close()
    file_handle.key = REVIEW_RAW_FILE
    file_handle.open()
    # NOTE(review): here the reviews are read with the file's own header row
    # and the first column as index, whereas the local branch below supplies
    # column names explicitly and keeps every column - confirm which matches
    # the file layout.
    review_raw_df = pd.read_csv(file_handle, index_col=0, header=0)
    file_handle.close()
else:
    parks_info_df = pd.read_csv(os.path.join(DATA_FOLDER, PARK_INFO_FILE), index_col=0)
    review_raw_df = pd.read_csv(os.path.join(DATA_FOLDER, REVIEW_RAW_FILE), names=review_df_columns)

* 2.1 *(optional)* Save data to S3

In [None]:
# Upload the park table and the raw review table to the S3 bucket as CSV.
# Runs only when both S3_IO and WRITE_SWITCH are enabled.
if S3_IO and WRITE_SWITCH:
    s3_handle = S3Connection(ACCESS_KEY, SECRET_KEY)
    bucket_handle = s3_handle.get_bucket(BUCKET_NAME)
    for onefile, df in zip((PARK_INFO_FILE, REVIEW_RAW_FILE), (parks_info_df, review_raw_df)):
        # The bucket is listed once per file to see whether a key with this
        # name already exists; if so, that key is deleted before re-creation.
        if onefile in [i.name for i in bucket_handle.list()]:
            file_exist = Key(bucket_handle)
            file_exist.key = onefile
            bucket_handle.delete_key(file_exist)
        # The whole frame is serialised to one in-memory CSV string
        # (index included, since to_csv is called with defaults) and uploaded.
        new_file_handle = bucket_handle.new_key(onefile)
        new_file_handle.set_contents_from_string(df.to_csv())

* 2.2 Clean up park_info dataframe

    + 2.2.1 Remove duplicates
    + 2.2.2 Convert visit counts to integer
    + 2.2.3 Count total visit numbers

In [13]:
# Report the number of non-null park_id values before and after dropping
# fully duplicated rows (the frame is modified in place).
print(parks_info_df['park_id'].count())
parks_info_df.drop_duplicates(inplace=True)
print(parks_info_df['park_id'].count())

# The columns at positions 5..18 are cast to int, one column at a time.
column_names = list(parks_info_df.columns[5:19])
for col in column_names:
    parks_info_df[col] = parks_info_df[col].apply(int)
    
def col_agg(x, inputCol, *weights):
    """Combine several fields of one record into a single value.

    The type of the first listed field decides how the fields are combined:

    * numeric (Python or NumPy number): the fields are summed, each one
      multiplied by its weight when weights are given;
    * text: the fields are joined with a single space. With weights, each
      field is followed by two spaces and repeated ``weight`` times before
      joining, so weights must be integers in that case.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by the names in ``inputCol`` (a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``, a Series, a dict, ...).
    inputCol : sequence
        Keys of the fields to combine, in order.
    *weights : optional
        If supplied, the first extra positional argument is a sequence of
        weights paired position by position with ``inputCol``.

    Returns
    -------
    The combined number or string, or ``None`` when the first field is
    neither numeric nor text.
    """
    first = x[inputCol[0]]
    # numbers.Number covers int/float (and Python 2 long) as well as NumPy
    # scalar types, which is what row-wise DataFrame access can yield.
    is_number = isinstance(first, numbers.Number)
    # type(u'') is unicode on Python 2 and str on Python 3.
    is_text = isinstance(first, (str, type(u'')))
    if not (is_number or is_text):
        return None

    if not weights:
        if is_number:
            return sum(x[i] for i in inputCol)
        return ' '.join([x[i] for i in inputCol])

    pairs = zip(inputCol, weights[0])
    if is_number:
        return sum(x[i] * w for i, w in pairs)
    # Each field is padded with two trailing spaces before being repeated, so
    # the repeated copies stay separated; this reproduces the spacing of the
    # original ' '.join([field, ' ']) * weight expression exactly.
    return ' '.join([(x[i] + '  ') * w for i, w in pairs])

# Total visits per park: the four seasonal counts are added up row by row
# through col_agg.
parks_info_df['total visit'] = parks_info_df.apply(
    lambda row: col_agg(row, ['Spring', 'Summer', 'Fall', 'Winter']),
    axis=1)

1025
1025


In [21]:
# Preview the first six rows of the park table (rendered as the cell output).
parks_info_df.head(6)

Unnamed: 0,park_id,html,name,city,state,Excellent,Very_good,Average,Poor,Terrible,Families,Couples,Friends,Solo,Business,Spring,Summer,Fall,Winter,total visit
0,0.0,https://www.tripadvisor.com/Attraction_Review-...,Central_Park,New_York_City_New,York,35444,9112,1380,132,52,12377,15311,7739,2824,1144,11247,14390,11419,9064,46120
1,1.0,https://www.tripadvisor.com/Attraction_Review-...,Hot_Springs_National_Park,Hot_Springs,Arkansas,210,99,42,13,5,95,151,46,15,6,104,143,78,44,369
2,2.0,https://www.tripadvisor.com/Attraction_Review-...,Hawaii_Volcanoes_National_Park,Hawaii_Volcanoes_National_Park_Island_of_Hawaii,Ha,3424,768,136,25,13,1168,1875,481,161,19,1150,1220,860,1136,4366
3,3.0,https://www.tripadvisor.com/Attraction_Review-...,Dry_Tortugas_National_Park,Key_West_Florida_Keys,Florida,2478,423,89,20,14,694,1561,478,112,17,825,932,575,692,3024
4,4.0,https://www.tripadvisor.com/Attraction_Review-...,Bryce_Canyon,Bryce_Canyon_National_Park,Utah,3329,293,31,6,2,815,1636,469,144,8,783,1473,1100,305,3661
5,5.0,https://www.tripadvisor.com/Attraction_Review-...,Balboa_Park,San_Diego,California,4898,1349,201,39,12,1652,2004,899,430,259,1931,1929,1271,1368,6499


* 2.3 Clean up review_raw dataframe
    + 2.3.1 Remove duplicates
    + 2.3.2 Convert counts into integers
    + 2.3.3 Convert date string into timestamp
    + 2.3.4 Remove punctuation from comment strings
    + 2.3.5 Assign a weight to title string and combine with comments

In [6]:
# Report the number of non-null park_id values before and after dropping
# reviews that repeat the same (park_id, comments, reviewer) combination.
# The raw frame is modified in place.
print(review_raw_df['park_id'].count())
review_raw_df.drop_duplicates(subset=['park_id', 'comments', 'reviewer'], inplace=True)
print(review_raw_df['park_id'].count())

1297806
1297575


In [7]:
def float2int(anum):
    """Truncate a float to an int; return every other value unchanged.

    Floats that have no integer representation (NaN, +/-infinity) are also
    returned unchanged rather than raising, so that one missing value does
    not abort an element-wise ``Series.apply`` over a whole column.

    Parameters
    ----------
    anum : any
        The value to convert.

    Returns
    -------
    ``int(anum)`` for finite floats, otherwise ``anum`` itself.
    """
    if not isinstance(anum, float):
        return anum
    try:
        return int(anum)
    except (ValueError, OverflowError):
        # int(nan) raises ValueError, int(+/-inf) raises OverflowError.
        return anum

def str2date(astr):
    """Extract a "<Month name> <day> <year>" date from text as a Timestamp.

    The first full English month name found in ``astr`` is taken as the
    month; the next run of 1-2 digits after it is the day and the next run
    of 4 digits after that is the year. Any other text around or between
    those parts is ignored.

    Parameters
    ----------
    astr : str
        Text containing the date, e.g. ``'Reviewed March 5, 2016'``.

    Returns
    -------
    pandas.Timestamp

    Raises
    ------
    ValueError
        If no month/day/year sequence is found, or if the extracted parts do
        not form a valid calendar date.
    """
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    # The alternation is built from the list above so the names used for
    # matching and for the month-number lookup cannot drift apart. The raw
    # string keeps '\d' a regex escape instead of an invalid string escape.
    pattern = r'(%s).*?(\d{1,2}).*?(\d{4})' % '|'.join(months)
    date_s = re.search(pattern, astr)
    if date_s is None:
        raise ValueError('no "<Month> <day> <year>" date found in %r' % (astr,))
    m, d, y = date_s.groups()
    return pd.Timestamp('-'.join([y, str(months.index(m) + 1), d]))

def remove_puncs(astr):
    """Return ``astr`` lower-cased with 'div...div' spans and punctuation removed.

    Steps, in order:
      1. delete everything from the first 'div' to the last 'div' on a line
         (greedy match; '.' does not cross newlines);
      2. delete every character listed in ``string.punctuation``;
      3. lower-case the result.

    NOTE(review): step 1 matches the bare letters 'div', so ordinary words
    such as 'diving' or 'individual' also act as delimiters - presumably it
    is meant to strip leftover HTML <div> markup; confirm the intent.
    """
    without_div = re.sub('div.*div', '', astr)
    # One character class containing all punctuation, removed in a single pass.
    without_punct = re.sub('[%s]' % re.escape(punctuation), '', without_div)
    return without_punct.lower()

In [11]:
# Placeholder binding only: review_clean_df is rebound to a fresh DataFrame
# before any column is written to it.
review_clean_df=[]

In [12]:
# Three parallel lists: source column in the raw frame, its name in the clean
# frame, and the element-wise cleaner to apply (None = copy the column as is).
cols = ['park_id', 'index', 'reviewer', 'reviewer_level', 'date', 'stars', 'title', 'comments']
newcols = ['park_id', 'review_index', 'reviewer', 'reviewer_level', 'date', 'stars', 'title', 'comments']
funs = [float2int, float2int, None, float2int, str2date, float2int, remove_puncs, remove_puncs]

# Start from an empty frame and add the cleaned columns one at a time, in the
# order given by cols.
review_clean_df = pd.DataFrame()
for col, newcol, fun in zip(cols, newcols, funs):
    raw_column = review_raw_df[col]
    review_clean_df[newcol] = raw_column.apply(fun) if fun else raw_column

In [14]:
# Number of times the title is repeated in front of the comment text.
title_weight = 1
# Overwrites 'comments' with "<title repeated title_weight times> <comments>",
# built row by row through col_agg. Because the column is replaced in place,
# running this cell again prepends the title a second time.
review_clean_df['comments'] = review_clean_df.apply(
    lambda row: col_agg(row, ['title', 'comments'], (title_weight, 1)),
    axis=1)

In [15]:
# Show the 'comments' value of the first row as a spot check of the cleaning.
review_clean_df.iloc[0]['comments']

'helpful with your utah vacation planning   utah is arguably one of the most scenic and geologically interesting states in usa to see all the southern utah attractions one can obtain information and maps from the utah travel and tourism sites or information centers utah has also 45 state parks  east zion tourism council provides maps weather information and helpful advice for visiting zion national park  '

###### 3. Save cleaned dataframe

In [16]:
# Write both cleaned tables to DATA_FOLDER as CSV files with a header row.
# NOTE(review): these writes are not gated by WRITE_SWITCH - confirm that
# overwriting the cleaned files on every run is intended.
clean_park_info_path = os.path.join(DATA_FOLDER, CLEAN_PARK_INFO_FILE)
clean_review_path = os.path.join(DATA_FOLDER, CLEAN_REVIEW_FILE)
parks_info_df.to_csv(clean_park_info_path, header=True)
review_clean_df.to_csv(clean_review_path, header=True)