###### 0. Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import os
import random
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from math import ceil
from itertools import product
%matplotlib notebook

###### 1. Setting global variables

In [2]:
# When True the models are (re)fitted; when False previously saved results are loaded.
FIT_SWITCH = True
# Root folder holding the cleaned CSV inputs. The TRIPADVISOR_DATA_FOLDER
# environment variable can override it so the notebook runs on other machines;
# the default is the original author's local path.
DATA_FOLDER = os.environ.get('TRIPADVISOR_DATA_FOLDER',
                             '/Users/Wei.Zhao/Documents/Python code/tripadvisor/')
# Fitted models live under the data folder instead of repeating the absolute path.
MODEL_FOLDER = os.path.join(DATA_FOLDER, 'models', 'SGD')
PARK_ASSIGNMENT_FILE = 'park_assignment.csv'
CLEAN_PARK_INFO_FILE = 'parks_info_final.csv'
CLEAN_REVIEW_FILE = 'reviews_clean.csv'

###### 2. Read Data from saved cleaned dataframes.

In [3]:
# Resolve the three input CSVs relative to the configured data folder.
park_info_path = os.path.join(DATA_FOLDER, CLEAN_PARK_INFO_FILE)
park_assignment_path = os.path.join(DATA_FOLDER, PARK_ASSIGNMENT_FILE)
review_path = os.path.join(DATA_FOLDER, CLEAN_REVIEW_FILE)

park_info_df = pd.read_csv(park_info_path)
park_assignment_df = pd.read_csv(park_assignment_path, index_col=0)

# Attach each park's cluster label to its reviews (inner join on park_id).
cluster_labels = park_assignment_df[['park_id', 'cluster_assignment']]
review_df = pd.read_csv(review_path, index_col=0).merge(cluster_labels, on='park_id')
review_df.head(5)

Unnamed: 0,park_id,review_index,reviewer,reviewer_level,date,stars,title,comments,cluster_assignment
0,759,0,podrozniczka60,6,2015-09-26,3,helpful with your utah vacation planning,helpful with your utah vacation planning uta...,0
1,759,1,prpatel007,1,2013-10-14,5,shame on us americans,shame on us americans recently i and my wife...,0
2,759,2,Flybob6334580,5,2013-10-10,3,beautiful but crowded,beautiful but crowded zion is beautiful but ...,0
3,759,3,JaniceWriterAuthor,4,2013-09-19,5,wow zion is beautiful,wow zion is beautiful this was our first vis...,0
4,759,4,Beachgal003,6,2013-08-29,5,zion national park does it right,zion national park does it right first the s...,0


###### 3. Classification of parks based on every comment using park_id as label

In [4]:
def subset_group(cluster_num):
    """Separate parks by cluster assignment label.

    Returns the rows of the module-level ``review_df`` whose
    ``cluster_assignment`` column equals ``cluster_num``.
    """
    in_cluster = review_df['cluster_assignment'] == cluster_num
    return review_df[in_cluster]

In [5]:
def pipeline_build(park_weight):
    """Build an unfitted grid search over a TF-IDF + SGDClassifier pipeline.

    park_weight is passed straight through as the classifier's ``class_weight``.
    The grid covers two values of ``alpha`` and two loss functions; the number
    of cross-validation folds is left at the GridSearchCV default.

    Returns the GridSearchCV object; the caller is responsible for fitting it.
    """
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 max_df=0.6,
                                 token_pattern=r'\b\w+\b',
                                 ngram_range=(1, 1),
                                 min_df=2,
                                 stop_words='english')
    classifier = SGDClassifier(penalty='l2',
                               class_weight=park_weight,
                               fit_intercept=True,
                               learning_rate='optimal',
                               shuffle=True,
                               n_iter=500,
                               random_state=0)
    steps = [('vect', vectorizer), ('clf', classifier)]

    # Hyper-parameters explored for the classifier step.
    hyper_params = dict(clf__alpha=[1e-5, 1e-4],
                        clf__loss=['hinge', 'log'])

    return GridSearchCV(estimator=Pipeline(steps),
                        param_grid=hyper_params,
                        verbose=1,
                        n_jobs=-1,
                        refit=True)



In [6]:
def park_features_extract(best_model, top_n=20):
    """Get the words with the highest coefficients for each class.

    best_model is expected to expose ``named_steps['vect']`` (providing
    ``get_feature_names()``) and ``named_steps['clf']`` (providing ``classes_``
    and ``coef_``).

    For every class label the ``top_n`` feature names with the largest
    coefficients are stored in the module-level ``park_features`` dict, keyed
    by class label. That dict must already exist; it is updated in place so
    results accumulate across calls, and it is also returned.
    """
    vectorizer = best_model.named_steps['vect']
    classifier = best_model.named_steps['clf']
    # The vocabulary and coefficients do not depend on the class, so fetch them once.
    feature_names = vectorizer.get_feature_names()
    class_labels = classifier.classes_
    coef = classifier.coef_
    # With exactly two classes the linear model keeps a single coefficient row
    # that scores the second class; indexing coef[1] would raise IndexError.
    single_row_binary = len(class_labels) == 2 and len(coef) == 1
    for row, label in enumerate(class_labels):
        if single_row_binary:
            # The first class is favoured by the negated weights.
            weights = coef[0] if row == 1 else -coef[0]
        else:
            weights = coef[row]
        ranked = sorted(zip(feature_names, weights),
                        key=lambda pair: pair[1], reverse=True)
        park_features[label] = [word for word, _ in ranked[:top_n]]
    return park_features

In [7]:
'''funtions for saving the fitted model or load in fitted model from local''' 
def save_best_model(CV_search_grid,group_id):
    """Persist the best estimator and grid scores of a fitted grid search.

    Two pickles are written into MODEL_FOLDER, both suffixed with ``group_id``:
    ``SGD_group_model_<group_id>.pkl`` (``best_estimator_``) and
    ``SGD_group_grid_<group_id>.pkl`` (``grid_scores_``).
    """
    model_file_name = 'SGD_group_model'+'_'+str(group_id)+'.pkl'
    grid_file_name = 'SGD_group_grid'+'_'+str(group_id)+'.pkl'
    # joblib.dump does not create missing directories; make sure the target exists
    # so a fresh checkout does not fail after a long fit.
    if not os.path.isdir(MODEL_FOLDER):
        os.makedirs(MODEL_FOLDER)
    joblib.dump(CV_search_grid.best_estimator_, os.path.join(MODEL_FOLDER, model_file_name))
    joblib.dump(CV_search_grid.grid_scores_,os.path.join(MODEL_FOLDER, grid_file_name))
    
def load_best_model(file_path, group_id):
    """Load the pickled best estimator and grid scores for one cluster.

    file_path: folder holding the pickles. Previously this argument was
        ignored; it is now used, and MODEL_FOLDER is the fallback when it is
        empty or None.
    group_id: cluster id used as the file-name suffix.

    Returns a ``(best_model, grid_score)`` tuple.
    """
    folder = file_path if file_path else MODEL_FOLDER
    model_file_name = 'SGD_group_model'+'_'+str(group_id)+'.pkl'
    grid_file_name = 'SGD_group_grid'+'_'+str(group_id)+'.pkl'
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # that this notebook (or another trusted source) produced.
    best_model = joblib.load(os.path.join(folder, model_file_name))
    grid_score = joblib.load(os.path.join(folder, grid_file_name))
    return best_model,grid_score

In [8]:
total_clusters = 15
correct_rate = {}    # cluster id -> accuracy on the held-out 20% of reviews
park_features = {}   # park_id -> top words, filled by park_features_extract

if FIT_SWITCH:
    for i in range(total_clusters):
        # Cluster 2 has too many classes to efficiently implement the classification algorithm.
        if i == 2:
            continue
        subset_df = subset_group(i)
        # total_clusters is hard-coded, so a cluster id may have no reviews or only
        # one park; neither can be split and classified, so skip it instead of crashing.
        if subset_df['park_id'].nunique() < 2:
            continue
        # Pre-calculate class weights as each park's share of the cluster's reviews.
        # value_counts(normalize=True) gives the same numbers as size()/sum() and does
        # not depend on what groupby(..., as_index=False).size() returns in a given
        # pandas version.
        # NOTE(review): this gives frequent parks larger weights, the opposite of
        # 'balanced' weighting; confirm this is intended.
        park_weight = subset_df['park_id'].value_counts(normalize=True).to_dict()
        # Split into train and test sets.
        train_f, test_f, train_parkid, test_parkid = train_test_split(subset_df['comments'],
                                                                      subset_df['park_id'],
                                                                      test_size=0.2,
                                                                      random_state=0)

        search_grid = pipeline_build(park_weight)
        search_grid.fit(train_f, train_parkid)
        best_estimator = search_grid.best_estimator_
        grid_scores = search_grid.grid_scores_
        # Save the fitted model for each cluster.
        save_best_model(search_grid, group_id=i)
        test_results = best_estimator.predict(test_f)
        # Classification correct rate on the held-out reviews.
        correct_rate[i] = np.mean(test_results == test_parkid)
        # Top words with the highest coefficients for each park in this cluster.
        park_features = park_features_extract(best_estimator)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 11.6min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.8min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.8min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.9min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.1min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.1min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.5min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.7min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.3min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 70.6min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  5.1min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.0min finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   49.2s finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.7min finished


In [9]:
# Both result files live next to the input data.
class_error_path = os.path.join(DATA_FOLDER, 'class_error.csv')
park_features_path = os.path.join(DATA_FOLDER, 'park_features.csv')

if FIT_SWITCH:
    # Fresh fit: turn the in-memory results into frames and persist them.
    # list(...) keeps the constructor happy when dict.items() is a view object.
    classification_rate_df = pd.DataFrame(list(correct_rate.items()),
                                          columns=['cluster_assignment', 'correct_rate'])
    classification_rate_df.to_csv(class_error_path)
    feature_df = pd.DataFrame(list(park_features.items()), columns=['park_id', 'features'])
    feature_df.to_csv(park_features_path)
else:
    # No fit in this session: load the saved results. Previously an empty frame was
    # written over class_error.csv before this branch ran, and the loaded frame was
    # bound to a misspelled name, so later cells saw no classification rates.
    feature_df = pd.read_csv(park_features_path, index_col=0)[['park_id', 'features']]
    classification_rate_df = pd.read_csv(class_error_path, index_col=0)

In [10]:
# Count the distinct parks in each cluster, then show that size next to the
# cluster's classification rate.
cluster_park_pairs = review_df[['cluster_assignment', 'park_id']].drop_duplicates()
cluster_df = (cluster_park_pairs
              .groupby('cluster_assignment', as_index=False)
              .count()
              .rename(columns={'park_id': 'cluster_size'}))
classification_rate_df.merge(cluster_df, on='cluster_assignment')

Unnamed: 0,cluster_assignment,correct_rate,cluster_size
0,0,0.489641,67
1,1,0.828521,71
2,3,0.67958,30
3,4,0.689094,41
4,5,0.610309,42
5,6,0.728801,38
6,7,0.668216,43
7,8,0.708766,29
8,9,0.734301,32
9,10,0.6984,83


In [32]:
# Show full cell contents so the long feature lists are not truncated.
pd.set_option('display.max_colwidth', -1)

# Park names and their 'park_features' column, joined on park_id.
park_info_assign_df = park_info_df.merge(park_assignment_df, on='park_id')[['park_id', 'name', 'park_features']]

# Put the two feature lists side by side for each park.
summary_column_names = {'park_features': 'kmeans_tfidf_top_features',
                        'features': 'classfication features'}
classification_summary_df = (park_info_assign_df
                             .merge(feature_df, on='park_id')
                             .rename(columns=summary_column_names))
classification_summary_df

Unnamed: 0,park_id,name,kmeans_tfidf_top_features,classfication features
0,2.0,Hawaii Volcanoes National Park,"[u'lava', u'volcano', u'crater', u'volcanoes', u'kilauea', u'steam', u'vents', u'tube', u'craters', u'island', u'hawaii', u'glow', u'jagger', u'caldera', u'jaggar', u'iki', u'thurston', u'kona', u'active', u'tubes']","[sulfur, park, kileaua, rd, petroglyphs, steam, ocean, 4000, house, volunteer, museums, coast, holei, chain, jagger39s, plan, iki, till, days, restaurant]"
1,3.0,Dry Tortugas National Park,"[u'tortugas', u'snorkeling', u'fort', u'seaplane', u'ferry', u'boat', u'yankee', u'jefferson', u'snorkel', u'key', u'island', u'freedom', u'plane', u'breakfast', u'sea', u'fish', u'beach', u'crew', u'keys', u'iii']","[snorkeling, dr, snorkeled, snorkel, seaplane, overnight, mudd, 170, swim, jellyfish, sea, torugas, np, fishes, ferry, clipper, cat, middle, reef, park]"
2,4.0,Bryce Canyon,"[u'bryce', u'canyon', u'hoodoos', u'zion', u'rim', u'navajo', u'formations', u'utah', u'sunrise', u'shuttle', u'queens', u'queen39s', u'garden', u'rainbow', u'canyons', u'inspiration', u'peekaboo', u'hoodoo', u'viewpoints', u'fairyland']","[queens, hoodoos, navajoqueens, 8000, quothoodoosquot, fairytale, fairyland, sculptures, prairie, pronghorn, fins, paria, 39hoodoos39, pillars, inspiration, navaho, spike, bryce, 63, terracotta]"
3,5.0,Balboa Park,"[u'balboa', u'museums', u'diego', u'san', u'zoo', u'gardens', u'architecture', u'garden', u'organ', u'prado', u'botanical', u'art', u'buildings', u'japanese', u'spanish', u'science', u'rose', u'sd', u'restaurants', u'globe']","[balboa, organ, prado, museums, fleet, friendship, cottages, architecture, globe, man, village, spanish, imax, aerospace, timken, buildings, tuesdays, international, automotive, theater]"
4,6.0,Arches National Park,"[u'arches', u'arch', u'delicate', u'moab', u'formations', u'utah', u'furnace', u'fiery', u'windows', u'double', u'garden', u'canyonlands', u'devil39s', u'devils', u'sunrise', u'balanced', u'avenue', u'dune', u'sand', u'canyon']","[delicate, hikes, garden, arches, windows, words, traffic, park, walks, earth, window, days, parks, quotwindowsquot, absolute, concentration, eyes, planet, cd, 2000]"
5,8.0,Grand Canyon National Park,"[u'canyon', u'rim', u'vegas', u'helicopter', u'las', u'shuttle', u'angel', u'justice', u'bucket', u'buses', u'sunrise', u'gc', u'skywalk', u'bright', u'mather', u'flagstaff', u'hoover', u'kaibab', u'tovar', u'lodge']","[justiceplease, canyonwas, inlaw, inspiringi, peoplejust, fulfill, confronts, beholding, logde, volunteering, marketplace, beliefs, funfilled, luke, throws, colter39s, comercialised, 7000ft, godo, unspeakably]"
6,9.0,Mount Desert Island,"[u'acadia', u'harbor', u'island', u'cadillac', u'bar', u'desert', u'maine', u'mdi', u'mount', u'carriage', u'jordan', u'ocean', u'pond', u'lobster', u'mt', u'restaurants', u'biking', u'towns', u'coast', u'harbors']","[island, mdi, restaurants, shopping, harbors, fjord, stores, towns, outdoorsy, seal, harbours, touristy, quaint, southwest, june, adventures, dislike, wilderness, town, lakes]"
7,10.0,Acadia National Park,"[u'acadia', u'cadillac', u'jordan', u'carriage', u'harbor', u'pond', u'beach', u'bar', u'ocean', u'thunder', u'maine', u'hole', u'sand', u'popovers', u'sunrise', u'biking', u'jordan39s', u'coast', u'island', u'mt']","[100th, park, parks, celebrating, midoctober, specifically, significant, proud, we39ve, beginners, treasure, parkthe, birthday, hiker, roadway, raining, rode, inches, maintenance, tremendous]"
8,11.0,Hersheypark,"[u'hershey', u'rides', u'coasters', u'hersheypark', u'roller', u'amusement', u'chocolate', u'lines', u'coaster', u'waterpark', u'classpartialentryspan', u'tickets', u'theme', u'candylane', u'zoo', u'disney', u'christmas', u'boardwalk', u'kiddie', u'rollercoasters']","[hershey, hersheypark, chocolate, candylane, preview, sweetest, skyrush, zooamerica, hershey39s, boardwalk, factory, hersheyparkspan, springtime, fahrenheit, hersey, classpartialentryspan, runner, sweet, lodge, manager]"
9,12.0,Yosemite Valley,"[u'yosemite', u'falls', u'waterfalls', u'capitan', u'dome', u'el', u'glacier', u'shuttle', u'granite', u'curry', u'village', u'floor', u'merced', u'mariposa', u'meadows', u'tioga', u'vernal', u'san', u'francisco', u'tunnel']","[bikes, valley, merced, bike, mirror, falls, cliffs, biking, waterfalls, april, stayed, curry, pines, paradise, shuttles, fresno, photographer, spring, visited, lodging]"


In [None]:
feature