In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import string
import json
pd.options.mode.chained_assignment = None  # default='warn'
from merge_main import open_pickle, save_to_pickle
from import_yelp_mongo import get_yelp_reviews, get_yelp_reviews_afterdate

%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score,\
                            confusion_matrix, classification_report, mean_squared_error
from sklearn.grid_search import GridSearchCV
import yelp_tfidf as lib_tfidf
import model_phoenix as lib



In [2]:
from IPython.display import display

In [20]:
reload(lib)

<module 'model_phoenix' from 'model_phoenix.py'>

# Model, Create X, y:

# Phoenix, AZ -- Classification:

In [8]:
%%time
df_AZ, tfs, tfs_vocab, tfs_h, tfs_h_vocab, A_labels, A_vocab = lib.unpack_data('../data/phx/model_data_phx.pkl')

col1 = ['rev_ct','neg_ct','stars_avg','rev_len_avg','stars_var']
col2 = ['rev_ct','neg_ct','stars_avg','rev_len_avg','stars_var','n_hygiene','n_service',
        'n_location','n_food','n_premise','n_quality','n_value']

HM = lib.HealthModel(df_AZ, tfs, tfs_h, tfs_vocab=tfs_vocab, tfs_h_vocab=tfs_h_vocab)

CPU times: user 21.2 s, sys: 3.68 s, total: 24.9 s
Wall time: 27.9 s


In [9]:
df_AZ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 0 to 23486
Data columns (total 31 columns):
business_id       23487 non-null object
id_               23487 non-null object
date              23487 non-null object
inspec_id         23487 non-null object
n_priority        23487 non-null object
grade             23487 non-null object
purpose           23487 non-null object
n_violations      16427 non-null float64
v_core            23487 non-null int64
sum_core          23487 non-null int64
v_foundation      23487 non-null int64
sum_foundation    23487 non-null int64
v_priority        23487 non-null int64
sum_priority      23487 non-null int64
date_start        23487 non-null datetime64[ns]
rev_ct            23487 non-null int64
neg_ct            23487 non-null int64
stars_avg         23487 non-null float64
rev_len_avg       23487 non-null float64
stars_var         23487 non-null float64
text              23487 non-null object
aspects           23487 non-null object
sentenc

In [12]:
model_rfc = RandomForestClassifier(oob_score=True, 
                                   random_state = 981, 
                                   class_weight='balanced',
                                   n_jobs=-1)
model_log = LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=981)
model_svc = LinearSVC(C=0.19, random_state = 981, class_weight='balanced')

models_unoptimized_c = [model_rfc, model_log, model_svc]

In [11]:
model_rfr = RandomForestRegressor(oob_score=True, 
                                  random_state = 981,
                                  max_features='sqrt',
                                  n_jobs=-1)
model_lin = LinearRegression(n_jobs=-1)
model_svr = LinearSVR(C=0.19, random_state = 981)

## RELOAD CODE

In [21]:
reload(lib)

HM = lib.HealthModel(df_AZ, tfs, tfs_h, tfs_vocab=tfs_vocab, tfs_h_vocab=tfs_h_vocab)

### Models Using Original Features

#### Un-optimized

In [22]:
%%time
models = [model_rfc, model_log, model_svc]

# Unoptimized, original features
display(HM.compare_models(models, ['unoptimized' for _ in models], ['Original Features' for _ in models], col=col1))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier,unoptimized,Original Features
2,944,455,5567,81,0.801476,0.10378,0.198524,0.151119,0.079024,RandomForestClassifier,unoptimized,Original Features
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier,unoptimized,Original Features
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier,unoptimized,Original Features
5,23,5,7019,0,0.996027,0.0,0.003973,0.0,0.0,RandomForestClassifier,unoptimized,Original Features
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,unoptimized,Original Features
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression,unoptimized,Original Features
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression,unoptimized,Original Features
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression,unoptimized,Original Features
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression,unoptimized,Original Features


CPU times: user 12.9 s, sys: 292 ms, total: 13.2 s
Wall time: 13.6 s


#### Optimized (Gridsearch CV best parameters for `sum_priority >= 2`)

In [23]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 4,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 2,
                                     'n_estimators': 200,
                                     'random_state': 981,
                                     'n_jobs': -1})]#, 
#           model_log, model_svc]

# optimized, original features -- optimized w/ scoring = precision
display(HM.compare_models(models, ['optimized - precison' for _ in models], ['Original Features' for _ in models],  col=col1))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1699,1513,2585,1250,0.544203,0.437675,0.455797,0.452407,0.423872,RandomForestClassifier,optimized - precison,Original Features
2,540,2468,3554,485,0.573152,0.243841,0.426848,0.16424,0.473171,RandomForestClassifier,optimized - precison,Original Features
3,166,2960,3753,168,0.556407,0.097054,0.443593,0.053708,0.502994,RandomForestClassifier,optimized - precison,Original Features
4,47,2485,4466,49,0.640698,0.037262,0.359302,0.019337,0.510417,RandomForestClassifier,optimized - precison,Original Features
5,15,1552,5472,8,0.777636,0.010107,0.222364,0.005128,0.347826,RandomForestClassifier,optimized - precison,Original Features
6,7,176,6864,0,0.974032,0.0,0.025968,0.0,0.0,RandomForestClassifier,optimized - precison,Original Features


CPU times: user 20.8 s, sys: 1.06 s, total: 21.8 s
Wall time: 11.8 s


In [24]:
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 10,
                                     'max_features': 'log2',
                                     'min_samples_leaf': 2,
                                     'n_estimators': 200,
                                     'random_state': 981,
                                     'n_jobs': -1})]#, 
#           model_log, model_svc]

# optimized, original features -- optimized w/ scoring = accuracy
display(HM.compare_models(models, ['optimized - accuracy' for _ in models], ['Original Features' for _ in models],  col=col1))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1531,1705,2393,1418,0.540798,0.467062,0.459202,0.454051,0.480841,RandomForestClassifier,optimized - accuracy,Original Features
2,727,1415,4607,298,0.696041,0.217677,0.303959,0.173964,0.290732,RandomForestClassifier,optimized - accuracy,Original Features
3,283,820,5893,51,0.843479,0.084647,0.156521,0.058553,0.152695,RandomForestClassifier,optimized - accuracy,Original Features
4,92,126,6825,4,0.969065,0.035398,0.030935,0.030769,0.041667,RandomForestClassifier,optimized - accuracy,Original Features
5,23,23,7001,0,0.993472,0.0,0.006528,0.0,0.0,RandomForestClassifier,optimized - accuracy,Original Features
6,7,2,7038,0,0.998723,0.0,0.001277,0.0,0.0,RandomForestClassifier,optimized - accuracy,Original Features


In [25]:
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'entropy',
                                     'max_depth': 2,
                                     'max_features': 'log2',
                                     'min_samples_leaf': 4,
                                     'n_estimators': 100,
                                     'random_state': 981,
                                     'n_jobs': -1})]#, 
#           model_log, model_svc]

# optimized, original features -- optimized w/ scoring = recall
display(HM.compare_models(models, ['optimized - recall' for _ in models], ['Original Features' for _ in models],  col=col1))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1942,1208,2890,1007,0.553001,0.390008,0.446999,0.454628,0.341472,RandomForestClassifier,optimized - recall,Original Features
2,504,2659,3363,521,0.551157,0.2478,0.448843,0.163836,0.508293,RandomForestClassifier,optimized - recall,Original Features
3,139,3419,3294,195,0.495104,0.098784,0.504896,0.053957,0.583832,RandomForestClassifier,optimized - recall,Original Features
4,35,3273,3678,61,0.53058,0.035569,0.46942,0.018296,0.635417,RandomForestClassifier,optimized - recall,Original Features
5,10,3209,3815,13,0.54321,0.008012,0.45679,0.004035,0.565217,RandomForestClassifier,optimized - recall,Original Features
6,6,878,6162,1,0.874557,0.002257,0.125443,0.001138,0.142857,RandomForestClassifier,optimized - recall,Original Features


In [26]:
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 2,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 4,
                                     'n_estimators': 200,
                                     'random_state': 981,
                                     'n_jobs': -1})]#, 
#           model_log, model_svc]

# optimized, original features -- optimized w/ scoring = f1 score
display(HM.compare_models(models, ['optimized - f1' for _ in models], ['Original Features' for _ in models],  col=col1))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1943,1206,2892,1006,0.553143,0.389847,0.446857,0.454792,0.341133,RandomForestClassifier,optimized - f1,Original Features
2,507,2629,3393,518,0.554988,0.248322,0.445012,0.164601,0.505366,RandomForestClassifier,optimized - f1,Original Features
3,138,3484,3229,196,0.486022,0.097658,0.513978,0.053261,0.586826,RandomForestClassifier,optimized - f1,Original Features
4,35,3262,3689,61,0.532141,0.035683,0.467859,0.018357,0.635417,RandomForestClassifier,optimized - f1,Original Features
5,10,2930,4094,13,0.582801,0.008766,0.417199,0.004417,0.565217,RandomForestClassifier,optimized - f1,Original Features
6,5,822,6218,2,0.882645,0.004813,0.117355,0.002427,0.285714,RandomForestClassifier,optimized - f1,Original Features


### Models using Original TFIDF feature matrix (5000 top words)

#### Unoptimized

In [27]:
%%time
models = [model_rfc, model_log, model_svc]

# Unoptimized, tfidf features
display(HM.compare_models(models, ['unoptimized' for _ in models], ['Original TFIDF' for _ in models], tfs=True))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2177,814,3284,772,0.575564,0.340463,0.424436,0.486759,0.261784,RandomForestClassifier,unoptimized,Original TFIDF
2,1005,41,5981,20,0.851568,0.036832,0.148432,0.327869,0.019512,RandomForestClassifier,unoptimized,Original TFIDF
3,333,10,6703,1,0.951327,0.005797,0.048673,0.090909,0.002994,RandomForestClassifier,unoptimized,Original TFIDF
4,95,3,6948,1,0.986093,0.02,0.013907,0.25,0.010417,RandomForestClassifier,unoptimized,Original TFIDF
5,23,2,7022,0,0.996452,0.0,0.003548,0.0,0.0,RandomForestClassifier,unoptimized,Original TFIDF
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,unoptimized,Original TFIDF
1,1330,1680,2418,1619,0.572868,0.518246,0.427132,0.490755,0.549,LogisticRegression,unoptimized,Original TFIDF
2,558,1800,4222,467,0.66539,0.283718,0.33461,0.205999,0.45561,LogisticRegression,unoptimized,Original TFIDF
3,231,1158,5555,103,0.802895,0.129154,0.197105,0.081681,0.308383,LogisticRegression,unoptimized,Original TFIDF
4,80,419,6532,16,0.92919,0.060264,0.07081,0.036782,0.166667,LogisticRegression,unoptimized,Original TFIDF


CPU times: user 1min 32s, sys: 32.2 s, total: 2min 4s
Wall time: 1min 37s


#### Optimized (Gridsearch CV best parameters for `sum_priority >= 2`)

In [28]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 10,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 3,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981
                                   })]#, 
          #model_log, model_svc]

# Optimized, tfidf features -- precision
display(HM.compare_models(models, ['optimized - precision' for _ in models], ['Original TFIDF' for _ in models],
                          tfs=True))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1706,1257,2841,1243,0.579537,0.456231,0.420463,0.4972,0.421499,RandomForestClassifier,optimized - precision,Original TFIDF
2,929,301,5721,96,0.825458,0.135021,0.174542,0.241814,0.093659,RandomForestClassifier,optimized - precision,Original TFIDF
3,330,26,6687,4,0.949482,0.021978,0.050518,0.133333,0.011976,RandomForestClassifier,optimized - precision,Original TFIDF
4,95,3,6948,1,0.986093,0.02,0.013907,0.25,0.010417,RandomForestClassifier,optimized - precision,Original TFIDF
5,23,1,7023,0,0.996594,0.0,0.003406,0.0,0.0,RandomForestClassifier,optimized - precision,Original TFIDF
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,optimized - precision,Original TFIDF


CPU times: user 8min 43s, sys: 23.1 s, total: 9min 6s
Wall time: 3min 23s


In [29]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 3,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 2,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981
                                   })]#, 
          #model_log, model_svc]

# Optimized, tfidf features -- f1 score
display(HM.compare_models(models, ['optimized - f1' for _ in models], ['Original TFIDF' for _ in models],
                          tfs=True))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1588,1551,2547,1361,0.554562,0.464426,0.445438,0.467376,0.461512,RandomForestClassifier,optimized - f1,Original TFIDF
2,551,2140,3882,474,0.618135,0.260511,0.381865,0.181331,0.462439,RandomForestClassifier,optimized - f1,Original TFIDF
3,208,1777,4936,126,0.71832,0.112651,0.28168,0.066211,0.377246,RandomForestClassifier,optimized - f1,Original TFIDF
4,75,622,6329,21,0.901093,0.056834,0.098907,0.032659,0.21875,RandomForestClassifier,optimized - f1,Original TFIDF
5,23,10,7014,0,0.995317,0.0,0.004683,0.0,0.0,RandomForestClassifier,optimized - f1,Original TFIDF
6,7,0,7040,0,0.999007,0.0,0.000993,0.0,0.0,RandomForestClassifier,optimized - f1,Original TFIDF


CPU times: user 3min 10s, sys: 16.2 s, total: 3min 26s
Wall time: 1min 30s


## Topic Segmentation (Classify topic of each sentence)

### Models using Topic summary features:

#### Unoptimized

In [30]:
%%time
models = [model_rfc, model_log, model_svc]

# Unoptimized, tfidf features
display(HM.compare_models(models, ['unoptimized' for _ in models], ['Health TFIDF' for _ in models], tfs_h=True))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,type,features
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2641,343,3755,308,0.576557,0.171111,0.423443,0.473118,0.104442,RandomForestClassifier,unoptimized,Health TFIDF
2,1022,36,5986,3,0.849865,0.005639,0.150135,0.076923,0.002927,RandomForestClassifier,unoptimized,Health TFIDF
3,334,24,6689,0,0.949198,0.0,0.050802,0.0,0.0,RandomForestClassifier,unoptimized,Health TFIDF
4,96,12,6939,0,0.984674,0.0,0.015326,0.0,0.0,RandomForestClassifier,unoptimized,Health TFIDF
5,23,4,7020,0,0.996169,0.0,0.003831,0.0,0.0,RandomForestClassifier,unoptimized,Health TFIDF
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,unoptimized,Health TFIDF
1,2420,629,3469,529,0.567334,0.257609,0.432666,0.456822,0.179383,LogisticRegression,unoptimized,Health TFIDF
2,885,687,5335,140,0.776926,0.151188,0.223074,0.169287,0.136585,LogisticRegression,unoptimized,Health TFIDF
3,305,545,6168,29,0.879381,0.063877,0.120619,0.050523,0.086826,LogisticRegression,unoptimized,Health TFIDF
4,92,209,6742,4,0.957287,0.02589,0.042713,0.018779,0.041667,LogisticRegression,unoptimized,Health TFIDF


CPU times: user 4min 19s, sys: 27.2 s, total: 4min 46s
Wall time: 2min 11s


#### Optimized (Gridsearch CV best parameters for `sum_priority >= 2`)

In [None]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'entropy',
                                     'max_depth': None,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 2,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, 
#           model_log, model_svc]

# Optimized, health tfidf features -- precision
display(HM.compare_models(models, ['optimized - precision' for _ in models], ['Health TFIDF' for _ in models], 
                          tfs_h=True))

In [None]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 10,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 2,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, 
#           model_log, model_svc]

# Optimized, health tfidf features -- accuracy
display(HM.compare_models(models, ['optimized - accuracy' for _ in models], ['Health TFIDF' for _ in models], 
                          tfs_h=True))

In [None]:
%%time
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 2,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 4,
                                     'n_estimators': 100,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, 
#           model_log, model_svc]

# Optimized, health tfidf features -- recall score
display(HM.compare_models(models, ['optimized - recall' for _ in models], ['Health TFIDF' for _ in models], 
                          tfs_h=True))

In [None]:
%%
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 2,
                                     'max_features': 'sqrt',
                                     'min_samples_leaf': 4,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, 
#           model_log, model_svc]

# Optimized, health tfidf features -- f1 score
display(HM.compare_models(models, ['optimized - f1' for _ in models], ['Health TFIDF' for _ in models], 
                          tfs_h=True))

## GRID SEARCH FOR BEST PARAMETERS

### Random Forest Models:

In [52]:
RF = RandomForestClassifier()
param_grid_rfc = {
                'n_estimators':[100, 200],
                'criterion': ['gini','entropy'],
                'max_features': ['sqrt','log2'],
                'class_weight': ['balanced'],
                'max_depth': [2, 3, 4, 10],
                'min_samples_leaf': [2,3,4],
            }

##### Original Summary Features

In [269]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col1, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [32]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col1, scoring='accuracy', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [36]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col1, scoring='recall', t=2)

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 100}

In [42]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col1, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'n_estimators': 200}

##### Original Bag of Words (TFIDF, Vocab = Top 5000)

In [27]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, tfs=True, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

In [49]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, tfs=True, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

##### Topic Summary Features

In [23]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col2, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [30]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col2, scoring='accuracy', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [37]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col2, scoring='recall', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'n_estimators': 100}

In [43]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, col=col2, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 100}

##### Hygiene (Topic) Bag of Words (Vocab = Top 5000)

In [57]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, tfs_h=True, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 200}

In [58]:
%%time
HM.grid_search_classifier(RF, param_grid_rfc, tfs_h=True, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'n_estimators': 100}

### SVC Models

In [34]:
LSVC = LinearSVC()
param_grid_svc = {
                'C': np.logspace(-2, 1, 10),
                'loss': ['squared_hinge'],
                'penalty': ['l1'],
                'dual': [False],
                'random_state': [981]
            }

##### Original Summary Features

In [None]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col1, scoring='precision', t=2)

In [269]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col1, scoring='accuracy', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [269]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col1, scoring='recall', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [269]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col1, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

##### Original Bag of Words (TFIDF, Vocab = Top 5000)

In [27]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, tfs=True, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

In [27]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, tfs=True, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

##### Topic Summary Features

In [None]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col2, scoring='precision', t=2)

In [None]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col2, scoring='accuracy', t=2)

In [None]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col2, scoring='recall', t=2)

In [None]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, col=col2, scoring='f1', t=2)

##### Hygiene (Topic) Bag of Words (Vocab = Top 5000)

In [27]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, tfs_h=True, scoring='precision', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

In [27]:
%%time
HM.grid_search_classifier(LSVC, param_grid_svc, tfs_h=True, scoring='f1', t=2)

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

## Hygiene (Topic) Bag of Words (Vocab = Top 5000)

In [18]:
%%time
tfs_h, tfidf_h = lib_tfidf.yelp_tfidf(sentence_map, 'hygiene_text')

CPU times: user 8.32 s, sys: 635 ms, total: 8.95 s
Wall time: 11.3 s


In [55]:
tfs_h

<23487x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 92155 stored elements in Compressed Sparse Row format>

In [19]:
h_train, h_test = train_test_split(tfs_h, train_size=0.7, random_state=981)

In [249]:
h_train.shape

(16440, 5000)

In [250]:
df_train.shape

(16440, 21)

In [290]:
models = [model_rfc, model_log, model_svc]
col = 'sum_priority'
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name=col
    temp['model'] = str(m.__class__).strip("'>").split('.')[-1]
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2641,343,3755,308,0.576557,0.171111,0.423443,0.473118,0.104442,RandomForestClassifier
2,1022,36,5986,3,0.849865,0.005639,0.150135,0.076923,0.002927,RandomForestClassifier
3,334,24,6689,0,0.949198,0.0,0.050802,0.0,0.0,RandomForestClassifier
4,96,12,6939,0,0.984674,0.0,0.015326,0.0,0.0,RandomForestClassifier
5,23,4,7020,0,0.996169,0.0,0.003831,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,2420,629,3469,529,0.567334,0.257609,0.432666,0.456822,0.179383,LogisticRegression
2,885,687,5335,140,0.776926,0.151188,0.223074,0.169287,0.136585,LogisticRegression
3,305,545,6168,29,0.879381,0.063877,0.120619,0.050523,0.086826,LogisticRegression
4,92,209,6742,4,0.957287,0.02589,0.042713,0.018779,0.041667,LogisticRegression


In [62]:
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'gini',
                                     'max_depth': 3,
                                     'max_features': 'log2',
                                     'min_samples_leaf': 4,
                                     'n_estimators': 200,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, model_log, model_svc]
col = 'sum_priority'
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name=col
    temp['model'] = str(m.__class__).strip("'>").split('.')[-1]
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2289,746,3352,660,0.56932,0.3031,0.43068,0.469417,0.223805,RandomForestClassifier
2,856,808,5214,169,0.763871,0.168831,0.236129,0.172979,0.164878,RandomForestClassifier
3,290,675,6038,44,0.863062,0.083571,0.136938,0.061196,0.131737,RandomForestClassifier
4,92,281,6670,4,0.94707,0.020997,0.05293,0.014035,0.041667,RandomForestClassifier
5,23,97,6927,0,0.982971,0.0,0.017029,0.0,0.0,RandomForestClassifier
6,7,0,7040,0,0.999007,0.0,0.000993,0.0,0.0,RandomForestClassifier


In [64]:
models = [RandomForestClassifier(**{'class_weight': 'balanced',
                                     'criterion': 'entropy',
                                     'max_depth': 2,
                                     'max_features': 'log2',
                                     'min_samples_leaf': 3,
                                     'n_estimators': 100,
                                     'n_jobs': -1,
                                     'random_state': 981})]#, model_log, model_svc]
col = 'sum_priority'
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name=col
    temp['model'] = str(m.__class__).strip("'>").split('.')[-1]
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2540,468,3630,409,0.573152,0.2138,0.426848,0.466363,0.138691,RandomForestClassifier
2,895,613,5409,130,0.786008,0.147059,0.213992,0.174966,0.126829,RandomForestClassifier
3,300,561,6152,34,0.87782,0.073197,0.12218,0.057143,0.101796,RandomForestClassifier
4,91,232,6719,5,0.954165,0.03003,0.045835,0.021097,0.052083,RandomForestClassifier
5,23,138,6886,0,0.977153,0.0,0.022847,0.0,0.0,RandomForestClassifier
6,7,30,7010,0,0.99475,0.0,0.00525,0.0,0.0,RandomForestClassifier


In [65]:
pd.get_dummies()

(16440, 21)

In [29]:
from merge_main import open_pickle, save_to_pickle

In [35]:
d = {'df_AZ':df_AZ_h, 'tfs':tfs, 'tfidf_vocab':tfidf.get_feature_names(), 
     'tfs_h':tfs_h, 'tfidf_h':tfidf_h.get_feature_names(), 
     'labels':labels, 'vocab':vocabulary}
save_to_pickle(d, '../data/phx/model_data_phx.pkl')

In [30]:
sentence_map.shape

(23487, 16)

In [31]:
df_AZ.shape

(23487, 21)

In [32]:
sentence_map.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 0 to 23486
Data columns (total 16 columns):
business_id     23487 non-null object
id_             23487 non-null object
inspec_id       23487 non-null object
n_sentences     23487 non-null int64
tokens          23487 non-null object
s_cumulative    23487 non-null int64
aspects         23487 non-null object
sentences       23487 non-null object
hygiene_text    23487 non-null object
n_food          23487 non-null float64
n_hygiene       23487 non-null float64
n_location      23487 non-null float64
n_premise       23487 non-null float64
n_quality       23487 non-null float64
n_service       23487 non-null float64
n_value         23487 non-null float64
dtypes: float64(7), int64(2), object(7)
memory usage: 3.0+ MB


In [33]:
cols = ['business_id','id_','inspec_id','aspects','sentences','hygiene_text',
        'n_food','n_hygiene','n_location','n_premise','n_quality','n_service','n_value']
df_AZ_h = pd.merge(df_AZ, sentence_map[cols], on=['business_id','id_','inspec_id'])
print df_AZ_h.shape
df_AZ_h.info()

(23487, 31)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 0 to 23486
Data columns (total 31 columns):
business_id       23487 non-null object
id_               23487 non-null object
date              23487 non-null object
inspec_id         23487 non-null object
n_priority        23487 non-null object
grade             23487 non-null object
purpose           23487 non-null object
n_violations      16427 non-null float64
v_core            23487 non-null int64
sum_core          23487 non-null int64
v_foundation      23487 non-null int64
sum_foundation    23487 non-null int64
v_priority        23487 non-null int64
sum_priority      23487 non-null int64
date_start        23487 non-null datetime64[ns]
rev_ct            23487 non-null int64
neg_ct            23487 non-null int64
stars_avg         23487 non-null float64
rev_len_avg       23487 non-null float64
stars_var         23487 non-null float64
text              23487 non-null object
aspects           23487 non-null ob

# USE CLASS FOR ABOVE TASKS:

In [45]:
import model_phoenix as lib

In [57]:
reload(lib)

<module 'model_phoenix' from 'model_phoenix.py'>

In [58]:
HM = lib.HealthModel(df_AZ_h, tfs, tfs_h, tfs_vocab=tfidf.get_feature_names(), tfs_h_vocab=tfidf_h.get_feature_names())

In [59]:
col1 = ['rev_ct','neg_ct','stars_avg','rev_len_avg','stars_var']
col2 = ['rev_ct','neg_ct','stars_avg','rev_len_avg','stars_var','n_hygiene','n_service',
        'n_location','n_food','n_premise','n_quality','n_value']

In [60]:
test = HM.get_features(col=col1)

In [62]:
%%time
# Unoptimized, original features
HM.compare_models([model_rfc, model_log, model_svc], ['unoptimized','unoptimized','unoptimized'],col=col1)

  warn("Some inputs do not have OOB scores. "
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,comments
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier,unoptimized
2,944,455,5567,81,0.801476,0.10378,0.198524,0.151119,0.079024,RandomForestClassifier,unoptimized
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier,unoptimized
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier,unoptimized
5,23,5,7019,0,0.996027,0.0,0.003973,0.0,0.0,RandomForestClassifier,unoptimized
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,unoptimized
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression,unoptimized
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression,unoptimized
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression,unoptimized
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression,unoptimized


In [63]:
%%time
col = [col1, None, col2, None]
tfs = [False, True, False, False]
tfs_h = [False, False, False, True]
tags = ['Original Features', 'All TFIDF', 'Topic Summary Features', 'Health TFIDF']
results = []
models = [model_rfc, model_log, model_svc]
model_comments = ['unoptimized','unoptimized','unoptimized']
for i, j, k, l in zip(col, tfs, tfs_h, tags):
    model_tags = [x+', '+l for x in model_comments]
    results.append(HM.compare_models(models, model_tags, col=i, tfs=j, tfs_h=k))
    
display(pd.concat(results))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,comments
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier,"unoptimized, Original Features"
2,944,455,5567,81,0.801476,0.103780,0.198524,0.151119,0.079024,RandomForestClassifier,"unoptimized, Original Features"
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier,"unoptimized, Original Features"
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier,"unoptimized, Original Features"
5,23,5,7019,0,0.996027,0.000000,0.003973,0.000000,0.000000,RandomForestClassifier,"unoptimized, Original Features"
6,7,1,7039,0,0.998865,0.000000,0.001135,0.000000,0.000000,RandomForestClassifier,"unoptimized, Original Features"
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.448910,0.446931,LogisticRegression,"unoptimized, Original Features"
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression,"unoptimized, Original Features"
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression,"unoptimized, Original Features"
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.500000,LogisticRegression,"unoptimized, Original Features"


CPU times: user 6min 20s, sys: 1min 6s, total: 7min 27s
Wall time: 4min 39s


In [None]:
param_grid_rf = {
                'n_estimators':[100, 200],
                'criterion': ['gini','entropy'],
                'max_features': ['sqrt','log2'],
                'class_weight': ['balanced'],
                'max_depth': [2, 3, 4, 10],
                'min_samples_leaf': [2,3,4],
            }

In [64]:
%%time
HM.grid_search_classifier(RandomForestClassifier(), param_grid_rf, col=col1, tfs=False, tfs_h=False, scoring='f1', t=2)

NameError: name 'param_grid_rf' is not defined