In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import string
import json
pd.options.mode.chained_assignment = None  # default='warn'
from merge_main import open_pickle, save_to_pickle
from import_yelp_mongo import get_yelp_reviews, get_yelp_reviews_afterdate

%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score,\
                            confusion_matrix, classification_report, mean_squared_error
from sklearn.grid_search import GridSearchCV
import yelp_tfidf as lib_tfidf

In [2]:
from IPython.display import display

# Modeling: build the feature matrix X and the target y

# Phoenix, AZ -- Classification:

In [3]:
df_AZ = open_pickle('../data/phx/phoenix_yelp_features.pkl')

In [4]:
df_AZ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 2 to 37535
Data columns (total 21 columns):
business_id       23487 non-null object
id_               23487 non-null object
date              23487 non-null object
inspec_id         23487 non-null object
n_priority        23487 non-null object
grade             23487 non-null object
purpose           23487 non-null object
n_violations      16427 non-null float64
v_core            23487 non-null int64
sum_core          23487 non-null int64
v_foundation      23487 non-null int64
sum_foundation    23487 non-null int64
v_priority        23487 non-null int64
sum_priority      23487 non-null int64
date_start        23487 non-null datetime64[ns]
rev_ct            23487 non-null int64
neg_ct            23487 non-null int64
stars_avg         23487 non-null float64
rev_len_avg       23487 non-null float64
stars_var         23487 non-null float64
text              23487 non-null object
dtypes: datetime64[ns](1), float64(4), int64(8)

In [5]:
# Date coverage of the data. `date` has object dtype, so it is parsed once
# before taking the extremes; `date_start` is already datetime64.
inspection_dates = pd.to_datetime(df_AZ.date)
print(inspection_dates.min())
print(inspection_dates.max())
print(df_AZ.date_start.min())
print(df_AZ.date_start.max())

2012-10-23 00:00:00
2015-10-14 00:00:00
2012-04-24 00:00:00
2015-01-08 00:00:00


In [6]:
def create_target(df, col, t):
    """Binarize ``df[col]``: 1 where the value is at least ``t``, otherwise 0.

    Returns the labels as a NumPy integer array.
    """
    meets_threshold = df[col] >= t
    return meets_threshold.astype(int).values

def create_model_inputs(df, col, t):
    """Build (X, y) from the five review-summary columns.

    X is the array of the columns listed in ``feature_cols`` (in that order);
    y is the binary target ``create_target(df, col, t)``.
    """
    feature_cols = ['rev_ct', 'neg_ct', 'stars_avg', 'rev_len_avg', 'stars_var']
    features = df[feature_cols].values
    target = create_target(df, col, t)
    return features, target

def create_model_inputs2(df, col, t):
    """Build (X, y) from the review-summary columns plus the topic counts.

    X is the array of the columns listed in ``feature_cols`` (in that order);
    y is the binary target ``create_target(df, col, t)``.
    """
    feature_cols = [
        'rev_ct', 'neg_ct', 'stars_avg', 'rev_len_avg', 'stars_var',
        'n_hygiene', 'n_service', 'n_location', 'n_food', 'n_premise',
        'n_quality', 'n_value',
    ]
    features = df[feature_cols].values
    target = create_target(df, col, t)
    return features, target

def get_model_inputs(df_train, df_test, col, t, tfs=None):
    """Return ``(X_train, y_train, X_test, y_test)`` for threshold ``t``.

    Features and targets come from ``create_model_inputs``. When ``tfs`` is
    given, ``tfs[0]`` and ``tfs[1]`` are returned as the train and test
    feature matrices instead; the targets are unaffected.
    """
    train_X, train_y = create_model_inputs(df_train, col, t)
    test_X, test_y = create_model_inputs(df_test, col, t)
    if tfs is None:
        return train_X, train_y, test_X, test_y
    # caller-supplied matrices replace the frame-derived features
    return tfs[0], train_y, tfs[1], test_y

def get_model_inputs2(df_train, df_test, col, t, tfs=None):
    """Return ``(X_train, y_train, X_test, y_test)`` for threshold ``t``.

    Same as ``get_model_inputs`` but built on ``create_model_inputs2``.
    When ``tfs`` is given, ``tfs[0]`` and ``tfs[1]`` are returned as the
    train and test feature matrices instead; the targets are unaffected.
    """
    train_X, train_y = create_model_inputs2(df_train, col, t)
    test_X, test_y = create_model_inputs2(df_test, col, t)
    if tfs is None:
        return train_X, train_y, test_X, test_y
    # caller-supplied matrices replace the frame-derived features
    return tfs[0], train_y, tfs[1], test_y
    

def print_metrics(y_true, y_pred):
    """Print accuracy, precision, recall, F1, MSE and the confusion matrix.

    Each metric is computed from ``y_true`` and ``y_pred`` with the
    scikit-learn function of the same name and written on its own line.
    Every ``print`` takes one parenthesised argument, so the function runs
    unchanged on Python 2 and Python 3.
    """
    print('%.4f   : Accuracy' % accuracy_score(y_true, y_pred))
    print('%.4f   : Precision' % precision_score(y_true, y_pred))
    print('%.4f   : Recall' % recall_score(y_true, y_pred))
    print('%.4f   : f1_score' % f1_score(y_true, y_pred))
    print('%.4f   : MSE' % mean_squared_error(y_true, y_pred))
    print('%s   : Confusion matrix' % confusion_matrix(y_true, y_pred))
    
def save_metrics(y_true, y_pred):
    """Collect binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; expected to be the 0/1 targets produced
        by ``create_target``.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1', 'mse', plus the four
        confusion-matrix cells 'TN', 'FP', 'FN' and 'TP'.
    """
    d = {'accuracy': accuracy_score(y_true, y_pred),
         'precision': precision_score(y_true, y_pred),
         'recall': recall_score(y_true, y_pred),
         'f1': f1_score(y_true, y_pred),
         'mse': mean_squared_error(y_true, y_pred)}
    # Pin the label order to [0, 1]. Without it confusion_matrix returns a
    # 1x1 matrix when only one class occurs in y_true and y_pred, and the
    # cell lookups below would raise IndexError.
    CM = confusion_matrix(y_true, y_pred, labels=[0, 1])
    d.update({'TN': CM[0, 0],
              'FP': CM[0, 1],
              'FN': CM[1, 0],
              'TP': CM[1, 1],
             })
    return d

def train_classifier(model, df_train, df_test, col, t, tfs=None):
    """Fit ``model`` for threshold ``t`` and score it on the test split.

    Inputs are built with ``get_model_inputs``. Returns a one-entry dict
    mapping ``t`` to the metrics dict from ``save_metrics``.
    """
    train_X, train_y, test_X, test_y = get_model_inputs(
        df_train, df_test, col, t, tfs)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return {t: save_metrics(test_y, predictions)}

def model_classifier(model, df_train, df_test, col, val_range, tfs=None):
    """Run ``train_classifier`` for every threshold in ``val_range``.

    Returns a DataFrame with one row per threshold and one column per metric.
    """
    metrics_by_threshold = {}
    for threshold in val_range:
        scored = train_classifier(model, df_train, df_test, col, threshold, tfs)
        metrics_by_threshold.update(scored)
    # from_records puts thresholds in the columns; transpose so they index rows
    return pd.DataFrame.from_records(metrics_by_threshold).T

def train_classifier2(model, df_train, df_test, col, t, tfs=None):
    """Fit ``model`` for threshold ``t`` and score it on the test split.

    Inputs are built with ``get_model_inputs2``. Returns a one-entry dict
    mapping ``t`` to the metrics dict from ``save_metrics``.
    """
    train_X, train_y, test_X, test_y = get_model_inputs2(
        df_train, df_test, col, t, tfs)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return {t: save_metrics(test_y, predictions)}

def model_classifier2(model, df_train, df_test, col, val_range, tfs=None):
    """Run ``train_classifier2`` for every threshold in ``val_range``.

    Returns a DataFrame with one row per threshold and one column per metric.
    """
    metrics_by_threshold = {}
    for threshold in val_range:
        scored = train_classifier2(model, df_train, df_test, col, threshold, tfs)
        metrics_by_threshold.update(scored)
    # from_records puts thresholds in the columns; transpose so they index rows
    return pd.DataFrame.from_records(metrics_by_threshold).T


In [7]:
# Baseline classifiers: all use balanced class weights and the seed 981.
model_rfc = RandomForestClassifier(
    class_weight='balanced', oob_score=True, n_jobs=-1, random_state=981)
model_log = LogisticRegression(
    class_weight='balanced', random_state=981, n_jobs=-1)
model_svc = LinearSVC(
    class_weight='balanced', C=0.19, random_state=981)

In [8]:
# Regression counterparts of the baseline classifiers (seed 981 where the
# estimator takes one).
model_rfr = RandomForestRegressor(
    max_features='sqrt', oob_score=True, n_jobs=-1, random_state=981)
model_lin = LinearRegression(n_jobs=-1)
model_svr = LinearSVR(random_state=981, C=0.19)

In [9]:
reload(lib_tfidf)

<module 'yelp_tfidf' from 'yelp_tfidf.pyc'>

In [10]:
%%time
tfs, tfidf = lib_tfidf.yelp_tfidf(df_AZ,'text')

CPU times: user 5min 31s, sys: 6.19 s, total: 5min 37s
Wall time: 5min 40s


In [11]:
# Split the rows of df_AZ and of `tfs` in one call so that the frame halves
# and the matrix halves are partitioned together: 70% train, seed 981.
df_train, df_test, tfs_train, tfs_test = train_test_split(df_AZ, tfs, 
                                                          train_size=0.7, random_state=981)

### Models Using Original Features

#### Unoptimized

In [46]:
# Baseline comparison on the review-summary features: fit every classifier
# once per `sum_priority >= t` threshold (t = 1..6) and stack the metric
# tables, tagging each row with the class name of the model that produced it.
models = [model_rfc, model_log, model_svc]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier
2,944,455,5567,81,0.801476,0.10378,0.198524,0.151119,0.079024,RandomForestClassifier
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier
5,23,5,7019,0,0.996027,0.0,0.003973,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression


#### Optimized (`GridSearchCV` best parameters for `sum_priority >= 2`)

In [285]:
# Review-summary features, tuned random forest (depth 4, sqrt features,
# 200 trees) next to the untuned logistic-regression and linear-SVC baselines.
# Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=4,
                                 max_features='sqrt',
                                 min_samples_leaf=2,
                                 n_estimators=200,
                                 random_state=981,
                                 n_jobs=-1),
          model_log, model_svc]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1699,1513,2585,1250,0.544203,0.437675,0.455797,0.452407,0.423872,RandomForestClassifier
2,540,2468,3554,485,0.573152,0.243841,0.426848,0.16424,0.473171,RandomForestClassifier
3,166,2960,3753,168,0.556407,0.097054,0.443593,0.053708,0.502994,RandomForestClassifier
4,47,2485,4466,49,0.640698,0.037262,0.359302,0.019337,0.510417,RandomForestClassifier
5,15,1552,5472,8,0.777636,0.010107,0.222364,0.005128,0.347826,RandomForestClassifier
6,7,176,6864,0,0.974032,0.0,0.025968,0.0,0.0,RandomForestClassifier
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression


In [34]:
# Review-summary features, random-forest variant (depth 10, log2 features,
# 200 trees) next to the logistic-regression and linear-SVC baselines.
# Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=10,
                                 max_features='log2',
                                 min_samples_leaf=2,
                                 n_estimators=200,
                                 random_state=981,
                                 n_jobs=-1),
          model_log, model_svc]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1531,1705,2393,1418,0.540798,0.467062,0.459202,0.454051,0.480841,RandomForestClassifier
2,727,1415,4607,298,0.696041,0.217677,0.303959,0.173964,0.290732,RandomForestClassifier
3,283,820,5893,51,0.843479,0.084647,0.156521,0.058553,0.152695,RandomForestClassifier
4,92,126,6825,4,0.969065,0.035398,0.030935,0.030769,0.041667,RandomForestClassifier
5,23,23,7001,0,0.993472,0.0,0.006528,0.0,0.0,RandomForestClassifier
6,7,2,7038,0,0.998723,0.0,0.001277,0.0,0.0,RandomForestClassifier
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression


In [38]:
# Review-summary features, random-forest variant only (entropy criterion,
# depth 2, log2 features, 100 trees). Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='entropy',
                                 max_depth=2,
                                 max_features='log2',
                                 min_samples_leaf=4,
                                 n_estimators=100,
                                 random_state=981,
                                 n_jobs=-1)]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1942,1208,2890,1007,0.553001,0.390008,0.446999,0.454628,0.341472,RandomForestClassifier
2,504,2659,3363,521,0.551157,0.2478,0.448843,0.163836,0.508293,RandomForestClassifier
3,139,3419,3294,195,0.495104,0.098784,0.504896,0.053957,0.583832,RandomForestClassifier
4,35,3273,3678,61,0.53058,0.035569,0.46942,0.018296,0.635417,RandomForestClassifier
5,10,3209,3815,13,0.54321,0.008012,0.45679,0.004035,0.565217,RandomForestClassifier
6,6,878,6162,1,0.874557,0.002257,0.125443,0.001138,0.142857,RandomForestClassifier


In [44]:
# Review-summary features, random-forest variant only (gini criterion,
# depth 2, sqrt features, 200 trees). Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=2,
                                 max_features='sqrt',
                                 min_samples_leaf=4,
                                 n_estimators=200,
                                 random_state=981,
                                 n_jobs=-1)]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1943,1206,2892,1006,0.553143,0.389847,0.446857,0.454792,0.341133,RandomForestClassifier
2,507,2629,3393,518,0.554988,0.248322,0.445012,0.164601,0.505366,RandomForestClassifier
3,138,3484,3229,196,0.486022,0.097658,0.513978,0.053261,0.586826,RandomForestClassifier
4,35,3262,3689,61,0.532141,0.035683,0.467859,0.018357,0.635417,RandomForestClassifier
5,10,2930,4094,13,0.582801,0.008766,0.417199,0.004417,0.565217,RandomForestClassifier
6,5,822,6218,2,0.882645,0.004813,0.117355,0.002427,0.285714,RandomForestClassifier


### Models using the original TF-IDF feature matrix (5000 top words)

#### Unoptimized

In [47]:
# Same baseline comparison, but with the dense TF-IDF matrices as features:
# passing `unigram` as `tfs` makes model_classifier use them in place of the
# frame-derived columns. Thresholds t = 1..6 on `sum_priority`.
models = [model_rfc, model_log, model_svc]
col = 'sum_priority'
unigram = [tfs_train.todense(), tfs_test.todense()]

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7), unigram)
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2177,814,3284,772,0.575564,0.340463,0.424436,0.486759,0.261784,RandomForestClassifier
2,1005,41,5981,20,0.851568,0.036832,0.148432,0.327869,0.019512,RandomForestClassifier
3,333,10,6703,1,0.951327,0.005797,0.048673,0.090909,0.002994,RandomForestClassifier
4,95,3,6948,1,0.986093,0.02,0.013907,0.25,0.010417,RandomForestClassifier
5,23,2,7022,0,0.996452,0.0,0.003548,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,1330,1680,2418,1619,0.572868,0.518246,0.427132,0.490755,0.549,LogisticRegression
2,558,1800,4222,467,0.66539,0.283718,0.33461,0.205999,0.45561,LogisticRegression
3,231,1158,5555,103,0.802895,0.129154,0.197105,0.081681,0.308383,LogisticRegression
4,80,419,6532,16,0.92919,0.060264,0.07081,0.036782,0.166667,LogisticRegression


#### Optimized (`GridSearchCV` best parameters for `sum_priority >= 2`)

In [50]:
# TF-IDF features, random-forest variant only (depth 10, sqrt features,
# min_samples_leaf 3, 200 trees). Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=10,
                                 max_features='sqrt',
                                 min_samples_leaf=3,
                                 n_estimators=200,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'
unigram = [tfs_train.todense(), tfs_test.todense()]

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7), unigram)
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1706,1257,2841,1243,0.579537,0.456231,0.420463,0.4972,0.421499,RandomForestClassifier
2,929,301,5721,96,0.825458,0.135021,0.174542,0.241814,0.093659,RandomForestClassifier
3,330,26,6687,4,0.949482,0.021978,0.050518,0.133333,0.011976,RandomForestClassifier
4,95,3,6948,1,0.986093,0.02,0.013907,0.25,0.010417,RandomForestClassifier
5,23,1,7023,0,0.996594,0.0,0.003406,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier


In [51]:
# TF-IDF features, random-forest variant only (depth 3, sqrt features,
# min_samples_leaf 2, 200 trees). Thresholds t = 1..6 on `sum_priority`.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=3,
                                 max_features='sqrt',
                                 min_samples_leaf=2,
                                 n_estimators=200,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'
unigram = [tfs_train.todense(), tfs_test.todense()]

results = []
for m in models:
    temp = model_classifier(m, df_train, df_test, col, range(1, 7), unigram)
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1588,1551,2547,1361,0.554562,0.464426,0.445438,0.467376,0.461512,RandomForestClassifier
2,551,2140,3882,474,0.618135,0.260511,0.381865,0.181331,0.462439,RandomForestClassifier
3,208,1777,4936,126,0.71832,0.112651,0.28168,0.066211,0.377246,RandomForestClassifier
4,75,622,6329,21,0.901093,0.056834,0.098907,0.032659,0.21875,RandomForestClassifier
5,23,10,7014,0,0.995317,0.0,0.004683,0.0,0.0,RandomForestClassifier
6,7,0,7040,0,0.999007,0.0,0.000993,0.0,0.0,RandomForestClassifier


In [25]:
# Multi-class check: fit LinearSVC on the raw `sum_priority` values (no
# thresholding) using the dense TF-IDF matrices, then print the confusion
# matrix of the test predictions.
col = 'sum_priority'
X_train = tfs_train.todense()
X_test = tfs_test.todense()
y_train = df_train[col].values
y_test = df_test[col].values
# rebinds model_svc to a fresh instance with the same settings as before
model_svc = LinearSVC(C=0.19, random_state = 981, class_weight='balanced')
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)
# parenthesised single-argument print runs on both Python 2 and Python 3
print(confusion_matrix(y_test, y_pred))

[[4449  617  373  364  177   42    1    1    0    0]
 [1519  308  171  180   83   17    3    0    0    0]
 [ 503  123   98   88   47   10    0    0    0    0]
 [ 150   43   26   30   18    9    0    0    0    0]
 [  41   15   11   12    8    2    0    1    0    0]
 [   8    2    4    1    3    2    0    0    0    1]
 [   4    1    1    1    3    0    0    0    0    0]
 [   2    0    0    1    0    0    0    0    0    0]
 [   0    0    0    2    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]]


## Negative (Unhygienic) Business Profile/Clustering

In [12]:
# "Negative" subset: training rows with sum_priority > 1, plus the same rows
# of the training TF-IDF matrix. The mask is computed once as a NumPy boolean
# array so both selections use identical positions. np.flatnonzero replaces
# Series.nonzero(), which newer pandas releases no longer provide.
neg_mask = (df_train.sum_priority > 1).values
# .copy() makes df_neg an independent frame, safe to add columns to later
df_neg = df_train[neg_mask].copy()
tfs_neg = tfs_train[np.flatnonzero(neg_mask), :]

In [138]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfs_neg)

clusters = km.labels_.tolist()

CPU times: user 3.72 s, sys: 107 ms, total: 3.82 s
Wall time: 3.87 s


In [139]:
df_neg.columns

Index([   u'business_id',            u'id_',           u'date',
            u'inspec_id',     u'n_priority',          u'grade',
              u'purpose',   u'n_violations',         u'v_core',
             u'sum_core',   u'v_foundation', u'sum_foundation',
           u'v_priority',   u'sum_priority',     u'date_start',
               u'rev_ct',         u'neg_ct',      u'stars_avg',
          u'rev_len_avg',      u'stars_var',           u'text'],
      dtype='object')

In [150]:
# Label every row of df_neg with its KMeans cluster. `.assign` returns a new
# frame instead of writing into a slice of df_train, and re-running the cell
# simply overwrites the column. No reset_index is performed: its result was
# never stored, so the index is intentionally left as it is.
df_neg = df_neg.assign(clusters=clusters)

In [141]:
df_neg.clusters.value_counts()

0    1153
2     670
1     231
3     172
4     148
Name: clusters, dtype: int64

In [143]:
df_neg.groupby('clusters').mean()

Unnamed: 0_level_0,n_violations,v_core,sum_core,v_foundation,sum_foundation,v_priority,sum_priority,rev_ct,neg_ct,stars_avg,rev_len_avg,stars_var
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,4.281873,1.079792,1.128361,0.919341,0.958369,2.403296,2.470078,3.291414,3.291414,3.492315,475.451665,1.155232
1,5.047619,1.350649,1.402597,1.212121,1.25974,2.623377,2.731602,10.142857,10.142857,3.805824,533.462881,1.471628
2,4.21791,0.997015,1.040299,0.944776,0.98209,2.408955,2.461194,14.962687,14.962687,3.639035,628.213053,1.622375
3,3.94186,0.936047,0.959302,0.773256,0.796512,2.343023,2.395349,11.034884,11.034884,3.791888,527.347791,1.457882
4,4.358108,1.027027,1.067568,1.243243,1.297297,2.263514,2.317568,10.290541,10.290541,3.611795,513.270411,1.551264


In [156]:
# Show the six highest-weighted vocabulary terms of each KMeans centroid.
# `vocab` is built here from `tfidf` so that this cell does not rely on a
# definition that only appears further down the notebook.
vocab = tfidf.get_feature_names()

display("Top terms per cluster:")
# per centroid: column indices ordered from largest to smallest weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    display("Cluster %d words:" % i)
    display(' %s' % ', '.join([vocab[ind] for ind in order_centroids[i, :6]]))

'Top terms per cluster:'

'Cluster 0 words:'

u' food, good, place, great, order, chicken'

'Cluster 1 words:'

u' sushi, thai, roll, pho, place, food'

'Cluster 2 words:'

u' food, place, good, great, order, veri'

'Cluster 3 words:'

u' breakfast, coffe, good, place, food, egg'

'Cluster 4 words:'

u' pizza, crust, good, order, place, great'

## Topic Segmentation (Classify topic of each sentence)

In [144]:
vocab = tfidf.get_feature_names()

In [13]:
reload(lib_tfidf)

<module 'yelp_tfidf' from 'yelp_tfidf.py'>

In [14]:
%%time
A, labels, aspects, A_new, vocabulary, sentences = lib_tfidf.aspect_segmentation_bootstrap(df_AZ, p=9, I=10, n=40)

(1266660, 93247)
(1266660, 6917)

Iteration 0:
Labeling aspects
Chi-Square
True
['service', 'food', 'value', 'hygiene', 'location', 'premise', 'quality']
---
[u'servic', u'staff', u'server', u'friendli', u'manag', u'waitress', u'attent', u'owner', u'waiter']
[u'food', u'menu', u'tast', u'flavor', u'meal', u'dish', u'good', u'great', u'mexican']
[u'price', u'worth', u'money', u'bill', u'cost', u'valu', u'reason', u'high', u'fair']
[u'clean', u'dirti', u'bathroom', u'hair', u'wipe', u'poison', u'toilet', u'glove', u'restroom']
[u'locat', u'spot', u'local', u'neighborhood', u'support', u'hit', u'scottsdal', u'new', u'conveni']
[u'bar', u'atmospher', u'patio', u'decor', u'room', u'music', u'space', u'environ', u'ambienc']
[u'meal', u'cook', u'portion', u'qualiti', u'prepar', u'perfectli', u'size', u'price', u'gener']

Iteration 1:
Labeling aspects
Chi-Square
True
['service', 'food', 'value', 'hygiene', 'location', 'premise', 'quality']
---
[u'servic', u'friendli', u'staff', u'server', u'ma

In [15]:
%%time
documents = df_AZ.text.apply(lib_tfidf.preprocess_sentences)

CPU times: user 11min 37s, sys: 18.2 s, total: 11min 55s
Wall time: 11min 53s


In [16]:
def get_sentences(n, i, s=None):
    """Return the ``n`` items of ``s`` at positions ``i - n`` .. ``i - 1``.

    ``n`` is a count and ``i`` the exclusive end position. Each position is
    indexed individually, so an out-of-range position raises ``IndexError``.
    """
    # `pos` avoids shadowing the parameter `i`; `range` works on Python 2 and 3
    return [s[pos] for pos in range(i - n, i)]

def get_aspects(n, i, aspects=None):
    """Return the ``n`` rows of ``aspects`` that end just before row ``i``."""
    first_row = i - n
    return aspects[first_row:i, :]

def count_topic(aspects, labels=None):
    """Sum each column of ``aspects`` and name the totals after ``labels``.

    Column ``j`` of ``aspects`` is summed over all rows and stored under the
    key ``'n_' + labels[j]``. Returns the totals as a pandas Series.
    """
    totals = aspects.sum(axis=0)
    # enumerate pairs each label with its column position (no index loop)
    return pd.Series({'n_' + label: totals[j] for j, label in enumerate(labels)})

def hygiene_text(a, s, labels=None):
    """Join the items of ``s`` whose row in ``a`` is flagged as 'hygiene'.

    ``labels`` names the columns of ``a``; the column at the position of
    'hygiene' is inspected, and every ``s[j]`` whose row ``j`` is non-zero in
    that column is joined with single spaces. Returns '' when no row is
    flagged, including when ``a`` has no rows.
    """
    h = labels.index('hygiene')
    ind = a[:, h].nonzero()[0]
    # joining an empty selection already yields '', so no length guard is needed
    return ' '.join([s[j] for j in ind])

# One row per review: its tokenized sentences, how many there are, and the
# three identifier columns taken from df_AZ.
sentence_map = pd.DataFrame({'tokens':documents, 'n_sentences':documents.apply(len), 
                             'id_':df_AZ.id_, 'business_id':df_AZ.business_id, 'inspec_id':df_AZ.inspec_id})

# Running total of sentences; used below as the exclusive end position of each
# review's slice. NOTE(review): this assumes `aspects` and `sentences` hold
# the sentences review by review in df_AZ row order -- confirm in lib_tfidf.
sentence_map['s_cumulative'] = sentence_map.n_sentences.cumsum()
# Switch to a 0..N-1 index so the freshly built Series below (which carry a
# default integer index) align row for row when assigned as columns.
sentence_map.reset_index(drop=True, inplace=True)

# Rows of `aspects` for this review: positions (s_cumulative - n_sentences) up
# to s_cumulative.
sentence_map['aspects'] = pd.Series(zip(sentence_map.n_sentences, sentence_map.s_cumulative)).\
    apply(lambda x: get_aspects(*x, aspects=aspects))

# The entries of `sentences` at the same positions.
sentence_map['sentences'] = pd.Series(zip(sentence_map.n_sentences, sentence_map.s_cumulative)).\
    apply(lambda x: get_sentences(*x, s=sentences))

# Space-joined sentences whose 'hygiene' column is non-zero.
sentence_map['hygiene_text'] = pd.Series(zip(sentence_map.aspects, sentence_map.sentences)).\
    apply(lambda x: hygiene_text(*x, labels=labels))

# Per-review column sums of the aspect matrix, one `n_<label>` column each.
tally=sentence_map.aspects.apply(lambda x: count_topic(x, labels=labels))

sentence_map = pd.concat([sentence_map, tally], axis=1)

In [17]:
sentence_map.tail()

Unnamed: 0,business_id,id_,inspec_id,n_sentences,tokens,s_cumulative,aspects,sentences,hygiene_text,n_food,n_hygiene,n_location,n_premise,n_quality,n_service,n_value
23482,iYr8t8RVPCiWABBO4I6HgA,FD-20353,3516740,6,"[[would, 5, star, bathroom, come], [young, his...",1266628,"[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0....",[This would have been 5 stars but the bathroom...,This would have been 5 stars but the bathrooms...,0,1,0,0,0,0,0
23483,iYr8t8RVPCiWABBO4I6HgA,FD-20353,3421094,7,"[[food, good, prefer, taco, bell, meal, deal, ...",1266635,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1....","[Food is good I prefer Taco Bell, the meal dea...",,3,0,1,0,0,0,0
23484,iYr8t8RVPCiWABBO4I6HgA,FD-20353,3378540,9,"[[appar, discontinu, jalapeno, ring, jalapeno,...",1266644,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....",[Apparently they have now discontinued the jal...,I've never gotten food poisoning here.,2,1,0,0,0,0,0
23485,itk__2hEY8FRMkO3lI1kag,FD-25828,3724118,11,"[[great, option, need, feed, hungri, kid, dont...",1266655,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0....",[Great option when you need to feed hungry kid...,,1,0,0,0,2,2,1
23486,itk__2hEY8FRMkO3lI1kag,FD-25828,3437930,5,"[[quick, takeout, pizza, mani, varieti, alway,...",1266660,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....",[Quick takeout pizza not many varieties but it...,,2,0,0,0,0,0,1


In [17]:
# Attach the per-review topic counts and hygiene sentences to both splits
# with a left join on the three identifier columns.
keys = ['business_id', 'id_', 'inspec_id']
cols = ['n_food', 'n_hygiene', 'n_location', 'n_premise',
        'n_quality', 'n_service', 'n_value', 'hygiene_text']
df_train2, df_test2 = (split.merge(sentence_map[cols + keys], on=keys, how='left')
                       for split in (df_train, df_test))
df_train2.head()

Unnamed: 0,business_id,id_,date,inspec_id,n_priority,grade,purpose,n_violations,v_core,sum_core,...,stars_var,text,n_food,n_hygiene,n_location,n_premise,n_quality,n_service,n_value,hygiene_text
0,snHl38pR9OyllL6hMuufOA,FD-34871,01/26/2015,3673241,1.0,Not Participating,Routine Inspection,1.0,0,0,...,0.0,Loaded fries with bacon and cheddar mmmmmmm. T...,1,0,0,0,0,0,0,
1,-xFO1E3OiDMmdqdjwUM_DA,FD-03053,11/27/2013,3504567,,Not Participating,Routine Inspection,3.0,0,0,...,0.7,Okay food. \nBeans are sometimes old with a li...,7,0,1,0,0,0,1,
2,MAfc2V_EVtyR9rMxxEAPLg,FD-10245,03/31/2014,3550776,,Not Participating,Routine Inspection,2.0,1,1,...,1.066667,"Excellent Pho, 1st time here, came with Vietna...",27,1,3,0,5,6,2,The place is clean and they serve Chinese food...
3,T_FMm8BNUIfUOhE-ScV15w,FD-29060,06/07/2013,3432310,,A,Routine Inspection,2.0,2,2,...,4.333333,The only positive I can give is that the water...,10,0,1,0,5,3,2,
4,Vz-PukBDv5j1UDOYbMbb1w,FD-13914,03/12/2013,3394769,,Not Participating,Routine Inspection,,0,0,...,2.333333,Food here was ok\n\nSalad bar has good variety...,3,0,1,6,0,2,1,


In [239]:
# For each topic-count column, count per `sum_priority` level how many
# training rows have a count above zero for that topic.
# 'hygiene_text' is text rather than a count, so it is skipped. The loop
# variable is `topic_col` so the global `col` used by the modelling cells is
# not overwritten.
for topic_col in cols:
    if topic_col == 'hygiene_text':
        continue
    n_mentions = df_train2.groupby('sum_priority')[topic_col].agg(lambda ts: (ts > 0).sum())
    # to_frame(name).T gives the same one-row table as the dict-renaming form
    # of agg, which newer pandas releases no longer accept
    display(n_mentions.to_frame(topic_col).T)

sum_priority,0,1,2,3,4,5,6,7,8,10
n_food,9039,4096,1533,524,146,48,18,6,3,2


sum_priority,0,1,2,3,4,5,6,7,8,10
n_hygiene,2814,1332,512,211,58,21,6,3,2,2


sum_priority,0,1,2,3,4,5,6,7,8,10
n_location,5822,2652,1017,344,100,32,11,3,2,2


sum_priority,0,1,2,3,4,5,6,7,8,10
n_premise,4288,2095,837,287,93,27,7,2,3,1


sum_priority,0,1,2,3,4,5,6,7,8,10
n_quality,6280,2937,1105,395,118,36,13,5,3,2


sum_priority,0,1,2,3,4,5,6,7,8,10
n_service,7694,3490,1317,450,130,39,16,5,3,2


sum_priority,0,1,2,3,4,5,6,7,8,10
n_value,5740,2672,1051,367,114,34,15,4,2,2


### Models using Topic summary features:

#### Unoptimized

In [20]:
# Baseline comparison on the review-summary columns plus the topic counts
# (model_classifier2 and the merged frames). Thresholds t = 1..6 on
# `sum_priority`; each row is tagged with the class name of its model.
models = [model_rfc, model_log, model_svc]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, range(1, 7))
    temp.index.name = col
    # the class name directly, rather than parsing it out of str(m.__class__)
    temp['model'] = m.__class__.__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

  warn("Some inputs do not have OOB scores. "
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2124,990,3108,825,0.55811,0.346348,0.44189,0.454545,0.279756,RandomForestClassifier
2,998,105,5917,27,0.843479,0.046672,0.156521,0.204545,0.026341,RandomForestClassifier
3,333,23,6690,1,0.949482,0.005587,0.050518,0.041667,0.002994,RandomForestClassifier
4,95,5,6946,1,0.98581,0.019608,0.01419,0.166667,0.010417,RandomForestClassifier
5,23,2,7022,0,0.996452,0.0,0.003548,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,1808,1383,2715,1141,0.547183,0.416956,0.452817,0.45206,0.386911,LogisticRegression
2,592,2253,3769,433,0.596282,0.23336,0.403718,0.161206,0.422439,LogisticRegression
3,202,2312,4401,132,0.643252,0.095032,0.356748,0.05401,0.39521,LogisticRegression
4,53,2418,4533,43,0.649354,0.033633,0.350646,0.017473,0.447917,LogisticRegression


#### Optimized (Gridsearch CV best parameters for `sum_priority >= 2`)

In [33]:
# Random forest configured with the values recorded as best_params_ for
# grid_h2 (default scorer, topic summary features); the logistic and SVC
# models are the same untuned instances used in the baseline run.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=10,
                                 max_features='sqrt',
                                 min_samples_leaf=2,
                                 n_estimators=200,
                                 n_jobs=-1,
                                 random_state=981),
          model_log, model_svc]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7))
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1651,1528,2570,1298,0.548886,0.449524,0.451114,0.459306,0.440149,RandomForestClassifier
2,810,1004,5018,215,0.742585,0.191622,0.257415,0.176374,0.209756,RandomForestClassifier
3,304,363,6350,30,0.90535,0.082531,0.09465,0.076336,0.08982,RandomForestClassifier
4,94,49,6902,2,0.979708,0.027211,0.020292,0.039216,0.020833,RandomForestClassifier
5,23,22,7002,0,0.993614,0.0,0.006386,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,1808,1383,2715,1141,0.547183,0.416956,0.452817,0.45206,0.386911,LogisticRegression
2,592,2255,3767,433,0.595998,0.233235,0.404002,0.161086,0.422439,LogisticRegression
3,202,2312,4401,132,0.643252,0.095032,0.356748,0.05401,0.39521,LogisticRegression
4,53,2418,4533,43,0.649354,0.033633,0.350646,0.017473,0.447917,LogisticRegression


In [39]:
# Random forest configured with the values recorded as best_params_ for
# grid_h3 (recall scoring, topic summary features). Only the forest is
# re-scored in this cell.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=2,
                                 max_features='sqrt',
                                 min_samples_leaf=4,
                                 n_estimators=100,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7))
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1618,1605,2493,1331,0.542642,0.452336,0.457358,0.453338,0.451339,RandomForestClassifier
2,483,2735,3287,542,0.543352,0.251976,0.456648,0.165395,0.52878,RandomForestClassifier
3,148,3288,3425,186,0.512417,0.097689,0.487583,0.053541,0.556886,RandomForestClassifier
4,43,3126,3825,53,0.550305,0.032366,0.449695,0.016672,0.552083,RandomForestClassifier
5,14,2602,4422,9,0.628778,0.006834,0.371222,0.003447,0.391304,RandomForestClassifier
6,6,743,6297,1,0.893714,0.002663,0.106286,0.001344,0.142857,RandomForestClassifier


In [45]:
# Random forest with max_depth=2 / min_samples_leaf=4 / n_estimators=200.
# NOTE(review): these values equal the best_params_ recorded for grid4 (f1,
# original summary features), not those recorded for grid_h4 (f1, topic summary
# features: max_depth 4, min_samples_leaf 3, n_estimators 100), although this
# cell scores the topic features (df_train2/df_test2) -- confirm which set was
# intended. Only the forest is re-scored in this cell.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=2,
                                 max_features='sqrt',
                                 min_samples_leaf=4,
                                 n_estimators=200,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7))
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_1 = pd.concat(results)
display(results_1)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1569,1665,2433,1380,0.541081,0.46046,0.458919,0.453202,0.467955,RandomForestClassifier
2,470,2801,3221,555,0.535831,0.253367,0.464169,0.165375,0.541463,RandomForestClassifier
3,147,3287,3426,187,0.5127,0.098214,0.4873,0.053828,0.55988,RandomForestClassifier
4,36,3279,3672,60,0.529587,0.034934,0.470413,0.017969,0.625,RandomForestClassifier
5,13,2712,4312,10,0.613311,0.007286,0.386689,0.003674,0.434783,RandomForestClassifier
6,6,745,6295,1,0.89343,0.002656,0.10657,0.00134,0.142857,RandomForestClassifier


In [21]:
from sklearn.grid_search import GridSearchCV

In [52]:
# Hyper-parameter space shared by every random-forest grid search below.
param_grid = {
    'n_estimators': [100, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced'],
    'max_depth': [2, 3, 4, 10],
    'min_samples_leaf': [2, 3, 4],
}


def _rf_grid(scoring=None):
    """Return an unfitted random-forest GridSearchCV over ``param_grid``.

    ``scoring=None`` is GridSearchCV's own default, i.e. the estimator's
    default scorer is used.
    """
    return GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, scoring=scoring)


# One independent search object per (feature set, metric) pair.
# Name suffixes, as used by the fitting cells: none = original summary features,
# _tfs = full TF-IDF, _h = topic summary features, _tfs_h = hygiene TF-IDF;
# no digit = precision, 2 = default scorer, 3 = recall, 4 = f1.
grid = _rf_grid('precision')
grid2 = _rf_grid()
grid3 = _rf_grid('recall')
grid4 = _rf_grid('f1')
grid_tfs = _rf_grid('precision')
grid_tfs4 = _rf_grid('f1')
grid_h = _rf_grid('precision')
grid_h2 = _rf_grid()
grid_h3 = _rf_grid('recall')
grid_h4 = _rf_grid('f1')
grid_tfs_h = _rf_grid('precision')
grid_tfs_h4 = _rf_grid('f1')

##### Original Summary Features

In [269]:
# Tune the random forest on the original summary features for precision.
# The trailing 2 is presumably the sum_priority cutoff (the section heading
# says `sum_priority >= 2`) -- confirm in get_model_inputs.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2)
grid.fit(X_train, y_train) #precision
grid.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [32]:
# Same inputs as the precision search; grid2 was built without a `scoring`
# argument, so the estimator's default scorer is used.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2)
grid2.fit(X_train, y_train) #accuracy
grid2.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [36]:
# Original summary features again, this time selecting parameters by recall.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2)
grid3.fit(X_train, y_train) #recall
grid3.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 100}

In [42]:
# Original summary features again, this time selecting parameters by f1.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2)
grid4.fit(X_train, y_train) #f1 score
grid4.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'n_estimators': 200}

##### Original Bag of Words (TFIDF, Vocab = Top 5000)

In [27]:
# Precision search with the dense train/test TF-IDF matrices passed to
# get_model_inputs as an extra argument (how they are combined with the frame
# columns is decided inside get_model_inputs).
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2, [tfs_train.todense(), tfs_test.todense()])
grid_tfs.fit(X_train, y_train) #precision
grid_tfs.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 200}

In [49]:
# f1 search on the same inputs as grid_tfs (dense TF-IDF matrices supplied).
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2, [tfs_train.todense(), tfs_test.todense()])
grid_tfs4.fit(X_train, y_train) #f1 score
grid_tfs4.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

##### Topic Summary Features

In [23]:
# Precision search on the topic summary features (df_train2/df_test2 via
# get_model_inputs2).
# NOTE(review): the output recorded for this cell reports max_depth None, which
# is not among the max_depth values in the param_grid cell above -- it appears
# to come from a run with a different grid; re-run before relying on it.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2)
grid_h.fit(X_train, y_train) #precision
grid_h.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [30]:
# Topic summary features; grid_h2 was built without a `scoring` argument, so
# the estimator's default scorer is used.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2)
grid_h2.fit(X_train, y_train) #accuracy
grid_h2.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [37]:
# Topic summary features, selecting parameters by recall.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2)
grid_h3.fit(X_train, y_train) #recall
grid_h3.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'n_estimators': 100}

In [43]:
# Topic summary features, selecting parameters by f1 (grid_h4 is constructed
# with scoring='f1').
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2)
grid_h4.fit(X_train, y_train) #f1 score
grid_h4.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'n_estimators': 100}

##### Hygiene (Topic) Bag of Words (Vocab = Top 5000)

In [57]:
# Precision search (grid_tfs_h is constructed with scoring='precision') with
# the dense hygiene TF-IDF train/test matrices passed as the extra argument.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2, [h_train.todense(), h_test.todense()])
grid_tfs_h.fit(X_train, y_train) #precision
grid_tfs_h.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 200}

In [58]:
# f1 search (grid_tfs_h4 is constructed with scoring='f1') on the same inputs
# as grid_tfs_h.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2, [h_train.todense(), h_test.todense()])
grid_tfs_h4.fit(X_train, y_train) #f1 score
grid_tfs_h4.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'n_estimators': 100}

In [27]:
# Hyper-parameter space for the linear SVC searches: ten C values spaced
# logarithmically from 1e-2 to 1e1, L1 penalty with squared hinge loss (primal
# form), fixed random_state. Note this rebinds the name `param_grid`.
param_grid = {
    'C': np.logspace(-2, 1, 10),
    'loss': ['squared_hinge'],
    'penalty': ['l1'],
    'dual': [False],
    'random_state': [981],
}


def _svc_grid(scoring=None):
    """Return an unfitted LinearSVC GridSearchCV over ``param_grid``.

    ``scoring=None`` is GridSearchCV's own default, i.e. the estimator's
    default scorer is used.
    """
    return GridSearchCV(LinearSVC(), param_grid, n_jobs=-1, scoring=scoring)


# One independent search object per (feature set, metric) pair; the name
# suffixes follow the same scheme as the random-forest grids.
gridsvc = _svc_grid('precision')
gridsvc2 = _svc_grid()
gridsvc3 = _svc_grid('recall')
gridsvc4 = _svc_grid('f1')
gridsvc_tfs = _svc_grid('precision')
gridsvc_tfs4 = _svc_grid('f1')
gridsvc_h = _svc_grid('precision')
gridsvc_h2 = _svc_grid()
gridsvc_h3 = _svc_grid('recall')
gridsvc_h4 = _svc_grid('f1')
gridsvc_tfs_h = _svc_grid('precision')
gridsvc_tfs_h4 = _svc_grid('f1')

In [28]:
# f1 search for LinearSVC on the original summary features.
# NOTE(review): the recorded run of this cell ended in a joblib ImportError
# about parallel computing (gridsvc4 is built with n_jobs=-1), so no
# best_params_ were produced. Presumably a kernel restart or n_jobs=1 gets
# past it -- TODO confirm and re-run.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2)
gridsvc4.fit(X_train, y_train) #f1 score
gridsvc4.best_params_

ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information

In [None]:
%%time
# f1 search for LinearSVC with the dense TF-IDF matrices supplied.
# This cell has no execution count or output, i.e. no result was recorded.
X_train, y_train, X_test, y_test = get_model_inputs(df_train, df_test, 'sum_priority', 2, [tfs_train.todense(), tfs_test.todense()])
gridsvc_tfs4.fit(X_train, y_train) #f1 score
gridsvc_tfs4.best_params_

In [None]:
%%time
# f1 search for LinearSVC on the topic summary features.
# This cell has no execution count or output, i.e. no result was recorded.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2)
gridsvc_h4.fit(X_train, y_train) #f1 score
gridsvc_h4.best_params_

In [None]:
%%time
# f1 search for LinearSVC with the dense hygiene TF-IDF matrices supplied.
# This cell has no execution count or output, i.e. no result was recorded.
X_train, y_train, X_test, y_test = get_model_inputs2(df_train2, df_test2, 'sum_priority', 2, [h_train.todense(), h_test.todense()])
gridsvc_tfs_h4.fit(X_train, y_train) #f1 score
gridsvc_tfs_h4.best_params_

## Hygiene (Topic) Bag of Words (Vocab = Top 5000)

In [18]:
%%time
# Vectorise the 'hygiene_text' column of sentence_map. tfs_h is the resulting
# sparse document-term matrix; tfidf_h is the fitted vectoriser object whose
# get_feature_names() is used later for the vocabulary.
tfs_h, tfidf_h = lib_tfidf.yelp_tfidf(sentence_map, 'hygiene_text')

CPU times: user 8.32 s, sys: 635 ms, total: 8.95 s
Wall time: 11.3 s


In [55]:
# Show the hygiene TF-IDF matrix summary (dimensions, dtype, stored elements).
tfs_h

<23487x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 92155 stored elements in Compressed Sparse Row format>

In [19]:
# 70/30 split of the hygiene TF-IDF rows with a fixed seed.
# NOTE(review): presumably uses the same train_size/random_state as the
# df_train/df_test split so rows stay aligned with the frames -- verify against
# the cell that creates df_train (the shape checks below only compare counts).
h_train, h_test = train_test_split(tfs_h, train_size=0.7, random_state=981)

In [249]:
# Sanity check: row count of the hygiene TF-IDF training split, to compare
# with df_train.shape.
h_train.shape

(16440, 5000)

In [250]:
# Sanity check: df_train should have as many rows as h_train.
df_train.shape

(16440, 21)

In [290]:
# Baseline with the hygiene bag-of-words: the three untuned classifiers are
# scored for the sum_priority values 1..6, with the train/test hygiene TF-IDF
# matrices passed to model_classifier2 as dense copies.
models = [model_rfc, model_log, model_svc]
col = 'sum_priority'
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2641,343,3755,308,0.576557,0.171111,0.423443,0.473118,0.104442,RandomForestClassifier
2,1022,36,5986,3,0.849865,0.005639,0.150135,0.076923,0.002927,RandomForestClassifier
3,334,24,6689,0,0.949198,0.0,0.050802,0.0,0.0,RandomForestClassifier
4,96,12,6939,0,0.984674,0.0,0.015326,0.0,0.0,RandomForestClassifier
5,23,4,7020,0,0.996169,0.0,0.003831,0.0,0.0,RandomForestClassifier
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier
1,2420,629,3469,529,0.567334,0.257609,0.432666,0.456822,0.179383,LogisticRegression
2,885,687,5335,140,0.776926,0.151188,0.223074,0.169287,0.136585,LogisticRegression
3,305,545,6168,29,0.879381,0.063877,0.120619,0.050523,0.086826,LogisticRegression
4,92,209,6742,4,0.957287,0.02589,0.042713,0.018779,0.041667,LogisticRegression


In [62]:
# Random forest configured with the values recorded as best_params_ for
# grid_tfs_h (precision scoring, hygiene bag-of-words). Only the forest is
# re-scored in this cell.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=3,
                                 max_features='log2',
                                 min_samples_leaf=4,
                                 n_estimators=200,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'
# Dense copies of the hygiene TF-IDF train/test matrices.
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2289,746,3352,660,0.56932,0.3031,0.43068,0.469417,0.223805,RandomForestClassifier
2,856,808,5214,169,0.763871,0.168831,0.236129,0.172979,0.164878,RandomForestClassifier
3,290,675,6038,44,0.863062,0.083571,0.136938,0.061196,0.131737,RandomForestClassifier
4,92,281,6670,4,0.94707,0.020997,0.05293,0.014035,0.041667,RandomForestClassifier
5,23,97,6927,0,0.982971,0.0,0.017029,0.0,0.0,RandomForestClassifier
6,7,0,7040,0,0.999007,0.0,0.000993,0.0,0.0,RandomForestClassifier


In [64]:
# Random forest configured with the values recorded as best_params_ for
# grid_tfs_h4 (f1 scoring, hygiene bag-of-words). Only the forest is
# re-scored in this cell.
models = [RandomForestClassifier(class_weight='balanced',
                                 criterion='entropy',
                                 max_depth=2,
                                 max_features='log2',
                                 min_samples_leaf=3,
                                 n_estimators=100,
                                 n_jobs=-1,
                                 random_state=981)]
col = 'sum_priority'
# Dense copies of the hygiene TF-IDF train/test matrices.
unigram = [h_train.todense(), h_test.todense()]

results = []
for m in models:
    temp = model_classifier2(m, df_train2, df_test2, col, xrange(1,7), unigram)
    temp.index.name = col
    # Tag every row with the estimator's class name; __name__ avoids parsing
    # the "<class '...'>" repr string by hand.
    temp['model'] = type(m).__name__
    results.append(temp)

results_2 = pd.concat(results)
display(results_2)

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2540,468,3630,409,0.573152,0.2138,0.426848,0.466363,0.138691,RandomForestClassifier
2,895,613,5409,130,0.786008,0.147059,0.213992,0.174966,0.126829,RandomForestClassifier
3,300,561,6152,34,0.87782,0.073197,0.12218,0.057143,0.101796,RandomForestClassifier
4,91,232,6719,5,0.954165,0.03003,0.045835,0.021097,0.052083,RandomForestClassifier
5,23,138,6886,0,0.977153,0.0,0.022847,0.0,0.0,RandomForestClassifier
6,7,30,7010,0,0.99475,0.0,0.00525,0.0,0.0,RandomForestClassifier


In [65]:
# NOTE(review): pandas.get_dummies takes the data to encode as its first
# argument, so this bare call cannot succeed as written. The output recorded
# beneath it, (16440, 21), is a shape tuple rather than a get_dummies result,
# so it looks like leftover scratch work -- TODO remove or supply the frame.
pd.get_dummies()

(16440, 21)

In [29]:
from merge_main import open_pickle, save_to_pickle

In [35]:
# Bundle the merged frame, both TF-IDF matrices with their feature-name lists,
# and the labels/vocabulary objects, then persist the bundle to disk.
d = {}
d['df_AZ'] = df_AZ_h
d['tfs'] = tfs
d['tfidf_vocab'] = tfidf.get_feature_names()
d['tfs_h'] = tfs_h
# Despite the key name, this stores the feature names, not the vectoriser.
d['tfidf_h'] = tfidf_h.get_feature_names()
d['labels'] = labels
d['vocab'] = vocabulary
save_to_pickle(d, '../data/phx/model_data_phx.pkl')

In [30]:
# Sanity check before merging: row count of sentence_map, to compare with df_AZ.
sentence_map.shape

(23487, 16)

In [31]:
# Sanity check before merging: df_AZ should have as many rows as sentence_map.
df_AZ.shape

(23487, 21)

In [32]:
# List sentence_map's columns, non-null counts and dtypes to pick merge columns.
sentence_map.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 0 to 23486
Data columns (total 16 columns):
business_id     23487 non-null object
id_             23487 non-null object
inspec_id       23487 non-null object
n_sentences     23487 non-null int64
tokens          23487 non-null object
s_cumulative    23487 non-null int64
aspects         23487 non-null object
sentences       23487 non-null object
hygiene_text    23487 non-null object
n_food          23487 non-null float64
n_hygiene       23487 non-null float64
n_location      23487 non-null float64
n_premise       23487 non-null float64
n_quality       23487 non-null float64
n_service       23487 non-null float64
n_value         23487 non-null float64
dtypes: float64(7), int64(2), object(7)
memory usage: 3.0+ MB


In [33]:
# Attach the text/topic columns from sentence_map to the inspection frame,
# matching rows on the (business_id, id_, inspec_id) key.
cols = ['business_id','id_','inspec_id','aspects','sentences','hygiene_text',
        'n_food','n_hygiene','n_location','n_premise','n_quality','n_service','n_value']
df_AZ_h = pd.merge(df_AZ, sentence_map[cols], on=['business_id','id_','inspec_id'])
# pd.merge defaults to an inner join, which silently drops unmatched keys and
# duplicates rows for repeated keys; fail loudly if the row count changed.
assert len(df_AZ_h) == len(df_AZ), 'merge changed the number of inspection rows'
# Parenthesised form prints the same tuple on Python 2 and also parses on Python 3.
print(df_AZ_h.shape)
df_AZ_h.info()

(23487, 31)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23487 entries, 0 to 23486
Data columns (total 31 columns):
business_id       23487 non-null object
id_               23487 non-null object
date              23487 non-null object
inspec_id         23487 non-null object
n_priority        23487 non-null object
grade             23487 non-null object
purpose           23487 non-null object
n_violations      16427 non-null float64
v_core            23487 non-null int64
sum_core          23487 non-null int64
v_foundation      23487 non-null int64
sum_foundation    23487 non-null int64
v_priority        23487 non-null int64
sum_priority      23487 non-null int64
date_start        23487 non-null datetime64[ns]
rev_ct            23487 non-null int64
neg_ct            23487 non-null int64
stars_avg         23487 non-null float64
rev_len_avg       23487 non-null float64
stars_var         23487 non-null float64
text              23487 non-null object
aspects           23487 non-null ob

# USE CLASS FOR ABOVE TASKS:

In [45]:
import model_phoenix as lib

In [57]:
# Re-import model_phoenix so edits made to the module file since the first
# import are picked up. Objects created before the reload keep the old code.
reload(lib)

<module 'model_phoenix' from 'model_phoenix.py'>

In [58]:
# Build the HealthModel wrapper from the merged frame, the two TF-IDF matrices
# and the feature names of their respective vectorisers.
HM = lib.HealthModel(
    df_AZ_h,
    tfs,
    tfs_h,
    tfs_vocab=tfidf.get_feature_names(),
    tfs_h_vocab=tfidf_h.get_feature_names()
)

In [59]:
# col1: the original review-summary columns.
col1 = ['rev_ct', 'neg_ct', 'stars_avg', 'rev_len_avg', 'stars_var']
# col2: col1 followed by the topic summary (n_*) columns.
col2 = col1 + ['n_hygiene', 'n_service', 'n_location', 'n_food',
               'n_premise', 'n_quality', 'n_value']

In [60]:
# Request the features for the original summary columns (col1).
# NOTE(review): `test` is not referenced again in the visible cells --
# presumably a smoke test of get_features; confirm or remove.
test = HM.get_features(col=col1)

In [62]:
%%time
# Baseline: the three untuned classifiers on the original summary columns
# (col1), each labelled 'unoptimized' in the comments column.
HM.compare_models([model_rfc, model_log, model_svc], ['unoptimized'] * 3, col=col1)

  warn("Some inputs do not have OOB scores. "
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,comments
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier,unoptimized
2,944,455,5567,81,0.801476,0.10378,0.198524,0.151119,0.079024,RandomForestClassifier,unoptimized
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier,unoptimized
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier,unoptimized
5,23,5,7019,0,0.996027,0.0,0.003973,0.0,0.0,RandomForestClassifier,unoptimized
6,7,1,7039,0,0.998865,0.0,0.001135,0.0,0.0,RandomForestClassifier,unoptimized
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.44891,0.446931,LogisticRegression,unoptimized
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression,unoptimized
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression,unoptimized
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.5,LogisticRegression,unoptimized


In [63]:
%%time
# Compare the untuned models across four feature configurations. The lists
# below are parallel: entry n of each describes run n.
col = [col1, None, col2, None]                 # summary columns (None = no summary columns passed)
# The flag lists are deliberately NOT named `tfs` / `tfs_h`: those names hold
# the TF-IDF matrices passed to HealthModel and saved to the pickle, and
# rebinding them to booleans breaks any later re-run of those cells.
use_tfs = [False, True, False, False]          # value for compare_models' tfs argument
use_tfs_h = [False, False, False, True]        # value for compare_models' tfs_h argument
tags = ['Original Features', 'All TFIDF', 'Topic Summary Features', 'Health TFIDF']
results = []
models = [model_rfc, model_log, model_svc]
model_comments = ['unoptimized','unoptimized','unoptimized']
for feature_cols, tfs_flag, tfs_h_flag, tag in zip(col, use_tfs, use_tfs_h, tags):
    # Append the feature-set label to each model's comment string.
    model_tags = [x+', '+tag for x in model_comments]
    results.append(HM.compare_models(models, model_tags, col=feature_cols, tfs=tfs_flag, tfs_h=tfs_h_flag))
    
display(pd.concat(results))

Unnamed: 0_level_0,FN,FP,TN,TP,accuracy,f1,mse,precision,recall,model,comments
sum_priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1903,1370,2728,1046,0.535547,0.389935,0.464453,0.432947,0.354697,RandomForestClassifier,"unoptimized, Original Features"
2,944,455,5567,81,0.801476,0.103780,0.198524,0.151119,0.079024,RandomForestClassifier,"unoptimized, Original Features"
3,331,104,6609,3,0.938272,0.013605,0.061728,0.028037,0.008982,RandomForestClassifier,"unoptimized, Original Features"
4,95,18,6933,1,0.983965,0.017391,0.016035,0.052632,0.010417,RandomForestClassifier,"unoptimized, Original Features"
5,23,5,7019,0,0.996027,0.000000,0.003973,0.000000,0.000000,RandomForestClassifier,"unoptimized, Original Features"
6,7,1,7039,0,0.998865,0.000000,0.001135,0.000000,0.000000,RandomForestClassifier,"unoptimized, Original Features"
1,1631,1618,2480,1318,0.538953,0.447918,0.461047,0.448910,0.446931,LogisticRegression,"unoptimized, Original Features"
2,526,2569,3453,499,0.560806,0.243831,0.439194,0.162647,0.486829,LogisticRegression,"unoptimized, Original Features"
3,169,2877,3836,165,0.567759,0.097749,0.432241,0.054241,0.494012,LogisticRegression,"unoptimized, Original Features"
4,48,2994,3957,48,0.568327,0.030593,0.431673,0.015779,0.500000,LogisticRegression,"unoptimized, Original Features"


CPU times: user 6min 20s, sys: 1min 6s, total: 7min 27s
Wall time: 4min 39s


In [65]:
# Random-forest search space handed to HM.grid_search_classifier.
param_grid_rf = dict(
    n_estimators=[100, 200],
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced'],
    max_depth=[2, 3, 4, 10],
    min_samples_leaf=[2, 3, 4]
)

In [66]:
%%time
# f1 grid search for the random forest on the original summary columns (col1),
# with both TF-IDF options off; t=2 is presumably the sum_priority cutoff --
# confirm in HealthModel.grid_search_classifier.
# NOTE(review): the recorded run of this cell ended in a joblib ImportError
# about parallel computing, so no result was produced. Presumably a kernel
# restart or a non-parallel search gets past it -- TODO confirm and re-run.
HM.grid_search_classifier(RandomForestClassifier(), param_grid_rf, col=col1, tfs=False, tfs_h=False, scoring='f1', t=2)

ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information