In [2]:
# Import Packages

#Admin

import time
from datetime import datetime
%autocall 1
from geopy.distance import great_circle
from geopy.distance import vincenty

# Analysis

import pandas as pd
import numpy as np

# Modeling
from pygeohash import geohash
from sklearn.preprocessing import CategoricalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import roc_auc_score

# Plots

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus
from ipywidgets import interact, interactive, fixed, interact_manual, FloatSlider
import ipywidgets as widgets
from IPython.display import display, Markdown, Latex, Image

Automatic calling is: Smart


In [3]:
# Import Train Data Set

train = pd.read_csv('./west_nile/input/train.csv', na_values=['M'])

In [4]:
train = train.groupby(['Species','Date','Trap']).mean().reset_index()

In [5]:
train.columns

Index(['Species', 'Date', 'Trap', 'Block', 'Latitude', 'Longitude',
       'AddressAccuracy', 'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [6]:
train.head()

Unnamed: 0,Species,Date,Trap,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,CULEX ERRATICUS,2013-09-06,T900,10.0,41.974689,-87.890615,9.0,7.0,0.0
1,CULEX PIPIENS,2007-05-29,T096,22.0,41.731922,-87.677512,8.0,1.0,0.0
2,CULEX PIPIENS,2007-06-05,T002,41.0,41.95469,-87.800991,9.0,1.0,0.0
3,CULEX PIPIENS,2007-06-05,T045,15.0,41.9216,-87.666455,8.0,1.0,0.0
4,CULEX PIPIENS,2007-06-05,T048,11.0,41.867108,-87.654224,8.0,2.0,0.0


In [7]:
# Import Test Data Set

test = pd.read_csv('./west_nile/input/test.csv', na_values=['M'])

In [8]:
test.head(5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [9]:
# Import Weather Data Set

weather = pd.read_csv('./west_nile/input/weather.csv', na_values=['M'])

In [10]:
# Do All the Things to Weather Set

weather = weather.drop(['Water1', 'Depart', 'Depth', 'CodeSum'], axis=1)

weather = weather.dropna()

weather = weather.replace('  T', 0)

station1 = weather[weather['Station']==1]

In [11]:
# Do All the Things to Train Set

# Label Encode Columns
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

train['Species'] = encode.fit_transform(train['Species'])

train['Trap'] = encode.fit_transform(train['Trap'])

# train.drop(['Species'], axis=1, inplace=True)

# Combine Latitude and Longitude

train['LatLong'] = list(zip(train.Latitude, train.Longitude))

# Drop Unneccessary Columns

train = train.drop(['Block', 'Latitude',
                    'Longitude', 'AddressAccuracy'], axis=1)

# Merge Weather onto Train Data Set

train_weather = pd.merge(train, station1, how='left', on='Date')

# Transform Date

train_weather['Date'] = train_weather['Date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

train_weather['Date'] = train_weather['Date'].apply(lambda x: x.timetuple().tm_yday)
              
# Calculate Distances from 2 Major Centroids

centroid1 = (41.974689, -87.890615)
centroid2 = (41.673408, -87.599862)

distances1 = []

for i in train_weather['LatLong']:
    miles = vincenty(centroid1, i).miles
    
    distances1.append(miles)

distances2 = []

for i in train_weather['LatLong']:
    miles = vincenty(centroid2, i).miles
    
    distances2.append(miles)

# Add Those Distances to DataFrame

train_weather['Distances1'] = distances1

train_weather['Distances2'] = distances2

train_weather['Close_to_Centroid2'] = train_weather['Distances2'].map(lambda x: 1 if x < 5.0 else 0)

train_weather = train_weather.drop_duplicates()

train_weather = train_weather.fillna(0)

train_weather['WnvPresent'] = train_weather['WnvPresent'].apply(lambda x: 1 if x > 0 else 0)


In [12]:
# Do All the Things to Test Set

# Encode Species Columns

test['Species'] = encode.fit_transform(test['Species'])

test['Trap'] = encode.fit_transform(test['Trap'])

# Combine Latitude and Longitude

test['LatLong'] = list(zip(test.Latitude, test.Longitude))

# Drop Unneccessary Columns

test = test.drop(['Block', 'Latitude',
                    'Longitude', 'AddressAccuracy'], axis=1)

# Merge Weather onto Test Data Set

test_weather = pd.merge(test, station1, how='left', on='Date')

# Transform Date

test_weather['Date'] = test_weather['Date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

test_weather['Date'] = test_weather['Date'].apply(lambda x: x.timetuple().tm_yday)

# Calculate Distances from 2 Major Centroids

centroid1 = (41.974689, -87.890615)
centroid2 = (41.673408, -87.599862)

distances1 = []

for i in test_weather['LatLong']:
    miles = vincenty(centroid1, i).miles
    
    distances1.append(miles)

distances2 = []

for i in test_weather['LatLong']:
    miles = vincenty(centroid2, i).miles
    
    distances2.append(miles)

# Add Those Distances to DataFrame

test_weather['Distances1'] = distances1

test_weather['Distances2'] = distances2

test_weather['Close_to_Centroid2'] = test_weather['Distances2'].map(lambda x: 1 if x < 5.0 else 0)

test_weather = test_weather.drop_duplicates()

test_weather = test_weather.fillna(0)

In [13]:
train_weather.columns

Index(['Species', 'Date', 'Trap', 'NumMosquitos', 'WnvPresent', 'LatLong',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Distances1',
       'Distances2', 'Close_to_Centroid2'],
      dtype='object')

In [14]:
# Drop Even More Columns

# train_weather = train_weather.drop(['NumMosquitos', 'LatLong',
#        'Station', 'Tmax', 'Tmin', 'DewPoint', 'Heat',
#        'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
#        'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed'], axis=1)

In [15]:
train_weather.head()

Unnamed: 0,Species,Date,Trap,NumMosquitos,WnvPresent,LatLong,Station,Tmax,Tmin,Tavg,...,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Distances1,Distances2,Close_to_Centroid2
0,0,249,134,7.0,0,"(41.974689, -87.890615)",1.0,86.0,57.0,72.0,...,0.0,0.0,29.38,30.11,4.7,20.0,6.0,0.0,25.644378,0
1,1,149,74,1.0,0,"(41.731922, -87.677512)",1.0,88.0,60.0,74.0,...,0.0,0.0,29.39,30.11,5.8,18.0,6.5,20.040788,5.69531,0
2,1,156,1,1.0,0,"(41.95469, -87.800991)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,4.818417,22.015745,0
3,1,156,32,1.0,0,"(41.9216, -87.666455)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,12.116638,17.470881,0
4,1,156,35,2.0,0,"(41.867108, -87.654224)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,14.268717,13.660062,0


In [16]:
train_weather.columns

Index(['Species', 'Date', 'Trap', 'NumMosquitos', 'WnvPresent', 'LatLong',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Distances1',
       'Distances2', 'Close_to_Centroid2'],
      dtype='object')

In [39]:
# Feature Selection

# define target
target = 'WnvPresent'

# instantiate selector
selector = SelectKBest(score_func=f_classif, k=10)

# subset training data without 'drops'
# train_features = train_weather.drop(drops, axis=1)
train_features = train_weather.drop('WnvPresent', axis=1).select_dtypes(include=['number'])

# subset training target
train_target = train_weather[target]

# fit selector
selector.fit(train_features, train_target)

# extract best feature indexes
best_features = selector.get_support(indices=True)

# convert indexes to feature names
features = list(train_features.columns[selector.get_support(indices = True)])
print(features)

['Species', 'Date', 'NumMosquitos', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat', 'Cool']


In [36]:
train_weather.columns

Index(['Species', 'Date', 'Trap', 'NumMosquitos', 'WnvPresent', 'LatLong',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Distances1',
       'Distances2', 'Close_to_Centroid2'],
      dtype='object')

In [20]:
# Train-Train-Split on Data Set

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

X = train_weather[['Species', 'Date', 'Trap',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Distances1',
       'Distances2', 'Close_to_Centroid2']]
y = train_weather['WnvPresent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Standard Scaler

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [21]:
# Try to Balance Classes with SMOTEENN

from sklearn.metrics import recall_score
from imblearn.combine import SMOTEENN

sm = SMOTEENN()

X_train, y_train = sm.fit_sample(X_train, y_train)

In [22]:
# RandomForestClassifier

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

rf = RandomForestClassifier()

rf_pipe = Pipeline([
    ('ss', ss),
    ('rf', rf)
])

params = {'rf__n_estimators' : [10, 15, 20],
          'rf__max_depth' : [None, 2, 3, 4, 5]}

rf_gs = GridSearchCV(rf_pipe, param_grid=params, cv=5, scoring='roc_auc')
rf_gs.fit(X_train, y_train)

best_rf_gs = rf_gs.best_estimator_

rf_gs_train = best_rf_gs.score(X_train, y_train)
rf_gs_test = best_rf_gs.score(X_test, y_test)



In [23]:
# XGBoost Classifier

gs_params = {
    'max_depth':[1, 2, 3, 4, 5],
    'n_estimators':range(1, 10, 1),
    'learning_rate':np.logspace(-5,0,5),
    'silent' : [False],
    'booster' : ['gbtree', 'gblinear', 'dart'] 
}

xgb_gs = GridSearchCV(XGBClassifier(), gs_params, cv=5, verbose=1, scoring='roc_auc')

xgb_gs = xgb_gs.fit(X_train, y_train)

best_xgb_gs = xgb_gs.best_estimator_

xgb_gs_train = best_xgb_gs.score(X_train, y_train)
xgb_gs_test = best_xgb_gs.score(X_test, y_test)

Fitting 5 folds for each of 675 candidates, totalling 3375 fits


[Parallel(n_jobs=1)]: Done 3375 out of 3375 | elapsed: 10.7min finished


In [24]:
# BalancedBaggingClassifier

gs_params = {
    'n_estimators' : range(1, 10, 1),
#     'max_samples' : range(1, 10),
#     'max_features' : range(1, 10),
    'warm_start' : [True, False] 
}

bbc_gs = GridSearchCV(BalancedBaggingClassifier(), gs_params, scoring='roc_auc', 
                      cv=5, verbose=1)

bbc_gs = bbc_gs.fit(X_train, y_train)

best_bbc_gs = bbc_gs.best_estimator_

bbc_train = best_bbc_gs.score(X_train, y_train)
bbc_test = best_bbc_gs.score(X_test, y_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.4min finished


In [25]:
X_train.shape

(10664, 23)

In [26]:
# Executive Summary of Models

print('GridSearchCV across Random Forest:')
print(f"Best Parameters = {rf_gs.best_params_}")
print(f"Best CV Score = {rf_gs.best_score_}")
print(f"Train Score = {rf_gs_train}")
print(f"Test Score = {rf_gs_test}")
print()
print('GridSearchCV across XGBoost:')
print(f"Best Parameters = {xgb_gs.best_params_}")
print(f"Best CV Score = {xgb_gs.best_score_}")
print(f"Train Score = {xgb_gs_train}")
print(f"Test Score = {xgb_gs_test}")
print()
print('GridSearchCV across BalancedBaggingClassifier:')
print(f"Best Parameters = {bbc_gs.best_params_}")
print(f"Best CV Score = {bbc_gs.best_score_}")
print(f"Train Score = {bbc_train}")
print(f"Test Score = {bbc_test}")

GridSearchCV across Random Forest:
Best Parameters = {'rf__max_depth': None, 'rf__n_estimators': 20}
Best CV Score = 0.9978169430562099
Train Score = 0.9996249062265566
Test Score = 0.8763567720622936

GridSearchCV across XGBoost:
Best Parameters = {'booster': 'gbtree', 'learning_rate': 1.0, 'max_depth': 5, 'n_estimators': 9, 'silent': False}
Best CV Score = 0.988920398125109
Train Score = 0.9707426856714179
Test Score = 0.852760736196319

GridSearchCV across BalancedBaggingClassifier:
Best Parameters = {'n_estimators': 8, 'warm_start': True}
Best CV Score = 0.9944738072966521
Train Score = 0.997655663915979
Test Score = 0.8824917413874469


In [27]:
def feat_equalize(train, test):
    Tr = X
    Te = test_weather.columns

    # remove any columns in Te that are not in Tr
    Te_not_Tr = [c for c in Te if c not in Tr]
    test_cut = test.drop(Te_not_Tr, axis=1)

    # create column of zeroes in test for any columns in Tr not in Te
    Tr_not_Te = [c for c in Tr if c not in Te]
    for c in Tr_not_Te:
        test_cut[c] = 0

    return test_cut

In [28]:
train_weather.head()

Unnamed: 0,Species,Date,Trap,NumMosquitos,WnvPresent,LatLong,Station,Tmax,Tmin,Tavg,...,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Distances1,Distances2,Close_to_Centroid2
0,0,249,134,7.0,0,"(41.974689, -87.890615)",1.0,86.0,57.0,72.0,...,0.0,0.0,29.38,30.11,4.7,20.0,6.0,0.0,25.644378,0
1,1,149,74,1.0,0,"(41.731922, -87.677512)",1.0,88.0,60.0,74.0,...,0.0,0.0,29.39,30.11,5.8,18.0,6.5,20.040788,5.69531,0
2,1,156,1,1.0,0,"(41.95469, -87.800991)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,4.818417,22.015745,0
3,1,156,32,1.0,0,"(41.9216, -87.666455)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,12.116638,17.470881,0
4,1,156,35,2.0,0,"(41.867108, -87.654224)",1.0,64.0,47.0,56.0,...,0.0,0.42,29.1,29.79,5.2,5.0,7.6,14.268717,13.660062,0


In [29]:
TEST_CUT = feat_equalize(train_weather, test_weather)
TEST_CUT.head()

Unnamed: 0,Date,Species,Trap,Station,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,...,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Distances1,Distances2,Close_to_Centroid2
0,163,2,1,1,86,61,74.0,56,64.0,0.0,...,0.0,0.0,29.28,29.99,8.9,18,10.0,4.818417,22.015745,0
1,163,3,1,1,86,61,74.0,56,64.0,0.0,...,0.0,0.0,29.28,29.99,8.9,18,10.0,4.818417,22.015745,0
2,163,1,1,1,86,61,74.0,56,64.0,0.0,...,0.0,0.0,29.28,29.99,8.9,18,10.0,4.818417,22.015745,0
3,163,4,1,1,86,61,74.0,56,64.0,0.0,...,0.0,0.0,29.28,29.99,8.9,18,10.0,4.818417,22.015745,0
4,163,6,1,1,86,61,74.0,56,64.0,0.0,...,0.0,0.0,29.28,29.99,8.9,18,10.0,4.818417,22.015745,0


In [30]:
TEST_CUT = TEST_CUT[train_weather.drop(['WnvPresent', 'NumMosquitos', 'LatLong'], axis=1).columns]

In [31]:
print(train_weather.columns.value_counts().sum())
print(train_weather.shape)
print(train_weather.columns)

26
(8475, 26)
Index(['Species', 'Date', 'Trap', 'NumMosquitos', 'WnvPresent', 'LatLong',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Distances1',
       'Distances2', 'Close_to_Centroid2'],
      dtype='object')


In [32]:
print(TEST_CUT.columns.value_counts().sum())
print(TEST_CUT.shape)
print(TEST_CUT.columns)

23
(116293, 23)
Index(['Species', 'Date', 'Trap', 'Station', 'Tmax', 'Tmin', 'Tavg',
       'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'SnowFall',
       'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Distances1', 'Distances2', 'Close_to_Centroid2'],
      dtype='object')


In [77]:
# Predict on Test Data using Balanced Bagging Classifer model

yhat_bbc = pd.DataFrame(best_bbc_gs.predict_proba(TEST_CUT.values))
print(yhat_bbc.describe())
print()
yhat_bbc['WnvPresent'] = yhat_bbc[1].map(lambda x: 1 if x > yhat_bbc[1].mean() else 0)
                                         
print(yhat_bbc.sum())

yhat_bbc = yhat_bbc.drop([0, 1], axis=1)

                   0              1
count  116293.000000  116293.000000
mean        0.683972       0.316028
std         0.164021       0.164021
min         0.250000       0.125000
25%         0.750000       0.250000
50%         0.750000       0.250000
75%         0.750000       0.250000
max         0.875000       0.750000

0              79541.125
1              36751.875
WnvPresent    115628.000
dtype: float64


In [78]:
# Predict on Test Data using XGBoost model

yhat_xgb = pd.DataFrame(best_xgb_gs.predict_proba(TEST_CUT.values))
print(yhat_xgb.describe())
print()
yhat_xgb['WnvPresent'] = yhat_xgb[1].map(lambda x: 1 if x > yhat_xgb[1].mean() else 0)
print(yhat_xgb.sum())

yhat_xgb = yhat_xgb.drop([0, 1], axis=1)

                   0              1
count  116293.000000  116293.000000
mean        0.866907       0.133192
std         0.202927       0.202959
min         0.041385       0.000280
25%         0.942978       0.057022
50%         0.942978       0.057022
75%         0.942978       0.057022
max         0.999720       0.958615

0             100789.316199
1              15503.686023
WnvPresent     15694.000000
dtype: float64


In [76]:
# Predict on Test Data using Random Forest model

yhat_rf = pd.DataFrame(best_rf_gs.predict_proba(TEST_CUT.values))
print(yhat_rf.describe())
print()
yhat_rf['WnvPresent'] = yhat_rf[1].map(lambda x: 1 if x > yhat_rf[1].mean() else 0)
print(yhat_rf.sum())

yhat_rf = yhat_rf.drop([0, 1], axis=1)



                   0              1
count  116293.000000  116293.000000
mean        0.454321       0.545679
std         0.037521       0.037521
min         0.350000       0.300000
25%         0.450000       0.550000
50%         0.450000       0.550000
75%         0.450000       0.550000
max         0.700000       0.650000

0              52834.35
1              63458.65
WnvPresent    113526.00
dtype: float64


In [80]:
# converts prediction output to appropriate kaggle format

def kagglizer(pred):
    pred_format = pd.DataFrame(pred).reset_index()
    pred_format['index'] = pred_format['index']+1
    pred_format = pred_format.rename(columns={'index':'Id',0:'WnvPresent'}).set_index('Id')
    return pred_format

In [81]:
# Submission for XGBoost Model - 0.54499

submission_yhat_xgb = kagglizer(yhat_xgb)

submission_yhat_xgb.to_csv('./submission_yhat_xgb')

In [82]:
# Submission for Balanced Bagging Model - 0.49564

submission_yhat_bbc = kagglizer(yhat_bbc)

submission_yhat_bbc.to_csv('./submission_yhat_bbc')

In [83]:
# Submission for Random Forest Model - 0.61004

submission_yhat_rf = kagglizer(yhat_rf)

submission_yhat_rf.to_csv('./submission_yhat_rf')

In [62]:
def mtrx(model, X, y):
    print('score:')
    print(model.score(X,y))
    print('recall:')
    print(recall_score(y,model.predict(X)))
    print('AUC:')
    print(roc_auc_score(y, model.predict(X)))
    return

In [63]:
mtrx(best_rf_gs, X_test, y_test)

score:
0.8763567720622936
recall:
0.5263157894736842
AUC:
0.7112875705473158


In [64]:
mtrx(best_xgb_gs, X_test, y_test)

score:
0.852760736196319
recall:
0.5877192982456141
AUC:
0.7277748610928819


In [65]:
mtrx(best_bbc_gs, X_test, y_test)

score:
0.8824917413874469
recall:
0.45614035087719296
AUC:
0.6814367589797437
