In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as Pipeline1
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_rows', 500)

In [3]:
# Import datasets
train = pd.read_csv('../assets/train.csv')
test = pd.read_csv('../assets/test.csv')
#weather = pd.read_csv('../assets/cleaned_weather.csv')
combined_train = pd.read_csv('../assets/combined_train.csv')
combined_test = pd.read_csv('../assets/combined_test.csv')

In [4]:
test_id = combined_test['id']

In [5]:
X = combined_train.drop(columns = ['wnvpresent', 'date'])
y = combined_train['wnvpresent']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [7]:
# Instiantiate models
models = {'lr': LogisticRegression(max_iter=5_000, random_state=42),
          'svc': SVC(random_state=42, probability=True),
          'knn': KNeighborsClassifier(),
          'rf': RandomForestClassifier(random_state=42),
          'dt': DecisionTreeClassifier(random_state=42),
          'et': ExtraTreesClassifier(random_state=42),
          'ada': AdaBoostClassifier(random_state=42),
          'gb': GradientBoostingClassifier(random_state=42),
#          'xgb': xgb.XGBClassifier(random_state=42),
        }

In [36]:
dt_params = {'dt__max_depth': [2, 3, 5, 7, 10],
            'dt__min_samples_split': [5, 10, 15, 20],
            'dt__min_samples_leaf': [2, 3, 4, 5, 6, 7],
            'dt__class_weight' : ['balanced']    
}

# dt_params = {'dt__max_depth': [2, 5, 7],
#             'dt__min_samples_split': [2, 5, 10],
#             'dt__min_samples_leaf': [3, 5, 7, 9],
#             'dt__class_weight' : ['balanced']    
# }

rf_params = {'rf__n_estimators': [10, 20, 50, 100, 150, 200],
             'rf__max_depth': [5, 10, 15, 20, 25],
             'rf__min_samples_leaf': [2, 5, 10],
             'rf__class_weight' : ['balanced', 'balanced_subsample']
}
    
et_params = {'et__n_estimators': [10, 20, 50, 100, 150, 200],
            'et__max_depth': [2, 3, 4, 5, 10, 20],
            'et__min_samples_leaf': [2, 5, 10],
            'et__class_weight' : ['balanced', 'balanced_subsample'],
}

# lr_params = {
#     # Trying different types of regularization
#     'lr__penalty':['l2','l1', 'elasticnet'],

#      # Trying different alphas of: 1, 0.1, 0.05  (C = 1/alpha)
#     'lr__C':[1, 10, 20],
# }

lr_params = {'lr__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
             'lr__C':[0.01, 0.1, 1, 10],
             'lr__class_weight':[None, 'balanced']
            }

# svc_params = {'svc__C':[0.1, 1, 10, 30],
#               'svc__gamma':[0.01, 0.1, 0.3],
#               'svc__kernel':['linear','rbf','sigmoid','poly','precomputed']
#              }

              
svc_params2 = {'svc__C':[0.1, 1, 10],
               'svc__gamma':[0.01, 0.1, 0.3],
               'svc__kernel':['linear','rbf','sigmoid']
              }

knn_params = {'knn__weights':['uniform', 'distance'],
              'knn__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
             }

In [19]:
# Instantiate lists to store results
#init_list = []
gs_list = []

In [20]:
# Function to run model -- input model and params
def run_model(mod, params_dict={}, grid_search=True):
    
    results = {}
    
    pipe = Pipeline([
            ('ss', StandardScaler()),
            (mod, models[mod])
            ])
    
    if grid_search:
        gs = GridSearchCV(pipe, param_grid = params_dict, cv=5, scoring = 'roc_auc', verbose=1, n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs
        
    else:
        pipe.fit(X_train, y_train)
        
    # Retrieve metrics
    predictions = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    y_test_pred_prob = pipe.predict_proba(X_test)[:,1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:,1]
    
    results['model'] = mod
    results['train_auc'] = roc_auc_score(y_train, y_train_pred_prob)
    results['test_auc'] = roc_auc_score(y_test, y_test_pred_prob)
    results['precision'] = precision_score(y_test, predictions)
    results['specificity'] = tn / (tn + fp)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)
    
    if grid_search:
        gs_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe.best_params_)
        
    else:
        init_list.append(results)
    
    print('### METRICS ###')
    display(results)
    
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    
    return pipe

In [21]:
# Function to run model -- input model and params
def run_model_smote(mod, params_dict={}, grid_search=True):
    
    results = {}
    
    pipe = Pipeline1([
        ('ss', StandardScaler()),
        ('sampling', SMOTE(random_state = 42)),  # set random_state so our score does not change
        (mod, models[mod])
    ])
    
    if grid_search:
        gs = GridSearchCV(pipe, param_grid = params_dict, cv=5, scoring = 'roc_auc', verbose=1, n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs
        
    else:
        pipe.fit(X_train, y_train)
        
    # Retrieve metrics
    predictions = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    y_test_pred_prob = pipe.predict_proba(X_test)[:,1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:,1]
    
    results['model'] = mod
    results['train_auc'] = roc_auc_score(y_train, y_train_pred_prob)
    results['test_auc'] = roc_auc_score(y_test, y_test_pred_prob)
    results['precision'] = precision_score(y_test, predictions)
    results['specificity'] = tn / (tn + fp)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)
    
    if grid_search:
        gs_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe.best_params_)
        
    else:
        init_list.append(results)
    
    print('### METRICS ###')
    display(results)
    
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    
    return pipe

In [12]:
%%time
run_model('dt', params_dict= dt_params, grid_search=True)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
### BEST PARAMS ###


{'dt__class_weight': 'balanced',
 'dt__max_depth': 5,
 'dt__min_samples_leaf': 7,
 'dt__min_samples_split': 5}

### METRICS ###


{'model': 'dt',
 'train_auc': 0.8453877108103953,
 'test_auc': 0.7842405133311207,
 'precision': 0.1264236902050114,
 'specificity': 0.691844114102049,
 'recall': 0.8043478260869565,
 'f_score': 0.21850393700787404}

True Negatives: 1722
False Positives: 767
False Negatives: 27
True Positives: 111


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('dt',
                                        DecisionTreeClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'dt__class_weight': ['balanced'],
                         'dt__max_depth': [2, 3, 5, 7, 10],
                         'dt__min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'dt__min_samples_split': [5, 10, 15, 20]},
             scoring='roc_auc', verbose=1)

In [13]:
run_model_smote('dt', params_dict= dt_params, grid_search=True)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
### BEST PARAMS ###


{'dt__class_weight': 'balanced',
 'dt__max_depth': 7,
 'dt__min_samples_leaf': 3,
 'dt__min_samples_split': 20}

### METRICS ###


{'model': 'dt',
 'train_auc': 0.8532950667724354,
 'test_auc': 0.7887269201879574,
 'precision': 0.14088820826952528,
 'specificity': 0.7746082764162314,
 'recall': 0.6666666666666666,
 'f_score': 0.23261694058154236}

True Negatives: 1928
False Positives: 561
False Negatives: 46
True Positives: 92


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('sampling', SMOTE(random_state=42)),
                                       ('dt',
                                        DecisionTreeClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'dt__class_weight': ['balanced'],
                         'dt__max_depth': [2, 3, 5, 7, 10],
                         'dt__min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'dt__min_samples_split': [5, 10, 15, 20]},
             scoring='roc_auc', verbose=1)

In [28]:
%%time
run_model('et', params_dict= et_params, grid_search=True)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
### BEST PARAMS ###


{'et__class_weight': 'balanced',
 'et__max_depth': 10,
 'et__min_samples_leaf': 5,
 'et__n_estimators': 50}

### METRICS ###


{'model': 'et',
 'train_auc': 0.8981834680414003,
 'test_auc': 0.816262569799873,
 'precision': 0.1361031518624642,
 'specificity': 0.7577340297308156,
 'recall': 0.6884057971014492,
 'f_score': 0.22727272727272727}

True Negatives: 1886
False Positives: 603
False Negatives: 43
True Positives: 95
CPU times: user 3.92 s, sys: 598 ms, total: 4.52 s
Wall time: 2min 27s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('et',
                                        ExtraTreesClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'et__class_weight': ['balanced', 'balanced_subsample'],
                         'et__max_depth': [2, 3, 4, 5, 10, 20],
                         'et__min_samples_leaf': [2, 5, 10],
                         'et__n_estimators': [10, 20, 50, 100, 150, 200]},
             scoring='roc_auc', verbose=1)

In [29]:
%%time
run_model_smote('et', params_dict= et_params, grid_search=True)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
### BEST PARAMS ###


{'et__class_weight': 'balanced',
 'et__max_depth': 10,
 'et__min_samples_leaf': 2,
 'et__n_estimators': 150}

### METRICS ###


{'model': 'et',
 'train_auc': 0.8854169571954604,
 'test_auc': 0.8176192638915576,
 'precision': 0.144,
 'specificity': 0.7850542386500603,
 'recall': 0.6521739130434783,
 'f_score': 0.23591087811271294}

True Negatives: 1954
False Positives: 535
False Negatives: 48
True Positives: 90
CPU times: user 6.43 s, sys: 844 ms, total: 7.27 s
Wall time: 6min 39s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('sampling', SMOTE(random_state=42)),
                                       ('et',
                                        ExtraTreesClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'et__class_weight': ['balanced', 'balanced_subsample'],
                         'et__max_depth': [2, 3, 4, 5, 10, 20],
                         'et__min_samples_leaf': [2, 5, 10],
                         'et__n_estimators': [10, 20, 50, 100, 150, 200]},
             scoring='roc_auc', verbose=1)

In [30]:
%%time
run_model('rf', params_dict= rf_params, grid_search=True)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
### BEST PARAMS ###


{'rf__class_weight': 'balanced_subsample',
 'rf__max_depth': 10,
 'rf__min_samples_leaf': 5,
 'rf__n_estimators': 150}

### METRICS ###


{'model': 'rf',
 'train_auc': 0.9276325476137506,
 'test_auc': 0.8225118055676862,
 'precision': 0.15267175572519084,
 'specificity': 0.8216151064684613,
 'recall': 0.5797101449275363,
 'f_score': 0.2416918429003021}

True Negatives: 2045
False Positives: 444
False Negatives: 58
True Positives: 80
CPU times: user 4.49 s, sys: 793 ms, total: 5.28 s
Wall time: 3min 56s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('rf',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__class_weight': ['balanced', 'balanced_subsample'],
                         'rf__max_depth': [5, 10, 15, 20, 25],
                         'rf__min_samples_leaf': [2, 5, 10],
                         'rf__n_estimators': [10, 20, 50, 100, 150, 200]},
             scoring='roc_auc', verbose=1)

In [31]:
%%time
run_model_smote('rf', params_dict= rf_params, grid_search=True)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
### BEST PARAMS ###


{'rf__class_weight': 'balanced',
 'rf__max_depth': 10,
 'rf__min_samples_leaf': 2,
 'rf__n_estimators': 100}

### METRICS ###


{'model': 'rf',
 'train_auc': 0.9032052649979341,
 'test_auc': 0.813914557385831,
 'precision': 0.15051903114186851,
 'specificity': 0.8027320208919244,
 'recall': 0.6304347826086957,
 'f_score': 0.2430167597765363}

True Negatives: 1998
False Positives: 491
False Negatives: 51
True Positives: 87
CPU times: user 5.09 s, sys: 655 ms, total: 5.75 s
Wall time: 9min 5s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('sampling', SMOTE(random_state=42)),
                                       ('rf',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__class_weight': ['balanced', 'balanced_subsample'],
                         'rf__max_depth': [5, 10, 15, 20, 25],
                         'rf__min_samples_leaf': [2, 5, 10],
                         'rf__n_estimators': [10, 20, 50, 100, 150, 200]},
             scoring='roc_auc', verbose=1)

In [26]:
%%time
run_model('lr', params_dict= lr_params, grid_search=True)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




### BEST PARAMS ###


{'lr__C': 10,
 'lr__class_weight': 'balanced',
 'lr__penalty': 'l2',
 'lr__solver': 'newton-cg'}

### METRICS ###


{'model': 'lr',
 'train_auc': 0.8397969098330511,
 'test_auc': 0.8092942279362528,
 'precision': 0.13636363636363635,
 'specificity': 0.7404580152671756,
 'recall': 0.7391304347826086,
 'f_score': 0.23024830699774265}

True Negatives: 1843
False Positives: 646
False Negatives: 36
True Positives: 102
CPU times: user 1.38 s, sys: 234 ms, total: 1.61 s
Wall time: 2min 25s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('lr',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid={'lr__C': [0.01, 0.1, 1, 10],
                         'lr__class_weight': [None, 'balanced'],
                         'lr__penalty': ['l2'],
                         'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
             scoring='roc_auc', verbose=1)

In [27]:
%%time
run_model_smote('lr', params_dict= lr_params, grid_search=True)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
### BEST PARAMS ###


{'lr__C': 10,
 'lr__class_weight': None,
 'lr__penalty': 'l2',
 'lr__solver': 'lbfgs'}

### METRICS ###


{'model': 'lr',
 'train_auc': 0.8394284922966357,
 'test_auc': 0.8068865326276196,
 'precision': 0.1323529411764706,
 'specificity': 0.739252711932503,
 'recall': 0.717391304347826,
 'f_score': 0.2234762979683973}

True Negatives: 1840
False Positives: 649
False Negatives: 39
True Positives: 99
CPU times: user 4.46 s, sys: 270 ms, total: 4.73 s
Wall time: 1min 17s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('sampling', SMOTE(random_state=42)),
                                       ('lr',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid={'lr__C': [0.01, 0.1, 1, 10],
                         'lr__class_weight': [None, 'balanced'],
                         'lr__penalty': ['l2'],
                         'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
             scoring='roc_auc', verbose=1)

In [47]:
%%time
run_model('svc', params_dict= svc_params2, grid_search=True)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

In [45]:
%%time
run_model_smote('svc', params_dict= svc_params2, grid_search=True)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

In [44]:
%%time
run_model('knn', params_dict= knn_params, grid_search=True)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
### BEST PARAMS ###


{'knn__algorithm': 'kd_tree', 'knn__weights': 'uniform'}

### METRICS ###


{'model': 'knn',
 'train_auc': 0.9372047551807094,
 'test_auc': 0.7101856865861966,
 'precision': 0.3333333333333333,
 'specificity': 0.9887505022097228,
 'recall': 0.10144927536231885,
 'f_score': 0.15555555555555556}

True Negatives: 2461
False Positives: 28
False Negatives: 124
True Positives: 14
CPU times: user 863 ms, sys: 48.8 ms, total: 912 ms
Wall time: 8.01 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
                                            'brute'],
                         'knn__weights': ['uniform', 'distance']},
             scoring='roc_auc', verbose=1)

In [43]:
%%time
run_model_smote('knn', params_dict= knn_params, grid_search=True)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
### BEST PARAMS ###


{'knn__algorithm': 'kd_tree', 'knn__weights': 'uniform'}

### METRICS ###


{'model': 'knn',
 'train_auc': 0.9482645782754298,
 'test_auc': 0.7087940561659709,
 'precision': 0.16030534351145037,
 'specificity': 0.8674166331860185,
 'recall': 0.45652173913043476,
 'f_score': 0.2372881355932203}

True Negatives: 2159
False Positives: 330
False Negatives: 75
True Positives: 63
CPU times: user 1.05 s, sys: 165 ms, total: 1.21 s
Wall time: 13.4 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('sampling', SMOTE(random_state=42)),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
                                            'brute'],
                         'knn__weights': ['uniform', 'distance']},
             scoring='roc_auc', verbose=1)

In [34]:
tuning_df = pd.DataFrame(gs_list)
tuning_df

Unnamed: 0,model,train_auc,test_auc,precision,specificity,recall,f_score
0,lr,0.839797,0.809294,0.136364,0.740458,0.73913,0.230248
1,lr,0.839428,0.806887,0.132353,0.739253,0.717391,0.223476
2,et,0.898183,0.816263,0.136103,0.757734,0.688406,0.227273
3,et,0.885417,0.817619,0.144,0.785054,0.652174,0.235911
4,rf,0.927633,0.822512,0.152672,0.821615,0.57971,0.241692
5,rf,0.903205,0.813915,0.150519,0.802732,0.630435,0.243017


In [35]:
tuning_df.sort_values(by=['test_auc', 'recall'], ascending=False).reset_index(drop=True)

Unnamed: 0,model,train_auc,test_auc,precision,specificity,recall,f_score
0,rf,0.927633,0.822512,0.152672,0.821615,0.57971,0.241692
1,et,0.885417,0.817619,0.144,0.785054,0.652174,0.235911
2,et,0.898183,0.816263,0.136103,0.757734,0.688406,0.227273
3,rf,0.903205,0.813915,0.150519,0.802732,0.630435,0.243017
4,lr,0.839797,0.809294,0.136364,0.740458,0.73913,0.230248
5,lr,0.839428,0.806887,0.132353,0.739253,0.717391,0.223476
