# Project 4 : West Nile Virus Classification

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.svm import SVC

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
# Import data
train = pd.read_csv('../data/combined_train.csv')
test = pd.read_csv('../data/combined_test.csv')

In [3]:
train.drop('date', axis=1, inplace=True)

## Train Test Split

In [4]:
X = train[[col for col in train != 'wnvpresent']]
y = train['wnvpresent']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

## Baseline Model

In [6]:
# Baseline
y = train['wnvpresent']
y.value_counts(normalize=True)

0    0.947554
1    0.052446
Name: wnvpresent, dtype: float64

## Hyperparameter Tuning with SMOTE

In [7]:
# smt = SMOTE()
# Xsm_train, ysm_train = smt.fit_resample(X_train, y_train)

In [8]:
# Create list to store model testing results
eval_list = []
init_list = []

In [9]:
# Instiantiate models
models = {'lr': LogisticRegression(max_iter=5_000, random_state=42),
          'svc': SVC(random_state=42, probability=True),
          'knn': KNeighborsClassifier(),
          'rf': RandomForestClassifier(random_state=42),
          'dt': DecisionTreeClassifier(random_state=42),
          'et': ExtraTreesClassifier(random_state=42),
          'ada': AdaBoostClassifier(random_state=42),
          'gb': GradientBoostingClassifier(random_state=42)
#           'xgb': xgb.XGBClassifier(random_state=42)
        }

In [10]:
# ==========LR==========
# Matt's LR
lr_params = {
    'lr__penalty':['l2', 'none'],
    'lr__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
    'lr__C':[0.01, 0.1, 1, 10],
    'lr__class_weight':[None, 'balanced']
}

# # Benjamin's LR
# lr_params = {
#     # Trying different types of regularization
#     'lr__penalty':['l2','l1', 'elasticnet'],

#      # Trying different alphas of: 1, 0.1, 0.05  (C = 1/alpha)
#     'lr__C':[1, 10, 20],
# }

# # Elaine's LR
# lr_params = {
#     'lr__penalty':['l1', 'l2'],
#     'lr__solver':['liblinear'],
#     'lr__C':np.logspace(-5, 0, 5),
#     'lr__class_weight':['balanced']
# }

# ==========SVC==========
# # Matt's SVC
# svc_params = {
#     'svc__C':[0.1, 1, 10, 30],
#     'svc__gamma':[0.01, 0.1, 0.3], 
#     'svc__kernel':['linear','rbf','sigmoid','poly','precomputed'],
# }

# Matt's SVC (2)
svc_params = {
    'svc__C':[0.1, 1, 10],
    'svc__gamma':[0.01, 0.1, 0.3], 
    'svc__kernel':['linear','rbf','sigmoid']
}

# # Benjamin's SVC
# svc_params = {
#     'svc__C':[10, 30],
#     'svc__gamma':[0.01, 0.1], 
#     'svc__kernel':['rbf', 'sigmoid'],
# }

# # Esther's SVC
# svc_params = {
#     'svc__C':[0.1, 1, 10],
#     'svc__gamma':[0.01, 0.1, 0.3], 
#     'svc__kernel':['linear','rbf']
# }

# ==========KNN==========
# Matt's KNN
knn_params = {
    'knn__weights':['uniform', 'distance'],
    'knn__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
}

# rf_params = {
#     'rf__n_estimators': [150, 300],
#     'rf__max_depth': [None, 1]
# }

In [11]:
# Function to run model -- input model and params
def run_model(mod, params_dict={}, grid_search=True):
    
    results = {}
    
    pipe = Pipeline([
        ('ss', StandardScaler()),
        ('sampling', SMOTE(random_state = 42)),
        (mod, models[mod])
        ])
    
    if grid_search:
        gs = GridSearchCV(pipe, param_grid = params_dict, cv=5, scoring = 'roc_auc', verbose=1, n_jobs=2)
        gs.fit(X_train, y_train)
        pipe = gs
        
    else:
        pipe.fit(X_train, y_train)
        
    # Retrieve metrics
    predictions = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    y_test_pred_prob = pipe.predict_proba(X_test)[:,1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:,1]
    
    results['model'] = mod
    results['train_auc'] = roc_auc_score(y_train, y_train_pred_prob)
    results['test_auc'] = roc_auc_score(y_test, y_test_pred_prob)
    results['precision'] = precision_score(y_test, predictions)
    results['specificity'] = tn / (tn + fp)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)
    
    if grid_search:
        eval_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe.best_params_)
        
    else:
        init_list.append(results)
    
    print('### METRICS ###')
    display(results)
    
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    
    return pipe

### Logistic Regression

In [12]:
lr_gs = run_model('lr', params_dict=lr_params, grid_search=True)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
### BEST PARAMS ###


{'lr__C': 0.01,
 'lr__class_weight': None,
 'lr__penalty': 'l2',
 'lr__solver': 'newton-cg'}

### METRICS ###


{'model': 'lr',
 'train_auc': 1.0,
 'test_auc': 1.0,
 'precision': 1.0,
 'specificity': 1.0,
 'recall': 1.0,
 'f_score': 1.0}

True Negatives: 2489
False Positives: 0
False Negatives: 0
True Positives: 138


### SVC

In [14]:
svc_gs = run_model('svc', params_dict=svc_params, grid_search=True)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
### BEST PARAMS ###


{'svc__C': 0.01, 'svc__gamma': 0.01, 'svc__kernel': 'linear'}

### METRICS ###


{'model': 'svc',
 'train_auc': 1.0,
 'test_auc': 1.0,
 'precision': 1.0,
 'specificity': 1.0,
 'recall': 1.0,
 'f_score': 1.0}

True Negatives: 2489
False Positives: 0
False Negatives: 0
True Positives: 138


### KNN

In [None]:
knn_gs = run_model('knn', params_dict=knn_params, grid_search=True)

In [None]:
asd

## Model Evaluation Summary

In [None]:
eval_df = pd.DataFrame(eval_list)

In [None]:
eval_df.sort_values(by=['test', 'roc'], ascending=False).reset_index(drop=True)