# Modeling

In [84]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_validate, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve,\
precision_recall_curve, f1_score, fbeta_score, confusion_matrix, classification_report, make_scorer
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter, OrderedDict
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import time
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline
plt.style.use('ggplot')
sns.set_style("white")

In [85]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [83]:
# For auto-reloading updated modules
%load_ext autoreload
%autoreload 2

In [87]:
# Import modules
from data_cleaning import convert
from model_eval import *

In [66]:
# Load data
import pickle

infile = open('merged_df2.pkl', 'rb')
df = pickle.load(infile)
infile.close()

df.head()
df.shape

Unnamed: 0,death,date,cdc_report_dt,onset_dt,age_group,hosp,icu,med_cond,"American Indian/Alaska Native, Non-Hispanic","Asian, Non-Hispanic",...,vent_incr,positive,pos_incr,pos_rate,pos_incr_rate,negative,neg_incr,recovered,tot_test_results,tot_test_results_incr
0,0,2020-04-01,2020-03-27,2020-03-27,3.0,0.0,,,0,0,...,,224086.0,25791.0,0.180594,0.232918,984489.0,82981.0,7084.0,1240829.0,110730.0
1,0,2020-04-01,2020-03-19,2020-03-19,3.0,,,,0,0,...,,224086.0,25791.0,0.180594,0.232918,984489.0,82981.0,7084.0,1240829.0,110730.0
2,0,2020-04-01,2020-03-30,2020-03-30,3.0,0.0,,,0,0,...,,224086.0,25791.0,0.180594,0.232918,984489.0,82981.0,7084.0,1240829.0,110730.0
3,0,2020-04-01,2020-04-01,2020-04-01,3.0,0.0,,,0,0,...,,224086.0,25791.0,0.180594,0.232918,984489.0,82981.0,7084.0,1240829.0,110730.0
4,0,2020-04-01,2020-04-14,2020-03-31,3.0,0.0,,0.0,0,0,...,,224086.0,25791.0,0.180594,0.232918,984489.0,82981.0,7084.0,1240829.0,110730.0


(671435, 45)

In [67]:
# Sample of df for imputing & testing models
df1 = df.sample(50000)

In [68]:
df1['death'].value_counts()

0    46717
1     3283
Name: death, dtype: int64

### Choosing Features

We've done some extensive feature analysis. From looking at correlations and this analysis, I will start with these features:

In [70]:
feat = ['hosp', 'age_group', 'med_cond', 'icu', 'pos_rate', 'pos_incr_rate', 'calc_pct_occ', 'month', 'Male']

df2 = df1[['death', 'hosp', 'age_group', 'icu', 'med_cond', 'pos_rate', 'pos_incr_rate', 'calc_pct_occ', 'month', 'Male']]

In [89]:
# Look at null values in this subset
df2.isna().sum()

death                0
hosp             12593
age_group           12
icu              30444
med_cond         17705
pos_rate             0
pos_incr_rate        0
calc_pct_occ         0
month                0
Male                 0
dtype: int64

In [71]:
# Impute null values with a constant, 0.5
df2_imputed = df2.fillna(0.5)
df2_imputed.isna().sum()

death            0
hosp             0
age_group        0
icu              0
med_cond         0
pos_rate         0
pos_incr_rate    0
calc_pct_occ     0
month            0
Male             0
dtype: int64

In [72]:
# Impute null values with KNNImputer
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2)
filled = imputer.fit_transform(df2)
df_filled = pd.DataFrame(filled, columns=df2.columns)
df_filled.isna().sum()

death            0
hosp             0
age_group        0
icu              0
med_cond         0
pos_rate         0
pos_incr_rate    0
calc_pct_occ     0
month            0
Male             0
dtype: int64

*Partition data*

I will reserve a portion of my data as a hold-out set here and not touch it until I am done with all of my modeling.

In [73]:
X_knni, y_knni = df_filled.iloc[:,1:], df_filled.iloc[:,0]

In [74]:
X_ci, y_ci = df2_imputed.iloc[:,1:], df2_imputed.iloc[:,0]

## Determining which models to use

I will try several different models for a baseline using the features that I selected through my earlier feature analysis. Since my classes are imbalanced, I will randomly oversample from the minority class before I run the models. I will use the default features for the models for now.

## Trying different preliminary models

In [15]:
models = {'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000),
          'KNN': KNeighborsClassifier(n_neighbors=5),
          'Naive Bayes': BernoulliNB(),
         'Decision Tree': DecisionTreeClassifier(),
         'Random Forest': RandomForestClassifier(),
         'XGBoost': XGBClassifier()}

In [88]:
# Evaluate scores for models using data imputed with KNNImputer
for model_name, model in models.items():
    eval(model_name, model, X_knni, y_knni)



Logistic Regression :
Accuracy score:  0.918375
Precision score:  0.4421338155515371
Recall score:  0.9314285714285714
F1 score:  0.5996321275291232
F-beta score:  0.7626325639426076
ROC-AUC score:  0.9763434941869724 

KNN :
Accuracy score:  0.9395
Precision score:  0.5235361653272101
Recall score:  0.8685714285714285
F1 score:  0.6532951289398281
F-beta score:  0.7674183776506227
ROC-AUC score:  0.9378339225991399 

Naive Bayes :
Accuracy score:  0.90925
Precision score:  0.4041944709246902
Recall score:  0.8076190476190476
F1 score:  0.5387547649301143
F-beta score:  0.6732295966973643
ROC-AUC score:  0.9187915273132663 

Decision Tree :
Accuracy score:  0.94825
Precision score:  0.581021897810219
Recall score:  0.758095238095238
F1 score:  0.6578512396694214
F-beta score:  0.7145421903052063
ROC-AUC score:  0.8685864628125497 

Random Forest :
Accuracy score:  0.9485
Precision score:  0.57991513437058
Recall score:  0.780952380952381
F1 score:  0.6655844155844156
F-beta score:  0.7

In [76]:
# Evaluate scores for models using data imputed with constant
for model_name, model in models.items():
    eval(model_name, model, X_ci, y_ci)



Logistic Regression :
Accuracy score:  0.899875
Precision score:  0.38995215311004783
Recall score:  0.9314285714285714
F1 score:  0.5497470489038785
F-beta score:  0.7289803220035778
ROC-AUC score:  0.969357795827361 

KNN :
Accuracy score:  0.924375
Precision score:  0.45698924731182794
Recall score:  0.8095238095238095
F1 score:  0.584192439862543
F-beta score:  0.7013201320132012
ROC-AUC score:  0.9078923395445134 

Naive Bayes :
Accuracy score:  0.77675
Precision score:  0.2007593735168486
Recall score:  0.8057142857142857
F1 score:  0.32142857142857145
F-beta score:  0.5027335393391965
ROC-AUC score:  0.8295336200031852 

Decision Tree :
Accuracy score:  0.93775
Precision score:  0.5216
Recall score:  0.6209523809523809
F1 score:  0.5669565217391304
F-beta score:  0.5981651376146788
ROC-AUC score:  0.7931438764134415 

Random Forest :
Accuracy score:  0.9425
Precision score:  0.5508607198748043
Recall score:  0.6704761904761904
F1 score:  0.6048109965635738
F-beta score:  0.64257

In [77]:
# Comparing to XGBoost sample run on non-imputed data with oversampling
model_scores_os(XGBClassifier(), df2.iloc[:,1:], df2.iloc[:,0])

Accuracy score:  0.9291
Precision score:  0.47865353037766833
Recall score:  0.8873668188736682
F1 score:  0.6218666666666667
Fbeta score (beta=2):  0.7579303172126886
ROC AUC score:  0.9682380495999658 



In [78]:
# Comparing to XGBoost sample run on non-imputed data with built-in weighting
model_scores(XGBClassifier(scale_pos_weight=14), df2.iloc[:,1:], df2.iloc[:,0])

Accuracy score:  0.9288
Precision score:  0.4772162386081193
Recall score:  0.8767123287671232
F1 score:  0.6180257510729613
Fbeta score (beta=2):  0.7509778357235983
ROC AUC score:  0.9669877952564132 



In [79]:
# Comparing to XGBoost run on ALL data with built-in weighting
model_scores(XGBClassifier(scale_pos_weight=14), df[feat], df['death'])

Accuracy score:  0.9128136007208442
Precision score:  0.4240967591874709
Recall score:  0.9354691597309315
F1 score:  0.5836119211892737
Fbeta score (beta=2):  0.7537065275302676
ROC AUC score:  0.9719762057660932 



In [81]:
# Comparing to XGBoost run on ALL data with oversampling
model_scores_os(XGBClassifier(), df[feat], df['death'])

Accuracy score:  0.9122402019555131
Precision score:  0.4225352112676056
Recall score:  0.9371793410101471
F1 score:  0.5824623560673162
Fbeta score (beta=2):  0.7536029924089624
ROC AUC score:  0.9717061614621211 

