# Covid Research 20201123

***
## 0. Import

In [158]:
import GAN
from GAN import *

import gc
import hdbscan
import importlib
import io
import math
import numpy as np
import operator
import pandas as pd
import psutil
from matplotlib import pyplot as plt

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, OrdinalEncoder, LabelEncoder

# Single seed reused as random_state for the train/test split and for every estimator below.
seed = 47

***
## 1. Read Data

In [159]:
# Load the spreadsheet; reset_index() turns the default row index into an
# explicit 'index' column (visible as column 0 in data.info()).
data = pd.read_excel('ed_pred.xlsx').reset_index()
data.head()

Unnamed: 0,index,patno,Admitted,AdmittingDepartment,COVIDResult,Age,FirstRace,Ethnicity,Sex,heart_rate,...,cmp_bicarbonate,cmp_bun,cmp_creatinine,cmp_glucose,cmp_alt,cmp_ast,cmp_alkaline_phosphatase,cmp_total_protein,cmp_albumin,cmp_bilirubin
0,0,1,1,UVHE MICU,None Detected,78,White or Caucasian,Non-Hispanic,Female,94.0,...,26,31,2.5,82.0,14,26.0,80.0,8.5,4.3,0.5
1,1,2,0,,None Detected,23,White or Caucasian,Non-Hispanic,Female,121.0,...,19,11,0.9,83.0,73,,100.0,8.1,4.6,0.6
2,2,3,0,,None Detected,55,African American,Non-Hispanic,Male,83.0,...,23,9,1.2,123.0,26,29.0,106.0,7.1,4.0,0.7
3,3,4,1,UVHE MICU,None Detected,50,White or Caucasian,Non-Hispanic,Male,88.0,...,30,45,1.6,297.0,22,,78.0,7.8,3.7,0.5
4,4,5,0,,None Detected,67,African American,Non-Hispanic,Female,90.0,...,27,25,1.0,96.0,12,18.0,122.0,7.4,3.5,0.2


In [160]:
# Dtypes and non-null counts per column; a non-null count below the row count
# marks a column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7380 entries, 0 to 7379
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     7380 non-null   int64  
 1   patno                     7380 non-null   int64  
 2   Admitted                  7380 non-null   int64  
 3   AdmittingDepartment       3003 non-null   object 
 4   COVIDResult               7380 non-null   object 
 5   Age                       7380 non-null   int64  
 6   FirstRace                 7379 non-null   object 
 7   Ethnicity                 7380 non-null   object 
 8   Sex                       7380 non-null   object 
 9   heart_rate                7352 non-null   float64
 10  sbp                       7292 non-null   float64
 11  dbp                       7292 non-null   float64
 12  pulse_ox                  7346 non-null   float64
 13  resp_rate                 7312 non-null   float64
 14  height  

***
## 2. Preprocess

### 2.1 Prepare

In [161]:
# Input columns used by the models and the column holding the target.
feature_cols = [
    'Age', 'FirstRace', 'Ethnicity', 'Sex', 'height', 'wght',
    'heart_rate', 'sbp', 'dbp', 'pulse_ox', 'resp_rate', 'cmp_glucose',
]
label_cols = ['COVIDResult']

# Categorical features. The order matters: categorical_preprocess addresses
# FirstRace and Ethnicity by position (0 and 1).
cat_cols = ['FirstRace', 'Ethnicity', 'Sex']
# Every remaining feature is treated as numeric, in feature_cols order.
num_cols = [col for col in feature_cols if col not in cat_cols]

# Keep only the modelling columns, then hold out 20% of the rows for testing.
data_w_features = data[feature_cols + label_cols]
train, test = train_test_split(data_w_features, test_size=0.2, random_state=seed)

### 2.2 Preprocessing

In [162]:
def _parse_numeric_string(text):
    """Convert a numeric string, optionally carrying one leading qualifier, to float.

    Strings that start with a digit, a sign or a decimal point ('12', '-5',
    '+3', '.5') are parsed unchanged.  For any other first character (for
    example '<5' or '>100') that single character is dropped before parsing.

    Raises:
        IndexError: if text is empty.
        ValueError: if the remaining text is not a valid float literal.
    """
    first = text[0]
    if '0' <= first <= '9' or first in '+-.':
        # Previously only digits were accepted here, so '-5' was parsed as
        # 5.0 and '.5' as 5.0 after the first character was stripped.
        return float(text)
    return float(text[1:])


def numerical_imputer_preprocess(x_num):
    """Convert string entries to floats and replace missing values with 0.

    Args:
        x_num: 2-D array-like of rows that support item assignment.  String
            cells are overwritten in place with their float value.

    Returns:
        The output of a SimpleImputer(strategy='constant', fill_value=0)
        fitted on (and applied to) the converted data.
    """
    for row in x_num:
        for col, value in enumerate(row):
            if isinstance(value, str):
                row[col] = _parse_numeric_string(value)

    imp = SimpleImputer(strategy='constant', fill_value=0)
    return imp.fit_transform(x_num)

def numerical_normalizer_process(x_num):
    """Return x_num transformed by a freshly fitted sklearn Normalizer.

    With its default settings Normalizer rescales each row (sample) to unit
    L2 norm; it does not standardize the individual feature columns.
    """
    return Normalizer().fit_transform(x_num)

def numerical_preprocess(x_num):
    """Clean, impute and normalize the numeric feature matrix.

    Args:
        x_num: 2-D array of numeric features, possibly containing numeric
            strings and missing values.

    Returns:
        The matrix after numerical_imputer_preprocess followed by
        numerical_normalizer_process.
    """
    # One imputation pass is sufficient: after it, no strings or missing
    # values remain, so repeating it only costs a second full scan and fit.
    x_num = numerical_imputer_preprocess(x_num)
    return numerical_normalizer_process(x_num)
    
def categorical_preprocess(x_cat):
    """Fill non-string cells in the first two columns, then ordinal-encode.

    Column 0 falls back to 'NA' and column 1 to '*Unspecified' whenever the
    cell is not a string; other columns are passed through untouched.  The
    rows of x_cat are modified in place.

    NOTE(review): a new OrdinalEncoder is fitted on every call, so the integer
    codes of two calls are only comparable if both inputs contain the same
    category values.

    Returns:
        The OrdinalEncoder-transformed matrix.
    """
    placeholders = ('NA', '*Unspecified')
    for row in x_cat:
        for col, fallback in enumerate(placeholders):
            if not isinstance(row[col], str):
                row[col] = fallback

    return OrdinalEncoder().fit_transform(x_cat)
    
def y_preprocess(y):
    """Label-encode y and return 1 minus each code.

    For a two-class target this swaps the codes, so the class that
    LabelEncoder assigns code 0 ends up as 1.  A fresh encoder is fitted on
    every call.
    """
    encoded = LabelEncoder().fit_transform(y)
    return 1 - encoded

def preprocess(dataset):
    """Turn a DataFrame into a feature matrix and a label vector.

    Numeric columns (num_cols) and categorical columns (cat_cols) are
    processed separately and concatenated column-wise, numeric block first.

    Returns:
        (x, y): the combined feature matrix and the encoded labels taken from
        label_cols.
    """
    numeric_block = numerical_preprocess(dataset[num_cols].values)
    categorical_block = categorical_preprocess(dataset[cat_cols].values)

    features = np.concatenate((numeric_block, categorical_block), axis=1)
    labels = y_preprocess(dataset[label_cols].values.ravel())

    return features, labels

# Each call fits its own imputer and encoders on the frame it is given.
# NOTE(review): test-set category/label codes therefore only line up with the
# training codes when both splits contain the same set of values — confirm.
x_train, y_train = preprocess(train)
x_test, y_test = preprocess(test)

***
## 3. Logistic Regression

In [163]:
def evaluation(y_test, y_pred):
    """Print the confusion matrix followed by accuracy, precision, recall and F1.

    Each score is computed from (y_test, y_pred) and shown as a percentage
    with two decimals.
    """
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    report_lines = (
        ("Accuracy: {:.2f}%", accuracy_score),
        ("Precision: {:.2f}%", precision_score),
        ("Recall: {:.2f}%", recall_score),
        ("F1 score: {:.2f}%", f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(y_test, y_pred) * 100))

# Baseline: a single logistic regression on the preprocessed features.
lr = LogisticRegression(max_iter=500, random_state=seed)
y_pred = lr.fit(x_train, y_train).predict(x_test)
evaluation(y_test, y_pred)

Confusion matrix:
[[1262  114]
 [  54   46]]
Accuracy: 88.62%
Precision: 28.75%
Recall: 46.00%
F1 score: 35.38%


---
## 4. Bagging

### 4.1 Bagging Classifier with Logistic Regression

In [164]:
# The wrapped estimator is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs unchanged on both.
clf = BaggingClassifier(LogisticRegression(random_state=seed, max_iter=500), random_state=seed)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
evaluation(y_test, y_pred)

Confusion matrix:
[[1266  110]
 [  54   46]]
Accuracy: 88.89%
Precision: 29.49%
Recall: 46.00%
F1 score: 35.94%


### 4.2 Balanced Bagging Classifier with Logistic Regression

In [165]:
# The wrapped estimator is passed positionally: its keyword was `base_estimator`
# in older imbalanced-learn releases and is `estimator` in newer ones, so the
# positional form runs unchanged on both.
# NOTE(review): the recorded output for this cell predicts almost every sample
# as positive (100% recall, ~8% accuracy) — worth investigating before use.
bbc = BalancedBaggingClassifier(LogisticRegression(random_state=seed, max_iter=500),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=seed)
bbc.fit(x_train, y_train)
y_pred = bbc.predict(x_test)
evaluation(y_test, y_pred)

Confusion matrix:
[[  17 1359]
 [   0  100]]
Accuracy: 7.93%
Precision: 6.85%
Recall: 100.00%
F1 score: 12.83%


---
## 5. Boosting

### 5.1 Gradient Boosting Classifier

In [166]:
# Gradient boosting over 100 depth-1 trees (stumps) with an unshrunk learning rate.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=seed,
)
y_pred = clf.fit(x_train, y_train).predict(x_test)
evaluation(y_test, y_pred)

Confusion matrix:
[[1055  321]
 [  77   23]]
Accuracy: 73.04%
Precision: 6.69%
Recall: 23.00%
F1 score: 10.36%
