## Read the feature selected data frame via mutual information

In [3]:
import pandas as pd
import numpy as np
import time

np.random.seed(33)
df = pd.read_csv("../data/featureSelectedAllDataWithY.csv")
# df = pd.read_csv("../data/Cleaned_dat_encoded.csv")
print(df.shape)
df.head()

(534730, 21)


Unnamed: 0,esi,age,ethnicity,race,lang,maritalstatus,employstatus,insurance_status,arrivalmode,previousdispo,...,meds_analgesics,meds_antiplateletdrugs,meds_cardiacdrugs,meds_cardiovascular,meds_diuretics,meds_elect/caloric/h2o,meds_gastrointestinal,meds_psychotherapeuticdrugs,meds_vitamins,disposition
0,4,40,1,8,1,8,2,4,6,7,...,0,0,0,0,0,0,0,0,0,2
1,4,66,1,4,1,5,3,1,2,7,...,0,0,0,0,0,0,0,0,0,2
2,2,66,1,4,1,5,3,1,6,3,...,0,0,0,0,0,0,0,0,0,2
3,2,66,1,4,1,5,3,1,2,3,...,0,0,0,0,0,0,0,0,0,2
4,3,84,1,5,2,10,6,3,6,3,...,0,0,0,2,1,2,2,0,1,1


## Convert the label to 0,1 format

In [4]:
df['disposition'].replace(1,0,inplace=True)
df['disposition'].replace(2,1,inplace=True)

print(df['disposition'])
## Split the data into training and testing data
from sklearn.model_selection import train_test_split

training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)

0         1
1         1
2         1
3         1
4         0
         ..
534725    0
534726    0
534727    1
534728    0
534729    0
Name: disposition, Length: 534730, dtype: int64


In [5]:
from sklearn.preprocessing import StandardScaler
y_train = training_data['disposition']
y_test = testing_data['disposition']
X_train = StandardScaler().fit_transform(training_data.drop("disposition",axis = 1))
X_test = StandardScaler().fit_transform(testing_data.drop("disposition",axis = 1))

y_train = np.array(y_train)
y_test = np.array(y_test)
X_train = np.array(X_train)
X_test = np.array(X_test)
# y_train = np.array(y_train)[:100]
# y_test = np.array(y_test)[:10]
# X_train = np.array(X_train)[:100,]
# X_test = np.array(X_test)[:10,]

## Create a DNN model 

In [6]:
def create_model(optimizer='adam'):
    model = Sequential([
        Dense(30, activation='relu',input_shape=(20,)),
        Dense(30, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer = optimizer, loss = 'binary_crossentropy' ,weighted_metrics=['accuracy'])
    return model

## Train with parameters in the paper

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score
np.random.seed(52)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow_core.python.keras.layers import Dense
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# batch_size = best_clf.best_params_['batch_size']
# epochs = best_clf.best_params_['epochs']
# optimizer = best_clf.best_params_['optimizer']

## Identified the results
batch_size = 128
epochs = 30
optimizer = 'RMSprop'

class_weight ={
    1:0.445,
    0:1
}

clf = create_model(optimizer)
history = clf.fit(X_train, y_train,batch_size=batch_size,epochs=epochs,class_weight=class_weight)
clf.save("dnn.h5")
# clf.fit(X_train, y_train,batch_size=batch_size,epochs=epochs)
y_pred = clf.predict_classes(X_test)
print(y_pred)

mylist = []
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)

Train on 427784 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[1]
 [0]
 [1]
 ...
 [1]
 [1]
 [0]]
[[25447  7388]
 [10675 63436]]
0.8311016774820937


## Visualize the matrix results

In [8]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score, average_precision_score
from prettytable import PrettyTable
accuracy_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
MCCs = []
auROCs = []
auPRCs = []

accuracy_scores.append(accuracy_score(y_true=y_test, y_pred=y_pred))
f1_scores.append(f1_score(y_true=y_test, y_pred=y_pred))
recall_scores.append(recall_score(y_true=y_test, y_pred=y_pred))
precision_scores.append(precision_score(y_true=y_test, y_pred=y_pred))
MCCs.append(matthews_corrcoef(y_true=y_test, y_pred=y_pred))
auROCs.append(roc_auc_score(y_true=y_test, y_score=clf.predict(X_test)))
auPRCs.append(average_precision_score(y_true=y_test,  y_score=clf.predict(X_test)))

table = PrettyTable()
column_names = ['Accuracy', 'auROC', 'auPRC', 'recall', 'precision', 'f1', 'MCC']
table.add_column(column_names[0], np.round(accuracy_scores, 4))
table.add_column(column_names[1], np.round(auROCs, 4))
table.add_column(column_names[2], np.round(auPRCs, 4))
table.add_column(column_names[3], np.round(recall_scores, 4))
table.add_column(column_names[4], np.round(precision_scores, 4))
table.add_column(column_names[5], np.round(f1_scores, 4))
table.add_column(column_names[6], np.round(MCCs, 4))
print(table)

+----------+--------+--------+--------+-----------+--------+--------+
| Accuracy | auROC  | auPRC  | recall | precision |   f1   |  MCC   |
+----------+--------+--------+--------+-----------+--------+--------+
|  0.8311  | 0.9035 | 0.9508 | 0.856  |   0.8957  | 0.8754 | 0.6154 |
+----------+--------+--------+--------+-----------+--------+--------+


## Grid Search for DNN

In [88]:
batch_size = [16, 32, 64]
epochs = [20, 50, 80, 110]
optimizer = ['SGD', 'Adam','RMSprop']
param_grid = dict(batch_size=batch_size,
                      epochs=epochs,
                      optimizer=optimizer)

In [15]:
best_clf = clf.fit(X_train, y_train)np.random.seed(52)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow_core.python.keras.layers import Dense
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# create model
model =KerasClassifier(build_fn=create_model,verbose=0)

## add the compile method for the neural network

clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,n_jobs=-1)
# clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,n_jobs=-1)
best_clf = clf.fit(X_train, y_train)

In [None]:
print("Best cross validation accuracy: {:.2f}".format(best_clf.best_score_))
print("Test set score: {:.2f}".format(best_clf.score(X_test,y_test)))
print("Best parameters: {}".format(best_clf.best_params_))

## Reason for no outputs:

This grid search method would cost a whole day to generate the best hyperparameters. So we recorded the hyperparameters, but once a time, I incidently run this block again, so the output is missing. So we would not run this grid search again and use the returned parameters to train the previous model