In [1]:
# ----------------------------------------------------------------------------------------------------------------------
# Name:         tiffany_main_notebook.ipynb
# Purpose:      Open Project Summer - 2022: Predict mortality with medical data
#               CodaLab competition: https://competitions.codalab.org/competitions/27605#learn_the_details
#
# Author(s):    Tiffany Kashima
#
# Created:      6/20/2022
# Updated:      7/21/2022
# Update Comment(s):
#
# TO DO:
#
# FUTURE WORK:
#
# BUGS TO FIX:
#
# ----------------------------------------------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import time
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
import numpy as np
import tensorflow as tf
import random as python_random

# Seed every random number generator used in this notebook so that each one
# starts from a well-defined initial state.

# Core Python generator (the `random` module, imported as `python_random`).
python_random.seed(123)

# NumPy's global generator.
np.random.seed(123)

# TensorFlow backend generator. For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(1234)

In [3]:
# _______________________ Import __________________________________________________________
# Read the space-delimited training matrix and the feature-name file.
# Neither file has a header row (header=None).
all_data = pd.read_csv('mimic_synthetic_train.csv', delimiter=' ', header=None)
col_names = pd.read_csv('mimic_synthetic_feat.csv', delimiter=' ', header=None)

# Discard the first column of the training file and label the remaining ones.
# `col_names` is a DataFrame, so it is flattened to a 1-D array before being
# used as column labels. The non-inplace form of set_axis is used because the
# `inplace` argument was removed in pandas 2.0.
# NOTE(review): assumes the feature file holds exactly one name per remaining
# data column - confirm against the file.
all_data = all_data.iloc[:, 1:].set_axis(col_names.to_numpy().ravel(), axis=1)

# Attach the target as the last column. `labels` is a one-column DataFrame,
# so its single column is selected explicitly.
labels = pd.read_csv('mimic_synthetic_train_labels.csv', delimiter=' ', header=None)
all_data['DIED'] = labels.iloc[:, 0]

# Keep only rows without missing values and renumber them 0..n-1; later cells
# concatenate freshly built frames column-wise and rely on this clean index.
all_data = all_data.dropna().reset_index(drop=True)

In [4]:
# _______________________ Identify constant columns_________________________________
# A column with exactly one distinct value cannot help separate the classes.
# The list is kept in `non_dups` because the test-set cell drops the same columns.
non_dups = [column for column in all_data if len(all_data[column].unique()) == 1]

# _______________________ Drop non-informative _________________________________
# Remove the constant columns, then discard the first four remaining columns.
all_data = all_data.drop(columns=non_dups).iloc[:, 4:]

In [5]:
# _______________________ Just the categorical _________________________________

# Labels of every object-dtype column; later cells treat these as the categorical features.
categorical_variables = all_data.select_dtypes(include='object').columns

In [6]:
# _______________________ Feature Selection _________________________________

# Work on a copy so `all_data` keeps its original categorical values; the copy
# gets integer codes in place of each categorical column.
temp_data = all_data.copy()

# One encoder instance is re-fitted for every column, so after the loop it only
# remembers the classes of the last categorical column.
label_encoder = LabelEncoder()

for cat_col in categorical_variables:
    encoded_codes = label_encoder.fit_transform(temp_data[cat_col])
    temp_data[cat_col] = encoded_codes

In [7]:
# CHI Squared Test for Categorical Data
# NOTE(review): chi2 is documented for non-negative count/frequency-like
# features; here it receives label-encoded category codes - confirm this is the
# intended test.

X = temp_data[categorical_variables]
y = temp_data.iloc[:, -1]  # last column of temp_data is used as the target

# chi2 returns a pair of arrays; index 1 (used below) is the per-feature p-value.
p_score = chi2(X, y)

# Rank features by ascending p-value with a clean 0..n-1 index.
feat_p_values = (
    pd.DataFrame({'Specs': X.columns, 'P_Value': p_score[1]})
    .sort_values(by=['P_Value'])
    .reset_index(drop=True)
)

# Features whose p-value exceeds 0.01 are flagged as unwanted.
above_threshold = feat_p_values['P_Value'] > 0.01
unwanted_categorical_features = feat_p_values.loc[above_threshold, 'Specs'].tolist()

# unwanted_categorical_features = categorical_variables

In [8]:
# ANOVA Test for Numerical Features

# Take the first 164 columns and remove the categorical ones. The drop is done
# as part of the same expression (returning a new frame) instead of in place on
# a slice of `temp_data`, which avoids pandas' chained-assignment warning.
# NOTE(review): 164 is hard-coded; presumably it is the number of feature
# columns ahead of the target - confirm.
X = temp_data.iloc[:, :164].drop(columns=categorical_variables)
y = temp_data.iloc[:, -1]  # last column of temp_data is used as the target

# f_classif returns (F statistics, p-values).
f_score = f_classif(X, y)

# `f_score[1]` is the p-value array, so the 'F_Value' column actually holds
# p-values; the column name is kept unchanged for compatibility.
feat_f_values = (
    pd.DataFrame({'Specs': X.columns, 'F_Value': f_score[1]})
    .sort_values(by=['F_Value'])
    .reset_index(drop=True)
)

# Features whose p-value exceeds 0.01 are flagged as unwanted.
unwanted_numerical_features = list(feat_f_values[feat_f_values['F_Value'] > 0.01]['Specs'])

In [9]:
# Applying feature selection (currently disabled: the drops below are commented out, so every feature is kept)

# all_data.drop(unwanted_numerical_features, axis=1, inplace=True)
# all_data.drop(unwanted_categorical_features, axis=1, inplace=True)
# categorical_variables = categorical_variables.drop(unwanted_categorical_features)

In [10]:
#____________________________ One-hot encoding______________________

# Split the categorical columns off; `all_data` keeps the remaining columns.
cats = all_data[categorical_variables]
all_data = all_data.drop(columns=categorical_variables)

# Categories not seen during fit are encoded as all-zero rows at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
# Fit encoding
enc.fit(cats)
# Make conversion (dense array so it can be wrapped in a DataFrame)
feat = enc.transform(cats).toarray()

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back only on older
# releases. The two may label columns differently; later cells in this
# notebook use the columns by position, not by these names.
if hasattr(enc, 'get_feature_names_out'):
    feat_names = enc.get_feature_names_out()
else:
    feat_names = enc.get_feature_names()

# Reuse all_data's index so the column-wise concat pairs rows explicitly
# instead of relying on both frames happening to share a default index.
cat_data = pd.DataFrame(feat, columns=feat_names, index=all_data.index)

# Encoded columns go first; the remaining columns (ending with 'DIED') follow.
all_data = pd.concat([cat_data, all_data], axis=1)





In [11]:
#_______________________test_train split_________________________________________

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Features are every column except the
# last one; the target is the 'DIED' column. The fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_data.iloc[:, :-1],
    all_data['DIED'],
    test_size=0.2,
    random_state=42,
)

In [12]:
#____________________________ Upsampling _________________________________

from sklearn.utils import resample

# Re-join training features and target so whole rows can be resampled together.
all_data_train = pd.concat([X_train, y_train], axis=1)

# Separate the two classes by the value of 'DIED'.
ones = all_data_train.loc[all_data_train['DIED'].eq(1)]
zeros = all_data_train.loc[all_data_train['DIED'].eq(0)]

# Draw DIED == 1 rows with replacement until there are as many of them as
# DIED == 0 rows, then stack both groups under a fresh 0..n-1 index.
upsampled = resample(ones, replace=True, n_samples=zeros.shape[0], random_state=42)
all_data_train = pd.concat([zeros, upsampled], axis=0, ignore_index=True)

In [13]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# Every column of the training frame except the last ('DIED') is a model input.
# Deriving the input width from the data, instead of hard-coding 305, keeps the
# model buildable when the number of features changes (for example if the
# feature-selection drops are enabled).
n_features = all_data_train.shape[1] - 1

# Fully connected binary classifier: three ReLU hidden layers and a sigmoid output.
model = Sequential()
model.add(Dense(305, input_dim=n_features, activation='relu'))
model.add(Dense(230, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train on the upsampled training frame; `history` keeps the per-epoch metrics.
history = model.fit(all_data_train.iloc[:, :-1], all_data_train['DIED'], epochs=25, batch_size=64)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

# Turn the sigmoid outputs into hard labels: below 0.5 -> 0, anything else -> 1.
# The result is flattened to a plain list of Python ints.
preds = np.where(model.predict(X_test) < 0.5, 0, 1).ravel().tolist()

# Score the held-out split with three complementary metrics.
f1 = f1_score(y_test, preds)
acc = accuracy_score(y_test, preds)
bal_acc = balanced_accuracy_score(y_test, preds)

print('F1 Score:', f1)
print('Accuracy:', acc)
print('Balanced Accuracy:', bal_acc)

F1 Score: 0.19703342926721273
Accuracy: 0.773284160520065
Balanced Accuracy: 0.7684776639892644


In [15]:
#______________________________________ Predict test case & save _________________________________________
# Read the space-delimited test matrix and the feature-name file (no header rows).
test_data = pd.read_csv('mimic_synthetic_test.csv', delimiter=' ', header=None)
col_names = pd.read_csv('mimic_synthetic_feat.csv', delimiter=' ', header=None)

# Discard the first column and label the remaining ones. `col_names` is a
# DataFrame, so it is flattened to a 1-D array before being used as labels.
# The non-inplace form of set_axis is used because the `inplace` argument was
# removed in pandas 2.0.
# NOTE(review): assumes the feature file holds exactly one name per remaining
# data column - confirm against the file.
test_data = test_data.iloc[:, 1:].set_axis(col_names.to_numpy().ravel(), axis=1)

# Missing values are filled with the *string* '0' rather than dropped, so every
# test row still receives a prediction. Numeric columns that pick up the string
# are converted back by the astype('float') call further down.
# NOTE(review): presumably a string is used so categorical columns stay
# string-typed for the encoder - confirm.
test_data = test_data.fillna('0')

# Remove the columns that were found to be constant in the training data.
test_data = test_data.drop(columns=non_dups)

# _______________________ Drop non-informative _________________________________
test_data = test_data.iloc[:, 4:]

# # _________________________ Feature Selection __________________________________

# test_data.drop(unwanted_numerical_features, axis=1, inplace=True)
# test_data.drop(unwanted_categorical_features, axis=1, inplace=True)

# _______________________ Just the categorical _________________________________

cats = test_data[categorical_variables]
test_data = test_data.drop(columns=categorical_variables)

#____________________________ One-hot encoding_________________________________

# Reuse the encoder fitted on the training data (transform only, no re-fit).
feat = enc.transform(cats).toarray()

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back only on older
# releases. The model consumes the columns by position, so the exact labels do
# not affect the prediction.
if hasattr(enc, 'get_feature_names_out'):
    feat_names = enc.get_feature_names_out()
else:
    feat_names = enc.get_feature_names()

# Reuse test_data's index so the column-wise concat pairs rows explicitly.
cat_data = pd.DataFrame(feat, columns=feat_names, index=test_data.index)

#______________________________ Make prediction_________________________________
# Encoded columns first, then the remaining columns.
test_data = pd.concat([cat_data, test_data], axis=1)

test_data = test_data.astype('float')

# Threshold the sigmoid outputs: below 0.5 -> 0, anything else -> 1.
preds = model.predict(test_data)
preds = [0 if pred < 0.5 else 1 for pred in preds]

# Write one integer class label per line. Without fmt='%d', savetxt would emit
# the labels in its default '%.18e' floating-point notation.
np.savetxt("mimic_synthetic_test_prediction.csv", preds, fmt='%d', delimiter=",")


