In [1]:
# ----------------------------------------------------------------------------------------------------------------------
# Name:         main.py
# Purpose:      Open Project Fall - 2022: Predict mortality with medical data
#               CodaLab competition: https://competitions.codalab.org/competitions/27605#learn_the_details
#
# Author(s):    David Little
#
# Created:      12/17/2021
# Updated:      12/17/2021
# Update Comment(s):
#
# TO DO:
#
# FUTURE WORK:
#
# BUGS TO FIX:
#
# ----------------------------------------------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import time
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## Import

In [2]:
# _______________________ Import __________________________________________________________
all_data = pd.read_csv('mimic_synthetic_train.csv', delimiter=' ', header=None)
col_names = pd.read_csv('mimic_synthetic_feat.csv', delimiter=' ', header=None)
all_data = all_data.iloc[:,1:]
all_data.set_axis(col_names, axis=1, inplace=True)

labels = pd.read_csv('mimic_synthetic_train_labels.csv', delimiter=' ', header=None)
all_data['DIED'] = labels
all_data.dropna(inplace=True)
all_data = all_data.reset_index().iloc[:,1:]

## Identify Constant Columns

In [3]:
# _______________________ Identify constant columns_________________________________
non_dups = []
for column in all_data:
    if all_data[column].unique().size == 1:
        non_dups.append(column)

all_data.drop(non_dups, axis=1, inplace=True)

# _______________________ Drop non-informative _________________________________
all_data = all_data.iloc[:,4:]

In [4]:
all_data

Unnamed: 0,"(INSURANCE,)","(LANGUAGE,)","(RELIGION,)","(MARITAL_STATUS,)","(ETHNICITY,)","(GENDER,)","(AGE,)","(HR_MIN_DAY1,)","(HR_MAX_DAY1,)","(HR_MEAN_DAY1,)",...,"(Coma/brndmg,)","(Retinaldx,)","(Glaucoma,)","(Othereyedx,)","(Othnervdx,)","(Hrtvalvedx,)","(Carditis,)","(HTN,)","(Htncomplicn,)",DIED
0,Private,ENGL,CATHOLIC,DIVORCED,WHITE,F,21,57.207630,108.704030,70.841980,...,0,0,0,0,0.0,0,0,0,0,0
1,Private,ENGL,UNOBTAINABLE,SINGLE,WHITE,M,39,79.331400,121.138664,96.915726,...,0,0,0,0,0.0,0,0,0,1,0
2,Medicaid,SPAN,CATHOLIC,WIDOWED,OTHER,M,40,118.230290,175.175430,140.936700,...,0,0,0,0,0.0,0,0,0,0,0
3,Medicare,ENGL,NOT_SPECIFIED,MARRIED,WHITE,M,75,83.780380,89.188980,86.051155,...,0,0,0,0,0.0,0,0,1,0,0
4,Medicare,ENGL,UNOBTAINABLE,MARRIED,UNKNOWN/NOT_SPECIFIED,F,55,60.878613,105.637500,80.616840,...,0,0,0,0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79981,Medicare,ENGL,NOT_SPECIFIED,UNKNOWN_(DEFAULT),WHITE,F,76,56.035793,97.863950,79.709885,...,0,0,0,0,0.0,0,0,1,0,0
79982,Medicaid,ENGL,JEWISH,MARRIED,WHITE,F,38,55.126850,86.027390,67.647910,...,0,0,0,0,1.0,0,0,0,0,0
79983,Private,ENGL,NOT_SPECIFIED,SEPARATED,WHITE,F,90,74.281746,91.267840,91.425380,...,0,0,0,0,0.0,0,0,0,0,0
79984,Government,ENGL,CATHOLIC,MARRIED,BLACK/AFRICAN_AMERICAN,M,57,79.256140,128.995130,98.742550,...,0,0,0,0,0.0,0,0,1,0,0


## Categorical Data

In [5]:
# _______________________ Just the categorical _________________________________

categorical_variables = all_data.select_dtypes(include='O').keys()
categorical_data = all_data[categorical_variables]
categorical_data

Unnamed: 0,"(INSURANCE,)","(LANGUAGE,)","(RELIGION,)","(MARITAL_STATUS,)","(ETHNICITY,)","(GENDER,)","(ICU,)"
0,Private,ENGL,CATHOLIC,DIVORCED,WHITE,F,SICU
1,Private,ENGL,UNOBTAINABLE,SINGLE,WHITE,M,CCU
2,Medicaid,SPAN,CATHOLIC,WIDOWED,OTHER,M,MICU
3,Medicare,ENGL,NOT_SPECIFIED,MARRIED,WHITE,M,CCU
4,Medicare,ENGL,UNOBTAINABLE,MARRIED,UNKNOWN/NOT_SPECIFIED,F,MICU
...,...,...,...,...,...,...,...
79981,Medicare,ENGL,NOT_SPECIFIED,UNKNOWN_(DEFAULT),WHITE,F,CSRU
79982,Medicaid,ENGL,JEWISH,MARRIED,WHITE,F,TSICU
79983,Private,ENGL,NOT_SPECIFIED,SEPARATED,WHITE,F,SICU
79984,Government,ENGL,CATHOLIC,MARRIED,BLACK/AFRICAN_AMERICAN,M,TSICU


## Feature Selection

In [6]:
# Creating copy of data and performing label encoding on categorical data

temp_data = all_data.copy()

label_encoder = LabelEncoder()

for column in categorical_variables:
    temp_data[column] = label_encoder.fit_transform(temp_data[column])

### CHI SQUARED Test on Categorical Data

In [7]:
X = temp_data[categorical_variables]
y = temp_data.iloc[:,-1]

p_score = chi2(X,y)

feat_p_values = pd.DataFrame({'Specs': X.columns, 'P_Value': p_score[1]})
feat_p_values = feat_p_values.sort_values(by=['P_Value'])
feat_p_values = feat_p_values.reset_index().drop('index', axis=1)
unwanted_categorical_features = list(feat_p_values[feat_p_values['P_Value']>0.05]['Specs'])

### ANOVA Test on Numerical Features

In [8]:
X = temp_data.iloc[:,:164]
y = temp_data.iloc[:,-1]

X.drop(categorical_variables, axis=1, inplace=True)

f_score = f_classif(X,y)

feat_f_values = pd.DataFrame({'Specs': X.columns, 'F_Value': f_score[1]})
feat_f_values = feat_f_values.sort_values(by=['F_Value'])
feat_f_values = feat_f_values.reset_index().drop('index', axis=1)
unwanted_numerical_features = list(feat_f_values[feat_f_values['F_Value']>0.05]['Specs'])

## Updating Data after feature selection

In [9]:
all_data.drop(unwanted_categorical_features, axis=1, inplace=True)
all_data.drop(unwanted_numerical_features, axis=1, inplace=True)
categorical_variables = categorical_variables.drop(unwanted_categorical_features)
all_data

Unnamed: 0,"(RELIGION,)","(GENDER,)","(AGE,)","(HR_MIN_DAY1,)","(HR_MAX_DAY1,)","(HR_MEAN_DAY1,)","(SYS_BP_MIN_DAY1,)","(SYS_BP_MAX_DAY1,)","(SYS_BP_MEAN_DAY1,)","(DIAS_BP_MIN_DAY1,)",...,"(Hepatitis,)","(Acutp-hanm,)","(Coag/hemrdx,)","(Screeningandhistoryofmentalhealthan,)","(Epilepsy/cnv,)","(Othereyedx,)","(Othnervdx,)","(Hrtvalvedx,)","(Htncomplicn,)",DIED
0,CATHOLIC,F,21,57.207630,108.704030,70.841980,79.847060,119.72417,94.435090,26.880910,...,0,0,0,0,0,0,0.0,0,0,0
1,UNOBTAINABLE,M,39,79.331400,121.138664,96.915726,93.994540,138.59406,113.570206,38.621536,...,0,0,0,0,0,0,0.0,0,1,0
2,CATHOLIC,M,40,118.230290,175.175430,140.936700,122.862610,144.25964,134.501220,67.754650,...,0,0,1,1,0,0,0.0,0,0,0
3,NOT_SPECIFIED,M,75,83.780380,89.188980,86.051155,111.390740,144.86057,127.681170,61.139935,...,0,0,0,0,0,0,0.0,0,0,0
4,UNOBTAINABLE,F,55,60.878613,105.637500,80.616840,87.627464,127.75874,99.098860,42.693020,...,0,0,0,0,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79981,NOT_SPECIFIED,F,76,56.035793,97.863950,79.709885,77.546234,120.30758,102.489240,26.716347,...,0,0,0,0,0,0,0.0,0,0,0
79982,JEWISH,F,38,55.126850,86.027390,67.647910,83.051470,174.83446,119.396470,40.401726,...,0,0,0,0,0,0,1.0,0,0,0
79983,NOT_SPECIFIED,F,90,74.281746,91.267840,91.425380,95.857544,138.79803,130.716320,52.676624,...,0,0,0,0,0,0,0.0,0,0,0
79984,CATHOLIC,M,57,79.256140,128.995130,98.742550,107.726430,141.92834,132.398940,57.103550,...,0,0,0,0,0,0,0.0,0,0,0


## One Hot Encoding

In [11]:
#____________________________ One-hot encoding______________________

cats = all_data[categorical_variables]
all_data.drop(cats, axis=1, inplace=True)

enc = OneHotEncoder(handle_unknown='ignore')
# Fit encoding
enc.fit(cats)
# Make conversion
feat = enc.transform(cats).toarray()
feat_names = enc.get_feature_names()
cat_data = pd.DataFrame(feat, columns=feat_names)

all_data = pd.concat([cat_data,all_data], axis=1)
all_data

Unnamed: 0,x0_7TH_DAY_ADVENTIST,x0_BAPTIST,x0_BUDDHIST,x0_CATHOLIC,x0_CHRISTIAN_SCIENTIST,x0_EPISCOPALIAN,x0_GREEK_ORTHODOX,x0_HEBREW,x0_HINDU,x0_JEHOVAH'S_WITNESS,...,"(Hepatitis,)","(Acutp-hanm,)","(Coag/hemrdx,)","(Screeningandhistoryofmentalhealthan,)","(Epilepsy/cnv,)","(Othereyedx,)","(Othnervdx,)","(Hrtvalvedx,)","(Htncomplicn,)",DIED
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,1,0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0
79982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1.0,0,0,0
79983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0
79984,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0,0,0


In [None]:
#_______________________test_train split_________________________________________

In [None]:
#____________________________ Upsampling _________________________________

In [None]:
# _______________________ Modeling _________________________________

import xgboost as xgb

dtrain = xgb.DMatrix(all_data_train.iloc[:,:-1], enable_categorical=True, label=all_data_train['DIED'])

print("Booster parameters")
param = {'max_depth': 10, 'eta': 0.2, 'objective': 'binary:hinge'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
print("train xgboost")
num_round = 20
cls = xgb.train(param, dtrain, num_round)
#bst.save_model('xgboost.model')
dtest = xgb.DMatrix(X_test)

preds = cls.predict(dtest)

from sklearn.metrics import f1_score

acc = f1_score(y_test, preds)
print(acc)

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, preds)
print(acc)


from sklearn.metrics import balanced_accuracy_score
acc = balanced_accuracy_score(y_test, preds)
print(acc)