In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
# _______________________ Import __________________________________________________________
# Feature matrix: space-delimited text with no header row.
all_data = pd.read_csv('mimic_synthetic_train.csv', delimiter=' ', header=None)
# Feature names, also read without a header; each row supplies one column label.
col_names = pd.read_csv('mimic_synthetic_feat.csv', delimiter=' ', header=None)
# Discard the first column of the raw file (presumably a row identifier -- TODO confirm).
all_data = all_data.iloc[:,1:]
# Attach the labels by plain assignment instead of `set_axis(..., inplace=True)`:
# the `inplace` argument no longer exists in pandas 2.x. Every row of `col_names`
# is turned into a tuple explicitly, e.g. ('INSURANCE',), so the labels stay the
# 1-tuples that later cells rely on (they read the name through `variable[0]`)
# without depending on pandas converting a DataFrame into an Index implicitly.
all_data.columns = pd.Index([tuple(row) for row in col_names.values], tupleize_cols=False)

# Outcome labels, stored as the last column under the plain string name 'DIED'.
labels = pd.read_csv('mimic_synthetic_train_labels.csv', delimiter=' ', header=None)
all_data['DIED'] = labels

In [3]:
# _______________________ Identify constant columns_________________________________
# Despite its name, `non_dups` collects the labels of columns that hold a single
# distinct value (NaN counts as a value here, as it does for Series.unique()).
non_dups = [
    label for label in all_data
    if len(all_data[label].unique()) == 1
]

# Constant columns cannot separate the classes, so they are removed.
all_data = all_data.drop(non_dups, axis=1)

# _______________________ Drop non-informative _________________________________
# Positional drop of the first four remaining columns.
# NOTE(review): re-running this cell removes four more columns each time.
all_data = all_data.iloc[:, 4:]

In [4]:
# Display the frame after column pruning; pandas abbreviates the rendering with "..." for large frames.
all_data

Unnamed: 0,"(INSURANCE,)","(LANGUAGE,)","(RELIGION,)","(MARITAL_STATUS,)","(ETHNICITY,)","(GENDER,)","(AGE,)","(HR_MIN_DAY1,)","(HR_MAX_DAY1,)","(HR_MEAN_DAY1,)",...,"(Coma/brndmg,)","(Retinaldx,)","(Glaucoma,)","(Othereyedx,)","(Othnervdx,)","(Hrtvalvedx,)","(Carditis,)","(HTN,)","(Htncomplicn,)",DIED
0,Private,ENGL,CATHOLIC,DIVORCED,WHITE,F,21,57.207630,108.704030,70.841980,...,0,0,0,0,0.0,0,0,0,0,0
1,Private,ENGL,UNOBTAINABLE,SINGLE,WHITE,M,39,79.331400,121.138664,96.915726,...,0,0,0,0,0.0,0,0,0,1,0
2,Medicaid,SPAN,CATHOLIC,WIDOWED,OTHER,M,40,118.230290,175.175430,140.936700,...,0,0,0,0,0.0,0,0,0,0,0
3,Medicare,ENGL,NOT_SPECIFIED,MARRIED,WHITE,M,75,83.780380,89.188980,86.051155,...,0,0,0,0,0.0,0,0,1,0,0
4,Medicare,ENGL,UNOBTAINABLE,MARRIED,UNKNOWN/NOT_SPECIFIED,F,55,60.878613,105.637500,80.616840,...,0,0,0,0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79994,Medicare,ENGL,NOT_SPECIFIED,UNKNOWN_(DEFAULT),WHITE,F,76,56.035793,97.863950,79.709885,...,0,0,0,0,0.0,0,0,1,0,0
79995,Medicaid,ENGL,JEWISH,MARRIED,WHITE,F,38,55.126850,86.027390,67.647910,...,0,0,0,0,1.0,0,0,0,0,0
79996,Private,ENGL,NOT_SPECIFIED,SEPARATED,WHITE,F,90,74.281746,91.267840,91.425380,...,0,0,0,0,0.0,0,0,0,0,0
79997,Government,ENGL,CATHOLIC,MARRIED,BLACK/AFRICAN_AMERICAN,M,57,79.256140,128.995130,98.742550,...,0,0,0,0,0.0,0,0,1,0,0


## Categorical Variables

In [5]:
# Labels of every object-dtype column; these are treated as the categorical features.
categorical_variables = all_data.select_dtypes(include='O').keys()

# For each categorical column build a small table of its (at most) 20 most
# frequent values and their counts. Column labels are 1-tuples, so
# `variable[0]` is the plain feature name used for the display headers.
top_value_tables = []
for variable in categorical_variables:
    counts = all_data[variable].value_counts().sort_values(ascending=False).head(20)
    top_value_tables.append(
        counts.reset_index().rename(
            columns={'index': variable[0], variable[0]: variable[0]+'_COUNT'}
        )
    )

# Place the per-feature tables side by side. Features with fewer distinct values
# are padded with NaN, which is why some count columns render as floats.
categorical_variables_df = pd.concat([pd.DataFrame(), *top_value_tables], axis=1)
categorical_variables_df

Unnamed: 0,INSURANCE,INSURANCE_COUNT,LANGUAGE,LANGUAGE_COUNT,RELIGION,RELIGION_COUNT,MARITAL_STATUS,MARITAL_STATUS_COUNT,ETHNICITY,ETHNICITY_COUNT,GENDER,GENDER_COUNT,ICU,ICU_COUNT
0,Medicare,44670.0,ENGL,75733,CATHOLIC,30384,MARRIED,43661.0,WHITE,59728,M,46425.0,MICU,27780.0
1,Private,23964.0,SPAN,871,NOT_SPECIFIED,15110,SINGLE,18318.0,UNKNOWN/NOT_SPECIFIED,4343,F,33574.0,CSRU,16789.0
2,Medicaid,6434.0,RUSS,582,UNOBTAINABLE,8761,WIDOWED,9375.0,BLACK/AFRICAN_AMERICAN,3718,,,SICU,12803.0
3,Government,2933.0,PTUN,554,PROTESTANT_QUAKER,8706,DIVORCED,5015.0,OTHER,1647,,,CCU,11770.0
4,Self_Pay,1998.0,CANT,306,JEWISH,7399,UNKNOWN_(DEFAULT),2114.0,HISPANIC_OR_LATINO,1429,,,TSICU,10855.0
5,,,PORT,290,OTHER,4643,SEPARATED,1139.0,ASIAN,1416,,,,
6,,,CAPE,220,EPISCOPALIAN,1491,LIFE_PARTNER,376.0,UNABLE_TO_OBTAIN,1384,,,,
7,,,HAIT,129,GREEK_ORTHODOX,846,,,PATIENT_DECLINED_TO_ANSWER,1085,,,,
8,,,MAND,113,CHRISTIAN_SCIENTIST,773,,,ASIAN_-_CHINESE,541,,,,
9,,,ITAL,108,BUDDHIST,475,,,HISPANIC/LATINO_-_PUERTO_RICAN,466,,,,


In [6]:
# Show the labels of the object-dtype columns (they are 1-tuples, not plain strings).
all_data.select_dtypes(include='O').columns

Index([     ('INSURANCE',),       ('LANGUAGE',),       ('RELIGION',),
       ('MARITAL_STATUS',),      ('ETHNICITY',),         ('GENDER',),
                  ('ICU',)],
      dtype='object')

## Label Encoding

In [7]:
# Keep only rows without any missing value.
all_data = all_data.dropna()
label_encoder = LabelEncoder()

# Overwrite each categorical column with integer class codes. The single encoder
# is refitted for every column, so afterwards it only remembers the classes of
# the last column processed.
for column in categorical_variables:
    label_encoder.fit(all_data[column])
    all_data[column] = label_encoder.transform(all_data[column])

## CHI SQUARED Test on Categorical features

In [8]:
# Features: the label-encoded categorical columns; target: the last column of all_data.
X = all_data[categorical_variables]
y = all_data.iloc[:,-1]

# chi2 returns a pair (chi-squared statistics, p-values).
# NOTE(review): the inputs are arbitrary label codes rather than counts or
# indicator columns, so the statistic depends on the code assignment -- confirm
# this is intended (one-hot columns are the usual input for this test).
p_score = chi2(X,y)

# One row per feature, ordered from the smallest p-value upwards.
feat_p_values = (
    pd.DataFrame({'Specs': X.columns, 'P_Value': p_score[1]})
    .sort_values(by=['P_Value'])
    .reset_index(drop=True)
)
# Show only the features significant at the 5% level.
feat_p_values.loc[feat_p_values['P_Value'] < 0.05]

Unnamed: 0,Specs,P_Value
0,"(RELIGION,)",4.1e-05
1,"(GENDER,)",0.000243
2,"(ICU,)",0.000516


## ANOVA Test

In [9]:
# Numeric features only: take the first 164 columns and remove the categorical
# ones. The drop returns a new frame instead of mutating a slice of all_data in
# place, which avoids pandas' view-versus-copy ambiguity (SettingWithCopyWarning).
# NOTE(review): 164 is hard-coded; presumably it is the number of feature columns
# that precede the target -- TODO confirm against all_data.shape.
X = all_data.iloc[:,:164].drop(categorical_variables, axis=1)
y = all_data.iloc[:,-1]

# f_classif returns a pair (F statistics, p-values).
f_score = f_classif(X,y)

# NOTE: the 'F_Value' column stores f_score[1], i.e. the p-values, not the F
# statistics; the column name is kept unchanged for compatibility.
feat_f_values = (
    pd.DataFrame({'Specs': X.columns, 'F_Value': f_score[1]})
    .sort_values(by=['F_Value'])
    .reset_index(drop=True)
)
# Show the features whose p-value is below 5%.
feat_f_values[feat_f_values['F_Value']<0.05]

Unnamed: 0,Specs,F_Value
0,"(RESP_RATE_MEAN_DAY1,)",1.826100e-162
1,"(Adltrespfl,)",3.870471e-146
2,"(RESP_RATE_MAX_DAY1,)",1.176419e-145
3,"(RESP_RATE_MEAN_DAY2,)",1.155879e-141
4,"(RESP_RATE_MAX_DAY2,)",1.319713e-131
...,...,...
92,"(MEAN_BP_MAX_DAY2,)",3.418303e-02
93,"(Epilepsy/cnv,)",3.957386e-02
94,"(Thyroiddsor,)",3.958914e-02
95,"(Mycoses,)",4.561267e-02


## Feature Importance using RandomForestRegressor on all features

In [10]:
# All features (first 164 columns, categorical ones included as label codes).
# NOTE(review): 164 is hard-coded; presumably the number of feature columns that
# precede the target -- TODO confirm against all_data.shape.
X = all_data.iloc[:,:164]
y = all_data.iloc[:,-1]

# Random forests are randomized (bootstrap samples, feature sub-sampling);
# a fixed seed makes the importance ranking reproducible across runs.
model = RandomForestRegressor(random_state=0)
model.fit(X,y)

# One row per feature, ordered from the most to the least important.
feat_importances = (
    pd.DataFrame({'Specs': X.columns, 'Importance': model.feature_importances_})
    .sort_values(by=['Importance'], ascending=False)
    .reset_index(drop=True)
)

In [11]:
# Pull the importance row of every categorical feature out of the overall ranking.
# The row index is preserved, so it shows each feature's rank among all features.
matching_rows = [
    feat_importances[feat_importances['Specs'] == column]
    for column in categorical_variables
]
categorical_features_score_df = pd.concat([pd.DataFrame(), *matching_rows], axis=0)

# Display the rows ordered by importance (the sorted copy is shown, not stored).
categorical_features_score_df.sort_values(by=['Importance'], ascending=False)

Unnamed: 0,Specs,Importance
50,"(RELIGION,)",0.006542
51,"(ETHNICITY,)",0.006326
52,"(ICU,)",0.004196
53,"(MARITAL_STATUS,)",0.004123
54,"(INSURANCE,)",0.003899
56,"(LANGUAGE,)",0.00327
65,"(GENDER,)",0.001619


## Feature Importance using RandomForestRegressor on Categorical features

In [12]:
# Restrict the model to the label-encoded categorical columns only.
X = all_data[categorical_variables]
y = all_data.iloc[:,-1]

# A fixed seed makes the randomized forest, and therefore the ranking below,
# reproducible across runs. Note that this rebinds `model`, `X` and `y`.
model = RandomForestRegressor(random_state=0)
model.fit(X,y)

# Importances are relative to this feature subset, so they are not comparable
# with the values obtained when the model is fitted on all features.
feat_importances_categorical = (
    pd.DataFrame({'Specs': X.columns, 'Importance': model.feature_importances_})
    .sort_values(by=['Importance'], ascending=False)
)
feat_importances_categorical

Unnamed: 0,Specs,Importance
4,"(ETHNICITY,)",0.281217
2,"(RELIGION,)",0.251596
3,"(MARITAL_STATUS,)",0.181429
6,"(ICU,)",0.115966
1,"(LANGUAGE,)",0.074949
5,"(GENDER,)",0.064316
0,"(INSURANCE,)",0.030526


## Mutual Information Classification

In [13]:
# All features (first 164 columns) against the target in the last column.
# NOTE(review): 164 is hard-coded; presumably the number of feature columns that
# precede the target -- TODO confirm against all_data.shape.
X = all_data.iloc[:,:164]
y = all_data.iloc[:,-1]

# The mutual-information estimator is randomized, so the seed is fixed to keep
# the scores reproducible across runs.
# NOTE(review): with the default `discrete_features='auto'` a dense X is treated
# as continuous, including the label-encoded categorical columns -- confirm
# whether a discrete-feature mask should be passed.
mic_score = MIC(X, y, random_state=0)

# One row per feature, ordered from the highest to the lowest score.
feat_mic_score = (
    pd.DataFrame({'Specs': X.columns, 'MIC_Score': mic_score})
    .sort_values(by=['MIC_Score'], ascending=False)
    .reset_index(drop=True)
)
feat_mic_score

Unnamed: 0,Specs,MIC_Score
0,"(LANGUAGE,)",0.016249
1,"(ETHNICITY,)",0.013092
2,"(INSURANCE,)",0.008431
3,"(GENDER,)",0.006778
4,"(MARITAL_STATUS,)",0.006470
...,...,...
159,"(Cardiaarrst,)",0.000000
160,"(ECodes:Placeofoccurrence,)",0.000000
161,"(Prostatecan,)",0.000000
162,"(Mycoses,)",0.000000


In [14]:
# Labels of the categorical features whose chi-squared p-value is below 5%.
z = feat_p_values.loc[feat_p_values['P_Value'] < 0.05, 'Specs'].tolist()
# Remove those labels, leaving the features that were NOT significant.
# NOTE(review): this rebinds `categorical_variables`, so earlier cells that use it
# behave differently if re-run afterwards; also confirm that keeping the
# non-significant features (rather than the significant ones) is intended.
categorical_variables = categorical_variables.drop(z)
categorical_variables

Index([('INSURANCE',), ('LANGUAGE',), ('MARITAL_STATUS',), ('ETHNICITY',)], dtype='object')