In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training data and the unlabelled test data;
# the test rows are ordered by encounter_id right after reading.
train = pd.read_csv('TrainingWiDS2021.csv')
test = pd.read_csv('UnlabeledWiDS2021.csv').sort_values(by='encounter_id')

# Keep the target as a one-column DataFrame, then remove it (together with
# the 'Unnamed: 0' column) from the feature frames.
train_labels = train[['diabetes_mellitus']]
train = train.drop(columns=['Unnamed: 0', 'diabetes_mellitus'])
test = test.drop(columns=['Unnamed: 0'])

# Sanity check on the resulting shapes
print("Train:", train.shape)
print("Train label:", train_labels.shape)
print("Test:", test.shape)

Train: (130157, 179)
Train label: (130157, 1)
Test: (10234, 179)


In [3]:
# Group the train column names by dtype and key the result by the dtype's
# name (e.g. 'int64', 'float64', 'object') instead of the dtype object.
dtype_groups = train.columns.to_series().groupby(train.dtypes).groups
cols_by_dtype = {dtype.name: columns for dtype, columns in dtype_groups.items()}
cols_by_dtype

{'int64': Index(['encounter_id', 'hospital_id', 'elective_surgery', 'icu_id',
        'readmission_status', 'apache_post_operative', 'arf_apache',
        'intubated_apache', 'ventilated_apache', 'aids', 'cirrhosis',
        'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
        'solid_tumor_with_metastasis'],
       dtype='object'),
 'float64': Index(['age', 'bmi', 'height', 'pre_icu_los_days', 'weight', 'albumin_apache',
        'apache_2_diagnosis', 'apache_3j_diagnosis', 'bilirubin_apache',
        'bun_apache',
        ...
        'd1_pao2fio2ratio_max', 'd1_pao2fio2ratio_min', 'h1_arterial_pco2_max',
        'h1_arterial_pco2_min', 'h1_arterial_ph_max', 'h1_arterial_ph_min',
        'h1_arterial_po2_max', 'h1_arterial_po2_min', 'h1_pao2fio2ratio_max',
        'h1_pao2fio2ratio_min'],
       dtype='object', length=157),
 'object': Index(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source',
        'icu_stay_type', 'icu_type'],
       dtype='object')

In [4]:
def miss_val_percent(df, thresh = 70.0):
    """
    Report the columns whose missing-value percentage exceeds a threshold.

    Nothing is removed from ``df``; the caller decides what to do with the
    reported columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    thresh : float, default 70.0
        Percentage (0-100). Only columns strictly above it are reported.

    Returns
    -------
    pandas.Series
        Missing-value percentage indexed by column name, restricted to the
        columns whose percentage is greater than ``thresh``.
    """
    null_counts = df.isnull().sum()
    missing_pct = null_counts.div(len(df)).mul(100)
    above_threshold = missing_pct > thresh
    return missing_pct[above_threshold]

In [5]:
# Columns of train whose missing-value percentage is above the default
# threshold of miss_val_percent.
sparse_features = set(miss_val_percent(train).index)

# Build the retained-feature list from train.columns so the column order is
# deterministic. Indexing a DataFrame with a set gives an arbitrary order and
# is rejected with a TypeError by pandas >= 2.0.
retained_features = [col for col in train.columns if col not in sparse_features]

# Subset train and test to the same retained columns, in the same order
train = train[retained_features]
test = test[retained_features]

# check
print("Train:", train.shape)
print("Test:", test.shape)

Train: (130157, 124)
Test: (10234, 124)


In [6]:
# Recompute the dtype-name -> column-names mapping for the current train columns.
dtype_groups = train.columns.to_series().groupby(train.dtypes).groups
cols_by_dtype = {dtype.name: columns for dtype, columns in dtype_groups.items()}
cols_by_dtype

{'int64': Index(['cirrhosis', 'aids', 'solid_tumor_with_metastasis', 'intubated_apache',
        'leukemia', 'arf_apache', 'lymphoma', 'encounter_id',
        'readmission_status', 'hepatic_failure', 'immunosuppression',
        'apache_post_operative', 'icu_id', 'hospital_id', 'ventilated_apache',
        'elective_surgery'],
       dtype='object'),
 'float64': Index(['h1_temp_max', 'd1_mbp_noninvasive_min', 'heart_rate_apache',
        'd1_heartrate_min', 'h1_diasbp_max', 'd1_diasbp_min', 'd1_spo2_min',
        'h1_mbp_noninvasive_min', 'd1_bun_max', 'd1_platelets_max',
        ...
        'gcs_unable_apache', 'd1_temp_max', 'd1_sodium_min',
        'd1_mbp_noninvasive_max', 'd1_diasbp_noninvasive_max', 'd1_calcium_min',
        'weight', 'h1_sysbp_max', 'd1_resprate_max', 'urineoutput_apache'],
       dtype='object', length=102),
 'object': Index(['ethnicity', 'hospital_admit_source', 'icu_stay_type', 'gender',
        'icu_admit_source', 'icu_type'],
       dtype='object')}

In [7]:
def get_row_miss_percent(df):
    """
    Add a 'miss_percent' column holding each row's missing-value percentage.

    The frame is modified in place and also returned. A 'miss_percent'
    column left over from a previous call is excluded from the calculation,
    so calling the function again on the same frame yields the same values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; it gains (or overwrites) the 'miss_percent' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Ignore a previously computed 'miss_percent' so it is not counted as a
    # feature in the denominator on a second call.
    features = df.drop(columns='miss_percent', errors='ignore')
    ncols = features.shape[1]
    df['miss_percent'] = (features.isnull().sum(axis=1) / ncols) * 100

    return df

In [8]:
# Annotate train and test (in place) with each row's missing-value percentage
for frame in (train, test):
    get_row_miss_percent(frame)

# Keep only the training rows with less than 30% of their values missing;
# no rows are dropped from test.
keep_rows = train['miss_percent'] < 30
train = train.loc[keep_rows]

# check
print("Train:", train.shape)
print("Test:", test.shape)

Train: (119132, 125)
Test: (10234, 125)


In [9]:
# Imputer that replaces missing categorical values with the literal "missing"
cat_imputer = SimpleImputer(missing_values = np.nan, strategy = "constant", fill_value = "missing")

# Take explicit copies of the categorical (object-dtype) columns so the
# in-place assignments below write to independent frames rather than to
# selections of train / test.
train_cat = train[cols_by_dtype['object']].copy()
test_cat = test[cols_by_dtype['object']].copy()

# Fit on the training data only, then apply the same fitted imputer to test
train_cat.iloc[:, :] = cat_imputer.fit_transform(train_cat)
test_cat.iloc[:, :] = cat_imputer.transform(test_cat)

In [10]:

# Median imputer for the numeric columns
cont_imputer = SimpleImputer(missing_values = np.nan, strategy = "median")

# Explicit copies of the int64 and float64 column groups (as classified on
# train), so the in-place assignments below do not write to selections.
train_cont_int = train[cols_by_dtype['int64']].copy()
train_cont_float = train[cols_by_dtype['float64']].copy()
test_cont_int = test[cols_by_dtype['int64']].copy()
test_cont_float = test[cols_by_dtype['float64']].copy()

# The medians are learned from the training data only and then reused for
# test, so both sets are imputed with the same statistics. The imputer is
# re-fitted per column group, so each test transform must directly follow the
# fit of the matching train group.
train_cont_int.iloc[:, :] = cont_imputer.fit_transform(train_cont_int)
test_cont_int.iloc[:, :] = cont_imputer.transform(test_cont_int)

train_cont_float.iloc[:, :] = cont_imputer.fit_transform(train_cont_float)
test_cont_float.iloc[:, :] = cont_imputer.transform(test_cont_float)

In [11]:
# Reassemble the imputed pieces column-wise: categorical, int, then float.
# cols_by_dtype was built before 'miss_percent' was added, so that column is
# not part of any piece and is not carried into new_train / new_test.
train_parts = [train_cat, train_cont_int, train_cont_float]
test_parts = [test_cat, test_cont_int, test_cont_float]

new_train = pd.concat(train_parts, axis="columns")
new_test = pd.concat(test_parts, axis="columns")

# check
print("Train:", new_train.shape)
print("Test:", new_test.shape)

Train: (119132, 124)
Test: (10234, 124)


In [12]:
# Select the labels of the rows that survived the row filter. train.index
# holds index *labels*, so label-based .loc is the matching accessor;
# position-based .iloc is only equivalent while the index is the default
# 0..n-1 range.
new_train_labels = train_labels.loc[train.index]

# check
print("Train labels:", new_train_labels.shape)

Train labels: (119132, 1)


In [13]:
# one hot encode data
new_train = pd.get_dummies(new_train)
new_test = pd.get_dummies(new_test)

# remove cols from train that are not present in test
new_train = new_train[new_test.columns]

# check
print("Train:", new_train.shape)
print("Test:", new_test.shape)

Train: (119132, 156)
Test: (10234, 156)


In [14]:
# Remove encounter_id from the model inputs of both sets
id_column = ["encounter_id"]
new_train = new_train.drop(columns=id_column)
new_test = new_test.drop(columns=id_column)

# Learn the scaling parameters from the training data only
scaler = StandardScaler()
scaler.fit(new_train)

# Apply the train-fitted scaler to both sets. Wrapping the returned arrays in
# a fresh DataFrame replaces the original index and column labels with
# default integer labels.
new_train = pd.DataFrame(scaler.transform(new_train))
new_test = pd.DataFrame(scaler.transform(new_test))

# check
print("Train:", new_train.shape)
print("Test:", new_test.shape)

Train: (119132, 155)
Test: (10234, 155)


In [22]:
# Final fit: the model is trained on the entire labeled dataset
# (hyper-parameters were tuned beforehand on a subset of the training data).
X_train = new_train
y_train = new_train_labels

# XGBoost classifier configuration
xgb_params = dict(
    eval_metric="error",
    learning_rate=0.1,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=0.75,
)
xgb_clf = XGBClassifier(**xgb_params)

# Fit the classifier; as the last expression, the fitted estimator is displayed
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='error',
              gamma=0.5, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=0.75, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [23]:
# Score the unlabeled data: keep column 1 of predict_proba as the score
test_proba = xgb_clf.predict_proba(new_test)
y_score = test_proba[:, 1]

In [24]:
# Assemble the submission: one row per test encounter with its predicted
# score. The frame takes its index from test['encounter_id']; y_score is a
# plain array and is attached by position, so it must be in the same row
# order as test.
submission = pd.DataFrame({
    'encounter_id': test['encounter_id'],
    'diabetes_mellitus': y_score,
})
submission.to_csv('submission.csv', index=False)