In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer #,IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
#import data
data = pd.read_csv('sph6004_assignment1_data.csv')
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50920 entries, 0 to 50919
Data columns (total 162 columns):
 #    Column                  Non-Null Count  Dtype  
---   ------                  --------------  -----  
 0    id                      50920 non-null  int64  
 1    aki                     50920 non-null  int64  
 2    gender                  50920 non-null  object 
 3    admission_age           50920 non-null  float64
 4    race                    50920 non-null  object 
 5    heart_rate_min          50841 non-null  float64
 6    heart_rate_max          50841 non-null  float64
 7    heart_rate_mean         50841 non-null  float64
 8    sbp_min                 50823 non-null  float64
 9    sbp_max                 50823 non-null  float64
 10   sbp_mean                50823 non-null  float64
 11   dbp_min                 50823 non-null  float64
 12   dbp_max                 50823 non-null  float64
 13   dbp_mean                50823 non-null  float64
 14   mbp_min             

Data Preprocessing

In [3]:
#drop ID column as not useful in prediction
df = data[data.columns[1:]]

# Generate percentage missing df
percent_missing = df.isnull().sum()*100/len(df)
percent_missing_df = pd.DataFrame({"Percent Missing": percent_missing})

# Sort the percentage and observe which are missing data
percent_missing_df.sort_values(ascending=False, by="Percent Missing", inplace=True)
pd.set_option('display.max_rows', 500)
percent_missing_df

Unnamed: 0,Percent Missing
thrombin_min,99.821288
thrombin_max,99.821288
d_dimer_min,99.785939
d_dimer_max,99.785939
ggt_max,99.073056
ggt_min,99.073056
globulin_min,98.654753
globulin_max,98.654753
bicarbonate_min,98.332679
bicarbonate_max,98.332679


In [4]:
#drop features where over 50% data missing
df_dropped_features = df.dropna(thresh=0.5*len(df), axis=1)
#df_dropped_features = df_dropped_features.dropna(thresh=0.5*len(df_dropped_features.columns), axis=0)
df_dropped_features

Unnamed: 0,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,...,pt_min,pt_max,ptt_min,ptt_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,weight_admit
0,3,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,116.136364,...,11.9,12.0,28.6,29.5,15.0,6.0,5.0,4.0,0.0,110.0
1,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,109.833333,...,20.0,24.7,28.4,150.0,15.0,6.0,5.0,4.0,0.0,82.0
2,2,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,112.153846,...,12.1,12.1,26.0,26.0,15.0,6.0,5.0,4.0,0.0,62.1
3,2,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,138.160000,...,12.8,12.8,26.1,26.1,15.0,1.0,0.0,1.0,1.0,113.1
4,2,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,97.672131,...,10.4,11.9,27.7,39.4,15.0,,0.0,1.0,1.0,97.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50915,0,M,66.174854,WHITE,46.0,104.0,84.652174,99.0,162.0,130.038462,...,12.3,12.3,29.8,29.8,13.0,6.0,3.0,4.0,0.0,107.7
50916,2,F,57.033913,WHITE,94.0,112.0,102.040000,89.0,108.0,97.500000,...,,,,,15.0,6.0,0.0,2.0,1.0,44.2
50917,0,F,55.399450,WHITE,68.0,162.0,91.666667,83.0,132.0,113.000000,...,9.6,10.0,26.0,26.1,3.0,1.0,1.0,1.0,0.0,80.0
50918,3,F,66.058622,PORTUGUESE,58.0,82.0,65.076923,92.0,152.0,112.916667,...,15.9,15.9,20.3,20.3,14.0,6.0,5.0,3.0,0.0,95.1


Observations in dataset(apart from label, AKI): 
There are 2 binary variables: Gender and gcs_unable
There is 1 categorical variable: Race
There are 5 ordinal features: gcs_min(15max), gcs_motor(6max), gcs_verbal(5max), gcs_eyes(5max)
Other comments regarding gcs:
    When gcs_unable is 1, gcs_verbal is 0 and gcs_min is 15,
    When whenever gcs_verbal is 0, gcs is 15
    When there is at least 1 non-NA among gcs_motor/gcs_verbal/gcs_eyes, then the NA are replaced with max in calculating gcs_min. If all 3 are NA then gcs_min is NA 

In [5]:
#Divide according to type of feature
# Ordinal
ordinal_features = df_dropped_features[['gcs_min','gcs_motor','gcs_verbal','gcs_eyes']]

# Binary
binary_features = df_dropped_features[['gender','gcs_unable']]
binary_features.gender[binary_features['gender'] == 'M'] = 1
binary_features.gender[binary_features['gender'] == 'F'] = 0

# Categorical
cat_features = df_dropped_features[['race']]
cat_features.race[cat_features['race'] == 'UNABLE TO OBTAIN'] = 'UNKNOWN'
cat_features.race[cat_features['race'] == 'PATIENT DECLINED TO ANSWER'] = 'UNKNOWN'

# Dependent,y
y = df_dropped_features[['aki']]

df_numeric = df_dropped_features.drop(columns=['gcs_min','gcs_motor','gcs_verbal','gcs_eyes',
                                              'gender','gcs_unable','race','aki'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binary_features.gender[binary_features['gender'] == 'M'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binary_features.gender[binary_features['gender'] == 'F'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_features.race[cat_features['race'] == 'UNABLE TO OBTAIN'] = 'UNKNOWN'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [6]:
#Dealing with missing data for numeric,binary and ordinal

#impute missing numeric data with simpleimputer using mean, fast
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_mean = imputer_mean.fit_transform(df_numeric) 
df_numeric_imputed = pd.DataFrame(imputed_mean, columns=df_numeric.columns)

# #Alternative: impute missing data with iterativeimputer, takes long
# imputer_iter = IterativeImputer(max_iter=10, random_state=0)
# imputed_iter = imputer_iter.fit_transform(df_numeric)
# df_numeric_imputed = pd.DataFrame(imputed_iter, columns=df_numeric.columns)

#impute missing binary and ordinal
imputer_mostfreq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputed_binary = imputer_mostfreq.fit_transform(binary_features)
binary_imputed = pd.DataFrame(imputed_binary, columns=binary_features.columns)
imputed_ordinal = imputer_mostfreq.fit_transform(ordinal_features)
ordinal_imputed = pd.DataFrame(imputed_ordinal, columns=ordinal_features.columns)

In [7]:
#one hot encoding race
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(cat_features[["race"]])
df_encoded = pd.concat([cat_features,one_hot_encoded],axis=1).drop(columns=["race"])

In [8]:
#normalize numeric data, try StandardScaler or Normalizer
scaler_numeric = StandardScaler()
#scaler_numeric = Normalizer()
numeric_scaled = scaler_numeric.fit_transform(df_numeric_imputed)
scaled_numeric = pd.DataFrame(numeric_scaled, columns=df_numeric_imputed.columns)

#normalize ordinal with minmaxscaler
scaler_ordinal = MinMaxScaler()
ordinal_scaled = scaler_ordinal.fit_transform(ordinal_imputed)
scaled_ordinal = pd.DataFrame(ordinal_scaled, columns=ordinal_imputed.columns)

Split data to training and testing data, convert AKI to binary

In [9]:
#convert to binary outcome: one vs all technique
y['aki'] = np.where(y['aki'] >= 1, 1, 0)

#join numeric,binary,cat and ordinal features
dfs = [scaled_numeric, binary_imputed, scaled_ordinal,df_encoded]
x = dfs[0].join(dfs[1:])

#train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=88)

y.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['aki'] = np.where(y['aki'] >= 1, 1, 0)


aki
1      34060
0      16860
Name: count, dtype: int64

There is potential class imbalance as there are more than twice the number of AKI cases vs non-cases. Try SMOTE

In [10]:
from imblearn.over_sampling import SMOTE

# Use SMOTE to resample minority class.
smote_sampler = SMOTE(random_state=12,sampling_strategy='minority')
x_df_SMOTE, y_df_SMOTE = smote_sampler.fit_resample(x_train, y_train)
y_df_SMOTE.value_counts()

aki
0      23806
1      23806
Name: count, dtype: int64

Initial evaluation using Logistic Regression before feature selection algorithms

In [11]:
#initialize ,L2reg by default
LOG_REG = LogisticRegression(max_iter=500)
LOG_REG.fit(x_train,y_train.values.ravel())
y_pred = LOG_REG.predict(x_test)
print("Logistic Regression before feature selection:")
print("AUC:", roc_auc_score(y_test, y_pred))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred))

Logistic Regression before feature selection:
AUC: 0.6537235578317894
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.42      0.51      5022
           1       0.76      0.89      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.65      0.66     15276
weighted avg       0.72      0.73      0.72     15276



In [12]:
#there is difference in f1 score of 0 vs 1, try SMOTE dataset
#LOG_REG = LogisticRegression(max_iter=500)
LOG_REG.fit(x_df_SMOTE,y_df_SMOTE.values.ravel())
y_pred = LOG_REG.predict(x_test)
print("Logistic Regression on SMOTE data before feature selection:")
print("AUC:", roc_auc_score(y_test, y_pred))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred))

Logistic Regression on SMOTE data before feature selection:
AUC: 0.6894591824060734
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.68      0.59      5022
           1       0.82      0.70      0.75     10254

    accuracy                           0.69     15276
   macro avg       0.67      0.69      0.67     15276
weighted avg       0.72      0.69      0.70     15276



SMOTE does not make a big difference to F1 or AUC 

Feature selection: Try 3 different feature selection techniques(RFE,ExtraTrees and LASSO)
Consider implementing an 'ensemble' method by looking at the intersection between the features

In [13]:
#RFE for feature selection
from sklearn.feature_selection import RFE

#use LR as base algo
#model = LogisticRegression()
rfe = RFE(LOG_REG)
fit = rfe.fit(x_train, y_train.values.ravel())

#get top features from RFE
df_features_RFE = pd.DataFrame({'features': x_train.columns.values, 'rank': fit.ranking_}, columns=['features', 'rank'])
top_features_RFE = df_features_RFE[df_features_RFE['rank'] == 1]

In [14]:
#test performance with features selected by RFE using LR

feature_subset_RFE = top_features_RFE.features.values
x_train_RFE = x_train[feature_subset_RFE].copy()
x_test_RFE =  x_test[feature_subset_RFE].copy()
LOG_REG.fit(x_train_RFE,y_train.values.ravel())
y_pred_RFE = LOG_REG.predict(x_test_RFE)
print("Logistic Regression with RFE features:")
print("AUC:", roc_auc_score(y_test, y_pred_RFE))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_RFE))

Logistic Regression with RFE features:
AUC: 0.6485138882189285
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.41      0.50      5022
           1       0.75      0.89      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.65      0.66     15276
weighted avg       0.72      0.73      0.71     15276



In [15]:
#ExtraTrees for feature selection
from sklearn.ensemble import ExtraTreesClassifier

model_xtrees = ExtraTreesClassifier()
model_xtrees.fit(x_train, y_train.values.ravel())
df_features_Xtree = pd.DataFrame({'features': x_train.columns.values, 'rank': model_xtrees.feature_importances_}, columns=['features', 'rank'])
top_features_Xtree = df_features_Xtree.sort_values('rank',ascending=False)
#as ExtraTrees does not eliminate features, use the length of feature array from RFE as a guide to select top n features
top_features_Xtree = top_features_Xtree.head(57)

In [17]:
#test performance with features selected by ExtraTrees using LR

feature_subset_Xtrees = top_features_Xtree.features.values
x_train_xtrees = x_train[feature_subset_Xtrees].copy()
x_test_xtrees =  x_test[feature_subset_Xtrees].copy()
LOG_REG.fit(x_train_xtrees,y_train.values.ravel())
y_pred_xtrees = LOG_REG.predict(x_test_xtrees)
print("Logistic Regression with ExtraTree features:")
print("AUC:", roc_auc_score(y_test, y_pred_xtrees))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_xtrees))

Logistic Regression with ExtraTree features:
AUC: 0.6502550276734387
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.41      0.50      5022
           1       0.75      0.89      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.65      0.66     15276
weighted avg       0.72      0.73      0.71     15276



In [18]:
#LASSO for feature selection

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold

model_lasso = Lasso(max_iter=1000,tol=0.01) 
#model_lasso.fit(x_train,y_train)
#y_pred_lasso = model.predict(x_test)

# parameters to be tested on GridSearchCV
hyperparams = {'alpha':[0.0001,0.001,0.005,0.01,1]}

# Number of Folds and adding the random state for replication
kf=KFold(n_splits=5,shuffle=True, random_state=42)

# GridSearchCV with model, params and folds.
model_lasso_cv = GridSearchCV(model_lasso, param_grid=hyperparams,scoring='neg_mean_squared_error', cv=kf)
model_lasso_cv.fit(x_train, y_train)


In [19]:
# calling the model with the best parameter
lasso1 = Lasso(alpha = model_lasso_cv.best_params_['alpha'])
lasso1.fit(x_train, y_train)

# Using np.abs() to make coefficients positive.  
lasso1_coef = np.abs(lasso1.coef_)

feature_subset_lasso = np.array(x_train.columns)[lasso1_coef>0]
feature_subset_lasso

array(['admission_age', 'heart_rate_min', 'heart_rate_max',
       'heart_rate_mean', 'sbp_min', 'sbp_max', 'sbp_mean', 'dbp_min',
       'dbp_max', 'dbp_mean', 'mbp_min', 'mbp_max', 'mbp_mean',
       'resp_rate_min', 'resp_rate_max', 'resp_rate_mean',
       'temperature_min', 'temperature_max', 'temperature_mean',
       'spo2_max', 'glucose_min', 'glucose_mean', 'lactate_min',
       'lactate_max', 'ph_min', 'ph_max', 'po2_min', 'po2_max',
       'pco2_max', 'baseexcess_min', 'totalco2_min', 'totalco2_max',
       'hematocrit_max.1', 'hemoglobin_min.1', 'platelets_min',
       'platelets_max', 'wbc_min', 'wbc_max', 'aniongap_min',
       'bicarbonate_min.1', 'bicarbonate_max.1', 'bun_min',
       'calcium_min.1', 'calcium_max.1', 'chloride_min.1',
       'chloride_max.1', 'glucose_min.2', 'sodium_max.1',
       'potassium_max.1', 'abs_basophils_max', 'abs_eosinophils_min',
       'abs_lymphocytes_max', 'abs_monocytes_max', 'abs_neutrophils_min',
       'abs_neutrophils_max', 'inr_m

In [20]:
#test performance with features selected by LASSO using LR

x_train_lasso = x_train[feature_subset_lasso].copy()
x_test_lasso =  x_test[feature_subset_lasso].copy()
LOG_REG.fit(x_train_lasso,y_train.values.ravel())
y_pred_lasso = LOG_REG.predict(x_test_lasso)
print("Logistic Regression with LASSO features:")
print("AUC:", roc_auc_score(y_test, y_pred_lasso))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_lasso))

Logistic Regression with LASSO features:
AUC: 0.6500254934461569
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.41      0.50      5022
           1       0.75      0.89      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.65      0.66     15276
weighted avg       0.72      0.73      0.71     15276



In [46]:
#get intersection of features from 3 techniques

joined_features = np.intersect1d(np.intersect1d(feature_subset_RFE,feature_subset_Xtrees),feature_subset_lasso)
print(list(joined_features))

['admission_age', 'bicarbonate_max.1', 'bicarbonate_min.1', 'bun_min', 'calcium_max.1', 'calcium_min.1', 'chloride_max.1', 'chloride_min.1', 'dbp_mean', 'gcs_eyes', 'gcs_verbal', 'glucose_min', 'heart_rate_max', 'heart_rate_mean', 'heart_rate_min', 'hematocrit_max.1', 'hemoglobin_min.1', 'mbp_mean', 'mbp_min', 'platelets_max', 'platelets_min', 'ptt_max', 'resp_rate_mean', 'resp_rate_min', 'sbp_max', 'sbp_mean', 'sbp_min', 'temperature_max', 'temperature_mean', 'weight_admit']


In [23]:
#test performance with intersection of features from 3 feature selection techniques using LR

x_train_joined = x_train[joined_features].copy()
x_test_joined =  x_test[joined_features].copy()
LOG_REG.fit(x_train_joined,y_train.values.ravel())
y_pred_joined = LOG_REG.predict(x_test_joined)
print("Logistic Regression with features from intersection of 3 feature selections:")
print("AUC:", roc_auc_score(y_test, y_pred_joined))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_joined))

Logistic Regression with features from intersection of 3 feature selections:
AUC: 0.6447446332683879
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.40      0.49      5022
           1       0.75      0.89      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.64      0.65     15276
weighted avg       0.72      0.73      0.71     15276



It appears that reducing the number of features down to 30 does not have a big impact on model performance.

In [47]:
# Forward feature selection.
model = LogisticRegression(penalty='l1',C=1/10,solver='liblinear') # c: 1/(strength of L1 regularization)


forward_selection = SFS(
    model, n_features_to_select=10, direction="forward"
).fit(x_train, y_train.values.ravel())

forward_selection.get_feature_names_out()


array(['admission_age', 'sbp_min', 'sbp_max', 'resp_rate_max',
       'hemoglobin_min.1', 'hemoglobin_max.1', 'bun_max', 'inr_min',
       'weight_admit', 'gcs_verbal'], dtype=object)

In [24]:
#test top 10 from forward feat selection

top10forward = ['admission_age', 'sbp_min', 'sbp_max', 'resp_rate_max',
       'hemoglobin_min.1', 'hemoglobin_max.1', 'bun_max', 'inr_min',
       'weight_admit', 'gcs_verbal']
x_train_top10forward = x_train[top10forward].copy()
x_test_top10forward =  x_test[top10forward].copy()
LOG_REG.fit(x_train_top10forward,y_train.values.ravel())
y_pred_top10forward = LOG_REG.predict(x_test_top10forward)
print("Logistic Regression with features from intersection of 3 feature selections:")
print("AUC:", roc_auc_score(y_test, y_pred_top10forward))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_top10forward))

Logistic Regression with features from intersection of 3 feature selections:
AUC: 0.6328417106335401
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.37      0.47      5022
           1       0.74      0.90      0.81     10254

    accuracy                           0.72     15276
   macro avg       0.69      0.63      0.64     15276
weighted avg       0.71      0.72      0.70     15276



In [25]:
#intersection between top 10 and other 30 features
joined_features_top10 = np.intersect1d(joined_features,top10forward)
joined_features_top10 

array(['admission_age', 'gcs_verbal', 'hemoglobin_min.1', 'sbp_max',
       'sbp_min', 'weight_admit'], dtype=object)

In [219]:
#test top 10with jioned from forward feat selection

x_train_top10forward_1 = x_train[joined_features_top10 ].copy()
x_test_top10forward_1 =  x_test[joined_features_top10 ].copy()
LOG_REG.fit(x_train_top10forward_1,y_train.values.ravel())
y_pred_top10forward_1 = LOG_REG.predict(x_test_top10forward_1)
print("Logistic Regression with features from intersection of 3 feature selections:")
print("AUC:", roc_auc_score(y_test, y_pred_top10forward_1))
print("Classification Report:\n",metrics.classification_report(y_test,y_pred_top10forward_1))

Logistic Regression with features from intersection of 3 feature selections:
AUC: 0.6138170516666399
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.32      0.43      5022
           1       0.73      0.90      0.81     10254

    accuracy                           0.71     15276
   macro avg       0.68      0.61      0.62     15276
weighted avg       0.70      0.71      0.68     15276



In [65]:
#LASSO Reg

L1LR = LogisticRegression(penalty='l1',C=1/10,solver='liblinear')
L1LR.fit(x_df_SMOTE, y_df_SMOTE.values.ravel())
y_pred_L1LR = L1LR.predict(x_test)

print("LASSO Evaluation:")
print("AUC:", roc_auc_score(y_test, y_pred_L1LR))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_L1LR))

LASSO Evaluation:
AUC: 0.6888234192024374
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.69      0.59      5022
           1       0.82      0.69      0.75     10254

    accuracy                           0.69     15276
   macro avg       0.67      0.69      0.67     15276
weighted avg       0.72      0.69      0.70     15276



In [26]:
x_train_joined

Unnamed: 0,admission_age,bicarbonate_max.1,bicarbonate_min.1,bun_min,calcium_max.1,calcium_min.1,chloride_max.1,chloride_min.1,dbp_mean,gcs_eyes,...,platelets_min,ptt_max,resp_rate_mean,resp_rate_min,sbp_max,sbp_mean,sbp_min,temperature_max,temperature_mean,weight_admit
45374,0.370212,0.641547,0.464527,-0.758068,-7.156548e-01,-1.826208e+00,1.187640,1.109794,-0.779107,0.666667,...,0.344740,-1.025013e-01,1.035779,1.316981,-1.551141,-1.076019,-0.869749,0.386880,0.042834,-0.151715
32615,0.144356,0.144025,0.464527,-0.267957,-1.059816e+00,-5.953132e-01,0.856000,0.620273,-0.878925,0.000000,...,-0.699754,-4.578195e-01,-0.340680,-0.373311,-1.198956,-1.124047,-0.869749,-0.721364,-0.435742,0.035220
33695,-0.598418,-0.104735,0.464527,-0.812525,-2.733173e-02,5.124922e-01,-0.636381,-0.032421,-0.792531,1.000000,...,-0.064865,-2.576212e-16,-1.137097,-0.091596,-0.098377,0.247482,-0.185367,-0.636114,-0.056071,-0.377711
32739,0.872953,0.641547,0.902258,-0.104587,-3.714932e-01,-8.414921e-01,0.358539,0.620273,-1.255880,1.000000,...,-0.177507,-4.686966e-01,-1.273554,-0.655026,0.209785,-0.006290,-1.326004,-0.238283,-0.716507,0.077071
50130,0.195053,0.144025,0.026796,0.331067,2.021093e-01,7.586712e-01,-0.304741,-0.685115,-1.315065,1.000000,...,0.672425,-6.209758e-01,-1.480560,-0.936741,-0.098377,0.115965,-0.299430,0.472130,0.729250,0.096601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42558,1.082153,0.144025,-0.848667,0.439981,-1.518698e+00,-1.087671e+00,0.524359,0.130753,-0.537771,1.000000,...,-0.310629,-5.883446e-01,0.142833,1.035266,-0.230447,-0.026665,0.327920,-0.238283,0.178963,-0.578596
24938,1.634424,-0.851017,-0.410936,-0.267957,5.462708e-01,1.127940e+00,-0.470561,-0.521942,-0.047699,0.333333,...,1.942202,-3.816799e-01,0.063177,-0.091596,1.486456,1.571267,-0.299430,1.409875,0.584309,-1.033378
2481,1.198090,-0.353496,0.245661,0.004327,-6.009343e-01,-1.029552e-01,-0.802201,-0.195595,-1.043633,1.000000,...,-0.361830,-5.593390e-01,1.495494,0.471835,-0.934817,-0.717298,0.099792,-0.479823,-0.014595,-0.453043
36815,-2.509024,0.144025,0.245661,-1.030352,-4.862138e-01,-2.260447e-01,0.026899,-0.358768,-0.666667,0.000000,...,0.518823,-3.889313e-01,1.584026,2.162127,-1.198956,-1.024672,-1.097876,2.830700,1.936083,-1.158931


In [27]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train_joined, y_train.values.ravel())
y_pred_tree = tree.predict(x_test_joined)

print("Decision Tree Classifier Evaluation:")
print("AUC:", roc_auc_score(y_test, y_pred_tree))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_tree))

Decision Tree Classifier Evaluation:
AUC: 0.5889007617506961
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.46      0.45      5022
           1       0.73      0.72      0.73     10254

    accuracy                           0.63     15276
   macro avg       0.59      0.59      0.59     15276
weighted avg       0.64      0.63      0.64     15276



In [28]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()
forest.fit(x_train_joined, y_train.values.ravel())
y_pred_forest = forest.predict(x_test_joined)

print("Random Forest Classifier Evaluation:")
print("AUC:", roc_auc_score(y_test, y_pred_forest))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_forest))

Random Forest Classifier Evaluation:
AUC: 0.6391056064841905
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.37      0.48      5022
           1       0.75      0.91      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.64      0.65     15276
weighted avg       0.72      0.73      0.71     15276



In [29]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(x_train_joined, y_train.values.ravel())
y_pred_svm = svm.predict(x_test_joined)

print("SVM Classifier Evaluation:")
print("AUC:", roc_auc_score(y_test, y_pred_svm))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_svm))

SVM Classifier Evaluation:
AUC: 0.6254836239562892
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.34      0.44      5022
           1       0.74      0.92      0.82     10254

    accuracy                           0.72     15276
   macro avg       0.70      0.63      0.63     15276
weighted avg       0.71      0.72      0.69     15276



In [30]:
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(x_train_joined, y_train.values.ravel())
y_pred_svm_rbf = svm_rbf.predict(x_test_joined)

print("SVM Classifier Evaluation:")
print("AUC:", roc_auc_score(y_test, y_pred_svm_rbf))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_svm_rbf))

SVM Classifier Evaluation:
AUC: 0.6388857429883119
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.36      0.47      5022
           1       0.75      0.92      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.72      0.64      0.65     15276
weighted avg       0.73      0.73      0.71     15276



In [38]:
from xgboost import XGBClassifier as XGBC

parameters = {
    'n_estimators':np.arange(start=2,stop=20,step=2),
    'max_depth':np.arange(start=1,stop=6,step=1),
    'learning_rate':np.arange(start=0.05,stop=0.4,step=0.05)}


stratifiedCV = StratifiedKFold(n_splits=8)
# XGBC: XGBoost classifier
XGBoostModel = XGBC()
BestXGBoost = GridSearchCV(
    XGBoostModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestXGBoost.fit(x_train_joined,y_train.values.ravel())

Fitting 8 folds for each of 315 candidates, totalling 2520 fits


In [41]:
y_pred_xgboost = BestXGBoost.predict(x_test_joined)

print("GBM:")
print("AUC:", roc_auc_score(y_test, y_pred_xgboost))
print("Classification Report:\n", metrics.classification_report(y_test,y_pred_xgboost))

GBM:
AUC: 0.6327375463699919
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.35      0.46      5022
           1       0.74      0.91      0.82     10254

    accuracy                           0.73     15276
   macro avg       0.70      0.63      0.64     15276
weighted avg       0.72      0.73      0.70     15276

