In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, make_scorer
from imblearn.metrics import geometric_mean_score

In [2]:
data_path = Path(os.getcwd()).parent.parent / "data" / "dataset_diabetes"
df = pd.read_csv(data_path / "diabetic_preprocessed.csv")

In [3]:
df["age"] = df["age_all"]

columns_to_remove = ['encounter_id', 'patient_nbr', 'readmitted', 'readmit_binary', 'diabetes_type', \
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'race_all', 'age_all']

df_for_experimenting = df.drop(columns=columns_to_remove)

In [4]:
target_variable = "readmit_30_days"
Y= df_for_experimenting.loc[:, target_variable]
X = pd.get_dummies(df_for_experimenting.drop(columns=["readmit_30_days"]))

In [5]:
X.head() # sanity check

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Caucasian,...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,1,41,0,1,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,1,0
1,3,59,0,18,0,0,0,9,0,1,...,1,0,1,0,1,0,1,0,0,1
2,2,11,5,13,2,0,1,6,1,0,...,1,0,1,0,1,0,0,1,0,1
3,2,44,1,16,0,0,0,7,0,1,...,1,0,1,0,1,0,1,0,0,1
4,1,51,0,8,0,0,0,5,0,1,...,1,0,1,0,1,0,1,0,0,1


In [6]:
random_seed = 445
np.random.seed(random_seed)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=random_seed
)

### Multi-layer perceptron

In [7]:
mlp = MLPClassifier()

In [8]:
#Creating a dictionary grid for grid search
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
}

f_two_scorer = make_scorer(fbeta_score, beta=2)

#Fitting grid search to the train data with 5 folds
grid_search = GridSearchCV(estimator=mlp, 
                          param_grid=param_grid,
                          cv=StratifiedKFold(), 
                          n_jobs=-1, 
                          scoring=f_two_scorer,
                          verbose=True).fit(X_train, Y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits




In [9]:
# Best paramete set
print('Best parameters found:\n', grid_search.best_params_)

# All results
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'adam'}
0.000 (+/-0.000) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'solver': 'sgd'}
0.062 (+/-0.016) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'solver': 'adam'}
0.000 (+/-0.000) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'sgd'}
0.057 (+/-0.010) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'adam'}
0.012 (+/-0.005) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'sgd'}
0.077 (+/-0.010) for {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'adam'}
0.000 (+/-0.000) for {'activation': 'logistic', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50, 50), 'solver': 'sgd'}
0.046 (+/-0.017) for {'activation': 'logistic', 'alpha': 0.001, 

In [15]:
optimal_mlp = MLPClassifier(activation='relu', alpha=0.0001, hidden_layer_sizes=(50, 100, 50), solver='adam')
optimal_mlp.fit(X_train, Y_train)

pred_test = optimal_mlp.predict(X_test)



In [16]:
print('The balanced accuracy score for the testing data:', balanced_accuracy_score(Y_test, pred_test))
print('The precision score for the testing data:', precision_score(Y_test, pred_test))
print('The recall score for the testing data:', recall_score(Y_test, pred_test))
print('The F1 score for the testing data:', f1_score(Y_test, pred_test))
print('The F2 score for the testing data:', fbeta_score(Y_test, pred_test, beta=2))
print('The G mean score for the testing data:', geometric_mean_score(Y_test, pred_test))

#Ploting the confusion matrix
confusion_matrix(Y_test, pred_test)

The balanced accuracy score for the testing data: 0.5141361986597481
The precision score for the testing data: 0.1637694419030192
The recall score for the testing data: 0.07881990312637605
The F1 score for the testing data: 0.10642092746730084
The F2 score for the testing data: 0.08794340178834627
The G mean score for the testing data: 0.27356124289708483


array([[17168,   914],
       [ 2092,   179]], dtype=int64)