In [39]:
# importing required libraries
import numpy as np  # nd-arrays
import pandas as pd  # read data from datasources
from sklearn.preprocessing import LabelEncoder  # Encode categorical variables
from sklearn.preprocessing import StandardScaler  # Standardize the data using mean and std
from sklearn.model_selection import train_test_split  # split the data into train and test
from sklearn.metrics import accuracy_score, confusion_matrix  # evaluate a model
from sklearn.neural_network import MLPClassifier  # build a model using MLP
from sklearn.model_selection import GridSearchCV  # tune the hyperparameters
from sklearn.metrics import average_precision_score, make_scorer, recall_score  # custom scoring functions


In [40]:
# Reading the data into dataframe
diabetes_df = pd.read_excel('data_file.xlsx')
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2703 entries, 0 to 2702
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GENDER    2703 non-null   int64  
 1   AGE       2703 non-null   int64  
 2   Height    2703 non-null   int64  
 3   Weight    2703 non-null   float64
 4   BMI       2703 non-null   float64
 5   BAI       2703 non-null   float64
 6   HBA1C1    2703 non-null   float64
 7   OGTT1FBS  2703 non-null   int64  
 8   NDD       2703 non-null   int64  
dtypes: float64(4), int64(5)
memory usage: 190.2 KB


In [41]:
# Work on a copy of the dataset
clean_df = diabetes_df.copy()

# Prune the data by removing duplicates
clean_df.drop_duplicates(keep='first', inplace=True)

print(clean_df.shape)

(1065, 9)


In [42]:
clean_df.head(1)


Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,HBA1C1,OGTT1FBS,NDD
0,1,37,156,88.0,36.160421,41.53,5.1,102,0


In [43]:
# Assign labels with appropriate numerics
nondia = -1
diabetic = 1

# Create a column to indicate diabetic status based on NDD
Ynew = pd.DataFrame(np.where(clean_df['NDD'] == 1, diabetic, nondia), index=clean_df.index, columns=['diabetic'])

In [44]:
# Identify the diabetic status of each record using the blood test results of FBS or HBA1C1
Ynew.loc[(clean_df.OGTT1FBS >= 126) | (clean_df.HBA1C1 >= 6.5), 'diabetic'] = diabetic

# Concatenate the diabetic status with the anthropometric features of the dataset
data_df = pd.concat([clean_df.iloc[:, :6], Ynew], axis=1)

In [45]:
data_df.head(1)


Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
0,1,37,156,88.0,36.160421,41.53,-1


In [46]:
# Find the diabetic and non-diabetic patients
diabetic_yes = data_df[data_df.diabetic == diabetic]
diabetic_no = data_df[data_df.diabetic == nondia]

In [47]:
diabetic_yes.describe()


Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,0.566547,51.624101,160.210432,68.321942,26.628727,29.641799,1.0
std,0.495998,10.930447,7.564781,12.714347,4.776916,7.802823,0.0
min,0.0,25.0,138.0,35.0,13.671875,8.3,1.0
25%,0.0,43.0,156.0,59.75,23.434014,24.79,1.0
50%,1.0,50.0,159.0,68.0,26.5625,28.145,1.0
75%,1.0,59.0,165.0,76.0,29.585799,33.8725,1.0
max,1.0,80.0,186.0,102.0,41.207076,58.25,1.0


In [48]:
diabetic_no.describe()

Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
count,509.0,509.0,509.0,509.0,509.0,509.0,509.0
mean,0.449902,44.265226,158.180747,68.411591,27.307721,31.775776,-1.0
std,0.497973,11.931332,7.95229,13.186206,4.798944,8.245442,0.0
min,0.0,20.0,139.0,33.5,14.888889,14.08,-1.0
25%,0.0,35.0,153.0,61.0,24.508946,25.65,-1.0
50%,0.0,44.0,158.0,68.0,26.95984,30.34,-1.0
75%,1.0,52.0,162.0,76.0,30.043262,38.26,-1.0
max,1.0,84.0,186.0,110.0,50.219138,59.76,-1.0


In [49]:
train_x, test_x, train_y, test_y = train_test_split(data_df.iloc[:,:6],
 data_df.diabetic,
 test_size=0.3, random_state=43)

In [50]:
sc = StandardScaler()  # creating an instance for StandardScaler class
train_x = sc.fit_transform(train_x)  # estimate mu and sigma for train set and transform
test_x = sc.transform(test_x)  # transform the test set

In [51]:
# fetch the diabetic records from train set
diabetic_yes_train = train_x[list(np.where(train_y == diabetic)[0])]
# fetch the non-diabetic records from train set
diabetic_no_train = train_x[list(np.where(train_y == nondia)[0])]
# display the counts for each class
print('non-diabetic=', diabetic_no_train.shape, 'diabetic=', diabetic_yes_train.shape)

non-diabetic= (347, 6) diabetic= (398, 6)


In [52]:
# fetch the diabetic records from test set
diabetic_yes_test = test_x[list(np.where(test_y == diabetic)[0])]
# fetch the non-diabetic records from test set
diabetic_no_test = test_x[list(np.where(test_y == nondia)[0])]
# display the counts for each class from test set
print('non-diabetic=', diabetic_no_test.shape, 'diabetic=', diabetic_yes_test.shape)

non-diabetic= (162, 6) diabetic= (158, 6)


In [53]:
# Evaluate a model using confusion matrix and accuracy score between true and actual
def evaluate(yt, yp):
    cf = confusion_matrix(yt, yp)  # estimate confusion matrix
    acc = accuracy_score(yt, yp)   # estimate accuracy of the model
    return cf, acc


In [54]:
# Display metrics
def display(yt, yp, model):
    cf, acc = evaluate(yt, yp)
    print('Model =', model, '\ncf =', cf, '\nacc =', acc, '\n')

In [55]:
# Perform Classification using MLP Classifier
mlpc = MLPClassifier(hidden_layer_sizes=1, activation='tanh',
                    learning_rate='invscaling', max_iter=10000,
                    solver='sgd', random_state=0, early_stopping=True)  # create a MLPClassifier instance
mlpc.fit(train_x, train_y)  # fit the model for trainset
train_yp = mlpc.predict(train_x)  # predict the y for train set
test_yp = mlpc.predict(test_x)  # predict the y for test set


In [56]:
# display the results
display(train_y,train_yp,'MLP: Training')
display(test_y,test_yp,'MLP: Testing')

Model = MLP: Training 
cf = [[  0 347]
 [  0 398]] 
acc = 0.5342281879194631 

Model = MLP: Testing 
cf = [[  0 162]
 [  0 158]] 
acc = 0.49375 



In [57]:
mlpc.classes_


array([-1,  1])

In [58]:
mlpc.loss_


np.float64(0.8298330075915282)

In [59]:
mlpc.coefs_


[array([[ 0.09045038],
        [ 0.39957265],
        [ 0.19025382],
        [ 0.08227145],
        [-0.14222192],
        [ 0.26913367]]),
 array([[1.35776874]])]

In [60]:
mlpc.intercepts_


[array([-0.11892174]), array([1.6030463])]

In [61]:
mlpc.n_layers_


3

In [62]:
mlpc.n_iter_


12

In [63]:
mlpc.n_outputs_


1

In [64]:
mlpc.out_activation_


'logistic'

In [65]:
 # recall = tp / (tp + fn) = Sensitivity or True Positive Rate / True Negative Rate
 # precision = tp / (tp + fp) = Positive predictive value
custom_scorer = {'recall':make_scorer(recall_score, pos_label=diabetic),
 'precision':make_scorer(average_precision_score, pos_label=diabetic)}

In [66]:
gscv = GridSearchCV(MLPClassifier(max_iter=10000,random_state=0),
                    { 'activation':('tanh','logistic','relu'),
 'hidden_layer_sizes':range(1,4,1),'solver':['adam','sgd']}, 
cv=5,verbose=False,
 scoring=custom_scorer,refit='recall')
gscv.fit(train_x,train_y)
gscv.best_params_

{'activation': 'logistic', 'hidden_layer_sizes': 1, 'solver': 'adam'}

In [67]:
 #Perform Classification using MLP Classifier
mlpc = MLPClassifier(hidden_layer_sizes=(1),activation='logistic',
 max_iter=10000,
 solver='adam',
 random_state=0) # create a MLPClassifier instance
mlpc.fit(train_x, train_y) # fit the model for trainset
train_yp=mlpc.predict(train_x) # predict the y for train set
test_yp=mlpc.predict(test_x) # predict the y for test set
 # display the results
display(train_y,train_yp,'MLP with "sgd" solver and 1,4 hidden nodes')
display(test_y,test_yp,'For Testing')

Model = MLP with "sgd" solver and 1,4 hidden nodes 
cf = [[  0 347]
 [  0 398]] 
acc = 0.5342281879194631 

Model = For Testing 
cf = [[  0 162]
 [  0 158]] 
acc = 0.49375 



In [68]:
mlpc.coefs_


[array([[ 0.42498821],
        [ 1.14677073],
        [ 0.43369027],
        [-0.09980984],
        [-0.32748723],
        [-0.30029026]]),
 array([[0.54230605]])]

In [69]:
mlpc.intercepts_


[array([-0.53333804]), array([0.04664223])]

In [71]:
mlpc.score(test_x, test_y)


0.49375

## Improving MLP Model Performance

The current accuracy is low. Let's try the following strategies:
- Tune hyperparameters using GridSearchCV
- Try different activation functions, solvers, and hidden layer sizes
- Evaluate with confusion matrix and accuracy
- Optionally, try other classifiers for comparison

In [72]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(1,), (2,), (3,), (4,), (5,), (10,), (20,)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [1000, 5000, 10000]
}
gscv = GridSearchCV(MLPClassifier(random_state=0), param_grid, cv=5, scoring='accuracy', verbose=2)
gscv.fit(train_x, train_y)
print('Best parameters:', gscv.best_params_)
print('Best cross-validated score:', gscv.best_score_)

# Evaluate best model on test set
best_mlp = gscv.best_estimator_
test_pred = best_mlp.predict(test_x)
from sklearn.metrics import confusion_matrix, accuracy_score
print('Test accuracy:', accuracy_score(test_y, test_pred))
print('Confusion matrix:\n', confusion_matrix(test_y, test_pred))

Fitting 5 folds for each of 378 candidates, totalling 1890 fits
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(1,), learning_rate=constant, max_iter=



[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   1.0s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.9s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   1.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.7s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.7s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=t



[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.9s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.9s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.9s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.7s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.8s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] 



[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.9s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.9s




[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.9s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.7s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.7s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=tanh, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=t



[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.8s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=adam; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=constant, max_iter=5000, solver=adam; total time=   0.2s
[CV] END activation=r



[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.8s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=adam; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=1000, solver=sgd; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=invscaling, max_iter=5000, solver=adam; total time=   0.2s
[CV] 



[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.8s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=adam; total time=   0.0s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.2s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=1000, solver=sgd; total time=   0.1s
[CV] END activation=relu, hidden_layer_sizes=(20,), learning_rate=adaptive, max_iter=5000, solver=adam; total time=   0.2s
[CV] END activation=r