## Introduction

Professor provided the dataset in the file diabetes.csv altho I found the [same dataset here](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database/data).


In [371]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input # dense is a fully connected layer
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split, GridSearchCV
from scikeras.wrappers import KerasClassifier

In [372]:
data = pd.read_csv('./data/diabetes.csv')
print(data.isnull().sum())
data.head(5)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [373]:
Y = data['Outcome']
X = data.drop('Outcome', axis=1)

In [374]:
col_names = X.columns.values.tolist()
print(f"{(Y==0).sum()} Non-Diabetics")
print(f"{(Y==1).sum()} Diabetics")

500 Non-Diabetics
268 Diabetics


In [375]:
sscaler = StandardScaler()
mmscaler = MinMaxScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('Rest', sscaler, col_names[:-1]),
        ('Age', mmscaler, ['Age'])
    ],
    verbose_feature_names_out = False
)
col_names[:-1]

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction']

## MLP to classify the data

In [376]:
np.random.seed(123)  # for reproducibility
tf.random.set_seed(123)
input_shape= X.shape[1]
print(f"input_shape: {input_shape}, num_classes: {len(np.unique(Y))}")


input_shape: 8, num_classes: 2


In [377]:
def create_model(
    nhid1, nhid2,
    learning_rate= 10**-2,
    loss='BinaryCrossentropy',
    hid_act='relu',
    out_act='sigmoid',
    dropout_rate=0,
    weight_reg=None
    # nhid1: number of hidden neurons in the first hidden layer
    # nhid2: number of hidden neurons in the second hidden layer
    # learning_rate: the learning rate to be used by the optimizer
    # loss: loss function to be used
    # hid_act: activation function for hidden layers
    # out_act: activation function for output layer
    # dropout_rate: the rate of dropout to be used
    ):
        model = Sequential()
        model.add(Input(shape=(input_shape,)))  # Explicit Input layer

        model.add(Dense( # First hidden layer
                    nhid1,
                    activation=hid_act,
                    kernel_regularizer=weight_reg))
        
        # set dropout regularization
        model.add(Dropout(dropout_rate))
        model.add(Dense(nhid2, # Second hidden layer
                        activation=hid_act,
                        kernel_regularizer=weight_reg))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation=out_act)) # Output layer
        model.compile(loss=loss,
                optimizer=SGD(learning_rate=learning_rate),
                metrics=['accuracy'])
        
        return model


In [422]:
# Wrapper class for scikit-learn API
model = KerasClassifier(
    model=create_model,
    nhid1=64, # These are arguments to create_model
    nhid2=32,
    epochs=50)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.1,shuffle=True, stratify=Y)

X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=col_names)
X_test  = pd.DataFrame(preprocessor.transform(X_test),columns=col_names)

In [424]:
# These are the scikit-learn fit and predict methods.
# Learning the model on all data:
from sklearn.metrics import f1_score, accuracy_score

model.fit(X_train, Y_train, verbose=0) # Delete verbose=0 to see the training metrics per epoch
pred = model.predict(X_test)

threshold = 0.5
predicted_labels = (pred >= threshold).astype(int)
# Test performance
print(f"Accuracy: {accuracy_score(Y_test, pred)}")
print(f"F1: {f1_score(Y_test, pred)}")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Accuracy: 0.8441558441558441
F1: 0.7692307692307693


## The Optimizer

In [432]:
model = KerasClassifier(
            model=create_model,
            nhid1=100, # These are arguments to create_model
            nhid2=50,
            epochs=50)

nhid1 = [64, 32]
nhid2 = [32, 16]
lr = [10**-2, 10**-1]

weight_reg = [None, 'l2']
hid_act = ['relu'] # not tuned to reduce the number of combinations
batch_size = [32] # not tuned to reduce the number of combinations
dropout = [0]
loss = ['BinaryCrossentropy'] # not tuned to reduce the number of combinations

# Dictionary names must start with model__, if it is an argument of the model, followed by the model argument name.
param_grid = dict(
                model__nhid1= nhid1,
                model__nhid2= nhid2,
                model__learning_rate= lr,
                model__weight_reg= weight_reg,
                model__hid_act= hid_act,
                batch_size=batch_size,
                model__dropout_rate= dropout,
                model__loss= loss,
                )

## Model Selection

In [433]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, shuffle=True, stratify=Y)

X_train = pd.DataFrame(preprocessor.transform(X_train) ,columns=col_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=col_names)

preprocessor.fit(X_train)

GS = GridSearchCV(
            estimator=model, 
            param_grid=param_grid,
            n_jobs=1,
            scoring='f1',
            cv=3,
            verbose=0
            )

grid_result = GS.fit(X_train, Y_train, verbose=0) 

# Best result
print("Best score: %f using params: %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']

# printing results for all combinations
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4m

In [None]:
print("Holdout test performance")
model = GS.best_estimator_
Y_test_predicted = model.predict(X_test)
threshold = 0.5
Y_test_predicted = (Y_test_predicted >= threshold).astype(int)
print(f"test  F1:{f1_score(Y_test, Y_test_predicted)}")
print(f"test  Accuracy:{accuracy_score(Y_test, Y_test_predicted)}")


Holdout test performance
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
test  F1:0.6153846153846154
test  Accuracy:0.7844827586206896
