## 0. Imports, Config, and Loading dataset

In [161]:
# config so we can easily change
balance_treatment_type = 'oversampling' # 'undersampling', or 'oversampling'
target_col = 'diabetes'
random_state = 42
dataset_test_size = 0.2
learning_rate = 0.01
epochs = 500
batch_size = 128
loss_function = 'mse' # 'binary_crossentropy', 'categorical_crossentropy', 'sparse_categorical_crossentropy', 'mse', 'mae', 'hinge'
metrics = ['accuracy'] # 'accuracy', 'precision', 'recall', 'f1_score'
n_neurons = 5
activation_function = 'relu'
optimizer_type = 'adam'  # 'adam', 'sgd', 'rmsprop'

# Early stopping configuration
early_stopping_patience = 20 # number of epochs to wait before stopping the training
min_improvement = 0.02 # should improve at least 2% in the validation loss to continue training
early_stopping_monitor = 'accuracy'


In [162]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from utils.early_stopping import CustomEarlyStopping
import pandas as pd

In [163]:
file_name = "diabetes_prediction_dataset.csv"

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "iammustafatz/diabetes-prediction-dataset",
    file_name,
)

If you want to see details about exploratory data analysis, please take a look in the file [exploratory_data_analysis.ipynb](exploratory_data_analysis.ipynb) instead, because this file will be focused in get a baselines model.

## 1. Cleaning the dataset

In [164]:
df.isnull().sum() # we don't have null values in the dataset, so we don't need to handle them

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [165]:
# Convert the column "blood_glucose_level" to float
df['blood_glucose_level'] = df['blood_glucose_level'].astype(float)

In [166]:
# Transform feature gender in binary (Male == 1, Female = 0)
def is_male(gender: str):
    return 1 if gender.strip().lower() == 'male' else 0
df['gender'] = df['gender'].map(is_male)

In [167]:
# Convert the info smoking_history in information if the patient is a smoker or not (binary) 
df['is_smoker'] = df['smoking_history'].map({'never': 0, 'No Info': 0, 'current': 1, 'former': 1, 'ever': 1, 'not current': 0})
# Remove the column "age" from the dataset
df = df.drop(columns=['smoking_history'])

In [168]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,is_smoker
0,0,80.0,0,1,25.19,6.6,140.0,0,0
1,0,54.0,0,0,27.32,6.6,80.0,0,0
2,1,28.0,0,0,27.32,5.7,158.0,0,0
3,0,36.0,0,0,23.45,5.0,155.0,0,1
4,1,76.0,1,1,20.14,4.8,155.0,0,1
...,...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,27.32,6.2,90.0,0,0
99996,0,2.0,0,0,17.37,6.5,100.0,0,0
99997,1,66.0,0,0,27.83,5.7,155.0,0,1
99998,0,24.0,0,0,35.42,4.0,100.0,0,0


In [169]:
df.dtypes # all the data is in the correct format to proceed with the model

gender                   int64
age                    float64
hypertension             int64
heart_disease            int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level    float64
diabetes                 int64
is_smoker                int64
dtype: object

## 2. Normalizing the features

In [170]:
# Apply normalization to the continuous features to improve the model performance
continuous_features = ['age', 'blood_glucose_level', 'bmi', 'HbA1c_level']

scaler = StandardScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])

In [171]:
df[continuous_features].describe() # normalization was applied correctly

Unnamed: 0,age,blood_glucose_level,bmi,HbA1c_level
count,100000.0,100000.0,100000.0,100000.0
mean,1.433875e-16,-2.842171e-16,-1.70246e-16,-2.415845e-18
std,1.000005,1.000005,1.000005,1.000005
min,-1.856658,-1.42621,-2.60832,-1.893686
25%,-0.7943364,-0.9349053,-0.5561106,-0.6794897
50%,0.04948073,0.04770422,-0.0001155837,0.2545078
75%,0.804475,0.5144437,0.3404125,0.6281067
max,1.692704,3.978142,10.30161,3.2433


## 3. Treating the desbalanceament

In [172]:
def apply_undersampling(df: pd.DataFrame) -> pd.DataFrame:
    # separate the classes
    df_majority = df[df[target_col] == 0]
    df_minority = df[df[target_col] == 1]

    # undersample the majority class
    df_majority_downsampled = resample(
        df_majority,
        replace=False,
        n_samples=len(df_minority),
        random_state=random_state
    )
    
    # combine the downsampled majority class with the minority class and return the new balanced dataset
    return pd.concat([df_majority_downsampled, df_minority])

def apply_oversampling(df: pd.DataFrame) -> pd.DataFrame:
    # separate the classes
    df_majority = df[df[target_col] == 0]
    df_minority = df[df[target_col] == 1]

    # oversample the minority class
    df_minority_oversampled = resample(
        df_minority,
        replace=True,
        n_samples=len(df_majority),
        random_state=random_state
    )
    
    # combine the oversampled minority class with the majority class and return the new balanced dataset
    return pd.concat([df_majority, df_minority_oversampled])
    
if balance_treatment_type == 'undersampling':
    df = apply_undersampling(df)
elif balance_treatment_type == 'oversampling':
    df = apply_oversampling(df)
else:
    raise ValueError(f"Invalid balance treatment type: {balance_treatment_type}")

In [173]:
print('Number of diabetes 0: ', df[target_col].value_counts()[0])
print('Number of diabetes 1: ', df[target_col].value_counts()[1])

Number of diabetes 0:  91500
Number of diabetes 1:  91500


4. Separate the dataset in train/test

In [174]:
# Separate features and target
X = df.drop(columns=[target_col], axis=1)
y = df[target_col]

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=dataset_test_size, random_state=random_state)

## 5. Creating Baseline Neural Network

In [175]:
# Define the optimizer
optimizer = 0
if optimizer_type == 'adam':
    optimizer = Adam(learning_rate=learning_rate)
elif optimizer_type == 'sgd':
    optimizer = SGD(learning_rate=learning_rate)
elif optimizer_type == 'rmsprop':
    optimizer = RMSprop(learning_rate=learning_rate)
else:
    raise ValueError(f"Invalid optimizer type: {optimizer_type}")

In [176]:
# Create the model with the number of neurons in the input layer
model = Sequential([
    Dense(n_neurons, activation=activation_function, input_shape=(X_train.shape[1],)),
    Dense(1, activation='sigmoid')
])

# Compile the model with the optimizer, loss function and metrics
model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=metrics
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [177]:
# Early stopping callback
early_stopping = CustomEarlyStopping(
    monitor=early_stopping_monitor,
    min_improvement=min_improvement,
    patience=early_stopping_patience
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping]
)

Epoch 1/500
[1m1106/1144[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 786us/step - accuracy: 0.8642 - loss: 0.0952Epoch 1: ✅ Improvement detected! accuracy: 0.8838 (best: 0.8838)
[1m1144/1144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 786us/step - accuracy: 0.8838 - loss: 0.0813
Epoch 2/500
[1m1068/1144[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 565us/step - accuracy: 0.8963 - loss: 0.0713Epoch 2: ❌ Insufficient improvement. accuracy: 0.8969 (needs: 0.9038) Wait: 1/20
[1m1144/1144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 571us/step - accuracy: 0.8969 - loss: 0.0707
Epoch 3/500
[1m1083/1144[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 511us/step - accuracy: 0.8996 - loss: 0.0690Epoch 3: ❌ Insufficient improvement. accuracy: 0.8981 (needs: 0.9038) Wait: 2/20
[1m1144/1144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 520us/step - accuracy: 0.8981 - loss: 0.0699
Epoch 4/500
[1m1086/1144[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m

## 6. Analyzing the Baseline neural network metrics

In [178]:
accuracy = history.history['accuracy'][-1]
print(f"Accuracy: {(accuracy * 100):.2f}%")

Accuracy: 90.69%
