# BASELINE: MULTILAYER PERCEPTRON (MLP)

## 1. PREPROCESSING

In [3]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pointbiserialr
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
import matplotlib.pyplot as plt
from model_functions import preprocess_diabetes

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, recall_score

In [4]:
file = 'data/diabetes.csv'

df = pd.read_csv(file)
prep_data = preprocess_diabetes(df)

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]

Selected 18 features from 21 (threshold=0.05)


## 2. Hyperparameters tunning

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ----- Pipeline -----
# (Scaling recommended for MLP)
pipeline = Pipeline([               
    ('mlp', MLPClassifier(max_iter=2000))        # raise max_iter to avoid convergence warnings
])

# ----- Hyperparameter grid -----
param_grid = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'mlp__activation': ['relu', 'tanh'],           
    'mlp__solver': ['adam'],                       
    'mlp__alpha': [1e-4, 1e-3],                    
    'mlp__learning_rate': ['constant'],            
    'mlp__learning_rate_init': [0.001, 0.01]       
}

scoring = {
    "accuracy": "accuracy",
    "recall": make_scorer(recall_score, pos_label=1)
}

# ----- GridSearch -----
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,     # you can use f1, roc_auc, etc.
    refit='recall',
    cv=5,
    n_jobs=-1
)

# ----- Fit -----
grid.fit(X_train, y_train)

# ----- Results -----
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)
print("Test accuracy:", grid.score(X_test, y_test))


Best params: {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (50,), 'mlp__learning_rate': 'constant', 'mlp__learning_rate_init': 0.01, 'mlp__solver': 'adam'}
Best CV score: 0.8543703610603576
Test accuracy: 0.7616353090960531


Best params: {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (50,), 'mlp__learning_rate': 'constant', 'mlp__learning_rate_init': 0.01, 'mlp__solver': 'adam'}
Best CV score: 0.8543703610603576
Test accuracy: 0.7616353090960531

## 3. MLP Function

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
import numpy as np

def mlp_cross_validation(X, y,
                         hidden_layer_sizes=(100,50),
                         activation='logistic',
                         solver='adam',
                         alpha=0.01,
                         learning_rate='constant',
                         learning_rate_init=0.1,
                         max_iter=2000,
                         k=30):
    """
    Manual Stratified K-fold cross-validation for an MLP classifier.
    """

    # Convert labels to numeric if string-based
    if y.dtype == object or isinstance(y[0], str):
        y = np.where(y == "Malignant", 1, 0)

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    accuracies, f1s, recalls, aucs = [], [], [], []
    all_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Fold {fold+1}/{k}")

        # ---- SPLIT (pandas OR numpy) ----
        if hasattr(X, "iloc"):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
        else:
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

        # ---- SCALING (inside fold, avoid leakage) ----
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # ---- MODEL ----
        mlp = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            learning_rate_init=learning_rate_init,
            max_iter=max_iter
        )

        mlp.fit(X_train, y_train)

        # ---- PREDICT ----
        y_pred = mlp.predict(X_val)
        y_proba = mlp.predict_proba(X_val)[:, 1]

        # ---- METRICS ----
        accuracies.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred, pos_label=1))
        recalls.append(recall_score(y_val, y_pred, pos_label=1))
        aucs.append(roc_auc_score(y_val, y_proba))

        all_preds.extend(y_pred)

    metrics = {
        "accuracy_mean": np.mean(accuracies),
        "accuracy_std": np.std(accuracies),
        "f1_mean": np.mean(f1s),
        "f1_std": np.std(f1s),
        "recall_mean": np.mean(recalls),
        "recall_std": np.std(recalls),
        "auc_mean": np.mean(aucs),
        "auc_std": np.std(aucs),
    }

    return metrics, np.array(all_preds)


In [8]:
metrics, preds = mlp_cross_validation(
    X_train, y_train.values,
    hidden_layer_sizes=(100,50),
    activation='relu',
    k=30
)

print(metrics)


Fold 1/30
Fold 2/30
Fold 3/30
Fold 4/30
Fold 5/30
Fold 6/30
Fold 7/30
Fold 8/30
Fold 9/30
Fold 10/30
Fold 11/30
Fold 12/30
Fold 13/30
Fold 14/30
Fold 15/30
Fold 16/30
Fold 17/30
Fold 18/30
Fold 19/30
Fold 20/30
Fold 21/30
Fold 22/30
Fold 23/30
Fold 24/30
Fold 25/30
Fold 26/30
Fold 27/30
Fold 28/30
Fold 29/30
Fold 30/30
{'accuracy_mean': np.float64(0.965), 'accuracy_std': np.float64(0.0781706348199127), 'f1_mean': np.float64(0.9579998704998705), 'f1_std': np.float64(0.08053589659898143), 'recall_mean': np.float64(0.9655555555555556), 'recall_std': np.float64(0.06911254017815105), 'auc_mean': np.float64(0.9924691358024691), 'auc_std': np.float64(0.023555348499232322)}
