# BASELINE: MULTILAYER PERCEPTRON (MLP)

## 1. PREPROCESSING

In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pointbiserialr
from sklearn.metrics import make_scorer, recall_score, confusion_matrix


In [13]:
def preprocess_data(
    file='data/wdbc.data',
    corr_threshold=0.3,
    test_size=0.2,
    random_state=42
):
    # -------------------------------------
    # 1) Load dataset and rename columns
    # -------------------------------------
    df = pd.read_csv(file, header=None)
    cols = ["id", "diagnosis"] + [
        f"{feat}_{stat}"
        for stat in ["mean", "se", "worst"]
        for feat in ["radius", "texture", "perimeter", "area",
                     "smoothness", "compactness", "concavity",
                     "concave_points", "symmetry", "fractal_dimension"]
    ]
    df.columns = cols

    # -------------------------------------
    # 2) Numerical encoding for correlation filtering
    # -------------------------------------
    df["diagnosis"] = df["diagnosis"].map({"B": 0.0, "M": 1.0})

    correlated_columns = ["diagnosis"]
    for col in df.drop(columns=["diagnosis", "id"]).columns:
        r, _ = pointbiserialr(df["diagnosis"], df[col].astype(float))
        if r > corr_threshold:
            correlated_columns.append(col)

    df = df[correlated_columns]

    # Convert back to category labels
    df["diagnosis"] = df["diagnosis"].map({0.0: "Benign", 1.0: "Malignant"}).astype("category")

    # -------------------------------------
    # 3) Split into X and y
    # -------------------------------------
    X = df.drop(columns=["diagnosis"])
    y = df["diagnosis"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # -------------------------------------
    # 4) Scaling
    # -------------------------------------
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # -------------------------------------
    # 5) PCA
    # -------------------------------------
    pca = PCA(n_components= 5)
    pca.fit(X_train_scaled)

    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca  = pca.transform(X_test_scaled)

    # -------------------------------------
    # 6) Return everything useful
    # -------------------------------------
    return {
        "df": df,                            
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "X_train_scaled": X_train_scaled,
        "X_test_scaled": X_test_scaled,
        "X_train_pca": X_train_pca,
        "X_test_pca": X_test_pca
    }


In [14]:
prep_data = preprocess_data()

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]
X_train_scaled = prep_data["X_train_scaled"]
X_test_scaled = prep_data["X_test_scaled"]
X_train_pca = prep_data["X_train_pca"]
X_test_pca = prep_data["X_test_pca"]

## 2. Hyperparameters tunning

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ----- Pipeline -----
# (Scaling recommended for MLP)
pipeline = Pipeline([               
    ('mlp', MLPClassifier(max_iter=2000))        # raise max_iter to avoid convergence warnings
])

# ----- Hyperparameter grid -----
param_grid = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
    'mlp__activation': ['relu', 'tanh', 'logistic'],
    'mlp__solver': ['adam', 'sgd'],
    'mlp__alpha': [1e-5, 1e-4, 1e-3, 1e-2],
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__learning_rate_init': [0.001, 0.01, 0.1]
}

scoring = {
    "accuracy": "accuracy",
    "recall": make_scorer(recall_score, pos_label="Malignant")
}

# ----- GridSearch -----
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,     # you can use f1, roc_auc, etc.
    refit='recall',
    cv=30,
    n_jobs=-1
)

# ----- Fit -----
grid.fit(X_train_pca, y_train)

# ----- Results -----
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)
print("Test accuracy:", grid.score(X_test_pca, y_test))


Best params: {'mlp__activation': 'logistic', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (100, 50), 'mlp__learning_rate': 'constant', 'mlp__learning_rate_init': 0.1, 'mlp__solver': 'adam'}
Best CV score: 0.9711111111111113
Test accuracy: 0.9523809523809523


Best params: {'mlp__activation': 'relu', 'mlp__alpha': 1e-05, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant', 'mlp__learning_rate_init': 0.1, 'mlp__solver': 'adam'}
Best CV score: 0.9711111111111113

## 3. MLP Function

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
import numpy as np

def mlp_cross_validation(X, y,
                         hidden_layer_sizes=(100,50),
                         activation='logistic',
                         solver='adam',
                         alpha=0.01,
                         learning_rate='constant',
                         learning_rate_init=0.1,
                         max_iter=2000,
                         k=5):
    """
    Perform K-fold cross-validation for an MLP model.
    
    Parameters:
        X, y : numpy arrays
        hidden_layer_sizes, activation, solver, alpha, learning_rate, 
        learning_rate_init, max_iter : MLP hyperparameters
        k (int): number of folds

    Returns:
        metrics (dict): mean and std of all metrics
        all_preds (list): predictions for each fold concatenated
    """

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies, f1s, recalls, aucs = [], [], [], []
    all_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"ðŸ”¹ Fold {fold+1}/{k}")

        # ---- SPLIT ----
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # ---- DEFINE MLP ----
        mlp = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            learning_rate_init=learning_rate_init,
            max_iter=max_iter
        )

        # ---- TRAIN ----
        mlp.fit(X_train, y_train)

        # ---- PREDICT ----
        y_pred = mlp.predict(X_val)
        y_proba = mlp.predict_proba(X_val)[:, 1]

        # ---- METRICS ----
        accuracies.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred, pos_label='Malignant'))
        recalls.append(recall_score(y_val, y_pred, pos_label='Malignant'))
        aucs.append(roc_auc_score(y_val, y_proba))

        all_preds.extend(y_pred)

    # ---- AGGREGATE METRICS ----
    metrics = {
        "accuracy_mean": np.mean(accuracies),
        "accuracy_std": np.std(accuracies),
        "f1_mean": np.mean(f1s),
        "f1_std": np.std(f1s),
        "recall_mean": np.mean(recalls),
        "recall_std": np.std(recalls),
        "auc_mean": np.mean(aucs),
        "auc_std": np.std(aucs),
    }

    return metrics, np.array(all_preds)


In [22]:
metrics, preds = mlp_cross_validation(
    X_train_pca, y_train.values,
    hidden_layer_sizes=(100,50),
    activation='relu',
    k=30
)

print(metrics)


ðŸ”¹ Fold 1/30
ðŸ”¹ Fold 2/30
ðŸ”¹ Fold 3/30
ðŸ”¹ Fold 4/30
ðŸ”¹ Fold 5/30
ðŸ”¹ Fold 6/30
ðŸ”¹ Fold 7/30
ðŸ”¹ Fold 8/30
ðŸ”¹ Fold 9/30
ðŸ”¹ Fold 10/30
ðŸ”¹ Fold 11/30
ðŸ”¹ Fold 12/30
ðŸ”¹ Fold 13/30
ðŸ”¹ Fold 14/30
ðŸ”¹ Fold 15/30
ðŸ”¹ Fold 16/30
ðŸ”¹ Fold 17/30
ðŸ”¹ Fold 18/30
ðŸ”¹ Fold 19/30
ðŸ”¹ Fold 20/30
ðŸ”¹ Fold 21/30
ðŸ”¹ Fold 22/30
ðŸ”¹ Fold 23/30
ðŸ”¹ Fold 24/30
ðŸ”¹ Fold 25/30
ðŸ”¹ Fold 26/30
ðŸ”¹ Fold 27/30
ðŸ”¹ Fold 28/30
ðŸ”¹ Fold 29/30
ðŸ”¹ Fold 30/30
{'accuracy_mean': np.float64(0.962638888888889), 'accuracy_std': np.float64(0.05050077159904355), 'f1_mean': np.float64(0.9183636135106724), 'f1_std': np.float64(0.18426060070817363), 'recall_mean': np.float64(0.9073532948532947), 'recall_std': np.float64(0.1897336532952906), 'auc_mean': np.float64(0.9866847041847041), 'auc_std': np.float64(0.03456032195263355)}
