# BASELINE: Logistic Regression

## 1. Preprocessing

In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pointbiserialr
from sklearn.metrics import make_scorer, recall_score, confusion_matrix


In [2]:
def preprocess_data(
    file='data/wdbc.data',
    corr_threshold=0.3,
    test_size=0.2,
    random_state=42
):
    # -------------------------------------
    # 1) Load dataset and rename columns
    # -------------------------------------
    df = pd.read_csv(file, header=None)
    cols = ["id", "diagnosis"] + [
        f"{feat}_{stat}"
        for stat in ["mean", "se", "worst"]
        for feat in ["radius", "texture", "perimeter", "area",
                     "smoothness", "compactness", "concavity",
                     "concave_points", "symmetry", "fractal_dimension"]
    ]
    df.columns = cols

    # -------------------------------------
    # 2) Numerical encoding for correlation filtering
    # -------------------------------------
    df["diagnosis"] = df["diagnosis"].map({"B": 0.0, "M": 1.0})

    correlated_columns = ["diagnosis"]
    for col in df.drop(columns=["diagnosis", "id"]).columns:
        r, _ = pointbiserialr(df["diagnosis"], df[col].astype(float))
        if r > corr_threshold:
            correlated_columns.append(col)

    df = df[correlated_columns]

    # Convert back to category labels
    df["diagnosis"] = df["diagnosis"].map({0.0: "Benign", 1.0: "Malignant"}).astype("category")

    # -------------------------------------
    # 3) Split into X and y
    # -------------------------------------
    X = df.drop(columns=["diagnosis"])
    y = df["diagnosis"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # -------------------------------------
    # 4) Scaling
    # -------------------------------------
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # -------------------------------------
    # 5) PCA
    # -------------------------------------
    pca = PCA()
    pca.fit(X_train_scaled)

    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca  = pca.transform(X_test_scaled)

    # -------------------------------------
    # 6) Return everything useful
    # -------------------------------------
    return {
        "df": df,                            
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "X_train_scaled": X_train_scaled,
        "X_test_scaled": X_test_scaled,
        "X_train_pca": X_train_pca,
        "X_test_pca": X_test_pca
    }


In [3]:
prep_data = preprocess_data()

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]
X_train_scaled = prep_data["X_train_scaled"]
X_test_scaled = prep_data["X_test_scaled"]
X_train_pca = prep_data["X_train_pca"]
X_test_pca = prep_data["X_test_pca"]

# 2. Hyperparameters tunning

In [4]:
# Pipeline: scaling + logistic regression
pipe = Pipeline([
    ("logreg", LogisticRegression(max_iter=500))
])

# Hyperparameter grid
param_grid = {
    "logreg__C": [0.01, 0.1, 1, 10, 100],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear", "saga"]    # both support l1/l2
}


scoring = {
    "accuracy": "accuracy",
    "recall": make_scorer(recall_score, pos_label="Malignant")
}
# Grid Search
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=30,
    scoring=scoring,
    refit="accuracy",   # or "recall", choose which one to refit on
    n_jobs=-1
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)
print("Grid Params:", grid.cv_results_)

# Test results
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))




Best params: {'logreg__C': 10, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
Best CV accuracy: 0.9580555555555557
Grid Params: {'mean_fit_time': array([0.01497434, 0.08299166, 0.00425266, 0.0655037 , 0.11667736,
       0.06556834, 0.00346849, 0.04579907, 0.24761492, 0.06779709,
       0.00402274, 0.04288935, 0.16618696, 0.06141868, 0.004724  ,
       0.04459937, 0.88870273, 0.06140546, 0.00500361, 0.04206856]), 'std_fit_time': array([0.00247498, 0.00512271, 0.00098919, 0.01792992, 0.01692833,
       0.00630705, 0.00092703, 0.00435796, 0.05385883, 0.00521453,
       0.00093086, 0.00375064, 0.11069994, 0.00269172, 0.00093383,
       0.00186672, 0.27892013, 0.00334652, 0.00105438, 0.00233456]), 'mean_score_time': array([0.00427875, 0.00456913, 0.00327256, 0.0037645 , 0.00292711,
       0.00315057, 0.00225051, 0.0029099 , 0.00281171, 0.00303221,
       0.00236134, 0.00237025, 0.00268761, 0.0026778 , 0.00240195,
       0.00262473, 0.0022337 , 0.00256521, 0.00225968, 0.00266089]), 

Best params for Linear Regression:

{'logreg__C': 10, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}

## 3. Logistic Regression Function

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
import numpy as np

def logistic_cross_validation(X, y,
                              penalty='l2',
                              C=10,
                              solver='liblinear',
                              max_iter=1000,
                              k=30):
    """
    Manual Stratified K-fold cross-validation for Logistic Regression.

    Parameters:
        X, y : array or DataFrame
        penalty : 'l1', 'l2', etc.
        C : inverse of regularization strength
        solver : optimization algorithm ('lbfgs','liblinear', etc.)
        max_iter : number of iterations
        k : number of folds

    Returns:
        metrics (dict): CV means and stds
        preds (array): concatenated predictions from all folds
    """

    # Convert string labels to numeric if necessary
    if y.dtype == object or isinstance(y[0], str):
        y = np.where(y == "Malignant", 1, 0)

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    accuracies, f1s, recalls, aucs = [], [], [], []
    all_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Fold {fold+1}/{k}")

        # ---- SPLIT (pandas OR numpy) ----
        if hasattr(X, "iloc"):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
        else:
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

        # ---- SCALING inside fold ----
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # ---- MODEL ----
        lr = LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            max_iter=max_iter,
            # class_weight='balanced',   # <â€” optionally uncomment if you want fairness across classes
        )

        lr.fit(X_train, y_train)

        # ---- PREDICT ----
        y_pred = lr.predict(X_val)
        y_proba = lr.predict_proba(X_val)[:, 1]

        # ---- METRICS ----
        accuracies.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred, pos_label=1))
        recalls.append(recall_score(y_val, y_pred, pos_label=1))
        aucs.append(roc_auc_score(y_val, y_proba))

        all_preds.extend(y_pred)

    metrics = {
        "accuracy_mean": np.mean(accuracies),
        "accuracy_std": np.std(accuracies),
        "f1_mean": np.mean(f1s),
        "f1_std": np.std(f1s),
        "recall_mean": np.mean(recalls),
        "recall_std": np.std(recalls),
        "auc_mean": np.mean(aucs),
        "auc_std": np.std(aucs),
    }

    return metrics, np.array(all_preds)


In [6]:
metrics_logreg, preds_logreg = logistic_cross_validation(
    X_train,
    y_train,
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    k=30
)

print(metrics_logreg)


Fold 1/30
Fold 2/30
Fold 3/30
Fold 4/30
Fold 5/30
Fold 6/30
Fold 7/30
Fold 8/30
Fold 9/30
Fold 10/30
Fold 11/30
Fold 12/30
Fold 13/30
Fold 14/30
Fold 15/30
Fold 16/30
Fold 17/30
Fold 18/30
Fold 19/30
Fold 20/30
Fold 21/30
Fold 22/30
Fold 23/30
Fold 24/30
Fold 25/30
Fold 26/30
Fold 27/30
Fold 28/30
Fold 29/30
Fold 30/30
{'accuracy_mean': np.float64(0.9715277777777779), 'accuracy_std': np.float64(0.03684746042139367), 'f1_mean': np.float64(0.9613701113701114), 'f1_std': np.float64(0.04943191611680371), 'recall_mean': np.float64(0.948888888888889), 'recall_std': np.float64(0.07828406646137126), 'auc_mean': np.float64(0.9975308641975308), 'auc_std': np.float64(0.009238660214256645)}
