# Classification

**Goal:**  
Train classifiers to predict tumor diagnosis, address class imbalance, and evaluate performance using balanced metrics.  
We also validate feature selection by examining model-based importance scores to ensure interpretability and robustness.



## Overview
This notebook builds predictive models on the pruned breast cancer dataset prepared in **Notebook 03 (Preprocessing)**.  
The workflow includes:

- **Model training:** Fit multiple candidate classifiers (e.g., Logistic Regression, Random Forest, Gradient Boosting).  
- **Class imbalance handling:** Apply stratified sampling, SMOTE oversampling, class weighting, and threshold adjustments to balance sensitivity and specificity.  
- **Performance evaluation:** Use accuracy, precision, recall, F1-score, ROC-AUC, and confusion matrices for fair comparison.  
- **Feature validation:** Assess feature importance to confirm that the pruned and engineered predictors contribute meaningfully.  
- **Threshold optimization:** Tune decision thresholds to reflect clinical priorities.   



## Setup


In [None]:
# Mount Google Drive inside the Colab runtime; the data and model paths used
# in this notebook are rooted under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os


from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay, roc_curve, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.base import clone

## Stratified Train-Test Split

In [None]:
# Load the pruned dataset, carve out a stratified hold-out test set, and
# persist that test set for the final evaluation.
file_path = "/content/drive/My Drive/Portfolio/DataSciencePortfolio/Projects/Breast-Cancer/data/preprocessed"
HOLDOUT_SIZE = 114  # absolute number of rows reserved for the hold-out test set

# Load df_pruned
df_pruned = pd.read_csv(f"{file_path}/breast_cancer_pruned.csv")

# Features and labels from pruned dataset
X = df_pruned.drop(columns=['diagnosis'])
y = df_pruned['diagnosis']   # already numeric

# An integer test_size is interpreted as an absolute row count, so the
# hold-out size is exact. The previous fractional form (114 / n) relied on
# floating-point rounding of fraction * n to land on 114.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=HOLDOUT_SIZE, stratify=y, random_state=42
)
assert len(X_test) == HOLDOUT_SIZE, "unexpected hold-out size"
assert len(X_train) + len(X_test) == len(df_pruned), "split lost or duplicated rows"

# Reset index of test sets
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

save_path = "/content/drive/My Drive/Portfolio/DataSciencePortfolio/Projects/Breast-Cancer/models/"
# Create the target folder up front so the dumps below cannot fail on a fresh Drive.
os.makedirs(save_path, exist_ok=True)

# Save test set (os.path.join avoids the doubled slash from the trailing "/")
joblib.dump(X_test, os.path.join(save_path, "X_test.pkl"))
joblib.dump(y_test, os.path.join(save_path, "y_test.pkl"))

print("Final hold-out test set saved")
print("Test size:", len(y_test))
print("Distribution:\n", y_test.value_counts())

Final hold-out test set saved
Test size: 114
Distribution:
 diagnosis
0    72
1    42
Name: count, dtype: int64


## Model Pipelines

In [None]:
# Config
RANDOM_STATE = 42
USE_SMOTE = True          # toggle resampling
CLASS_WEIGHT = "balanced" # None or "balanced"
# NOTE(review): with USE_SMOTE on, the resampled training data should already
# be class-balanced, so CLASS_WEIGHT="balanced" likely adds little — confirm
# whether both are intended at the same time.

# Base models with explicit hyperparameters
lr = LogisticRegression(
    max_iter=2000,
    class_weight=CLASS_WEIGHT,
    solver="liblinear",   # use "lbfgs" if you expand features
    random_state=RANDOM_STATE
)

gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE
)


def build_pipeline(clf, scale=False, use_smote=USE_SMOTE, random_state=RANDOM_STATE):
    """Assemble an imbalanced-learn pipeline around a classifier.

    Parameters
    ----------
    clf : estimator
        Final classifier, registered under the step name "clf".
    scale : bool, default False
        If True, a StandardScaler step ("scaler") is placed first.
    use_smote : bool, default USE_SMOTE
        If True, a SMOTE step ("smote") is added before the classifier.
    random_state : int, default RANDOM_STATE
        Seed passed to SMOTE.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with steps in the order scaler -> smote -> clf.
    """
    steps = []
    if scale:
        # Scale BEFORE oversampling: SMOTE interpolates between nearest
        # neighbours, so unscaled features with large ranges would dominate
        # the distance computation.
        steps.append(("scaler", StandardScaler()))
    if use_smote:
        steps.append(("smote", SMOTE(random_state=random_state)))
    steps.append(("clf", clf))
    return ImbPipeline(steps=steps)


# Logistic Regression needs scaled inputs; Gradient Boosting is tree-based
# and is used on the raw features.
pipelines = {
    "lr": build_pipeline(lr, scale=True),
    "gb": build_pipeline(gb),
}

print("Refined model pipelines defined")
for name, pipe in pipelines.items():
    print(f"{name} pipeline: {pipe}")

Refined model pipelines defined
lr pipeline: Pipeline(steps=[('smote', SMOTE(random_state=42)), ('scaler', StandardScaler()),
                ('clf',
                 LogisticRegression(class_weight='balanced', max_iter=2000,
                                    random_state=42, solver='liblinear'))])
gb pipeline: Pipeline(steps=[('smote', SMOTE(random_state=42)),
                ('clf',
                 GradientBoostingClassifier(n_estimators=200,
                                            random_state=42))])


## Stratified K-Fold Cross-Validation with Threshold Tuning + Save Thresholds

In [None]:
# Cross-validation settings
RANDOM_STATE = 42   # seed for reproducible runs
CV_SPLITS = 5       # number of stratified folds

# Candidate decision thresholds: 9 evenly spaced values from 0.3 to 0.7
THRESHOLDS = np.linspace(start=0.3, stop=0.7, num=9)

# Folder where the tuned thresholds are written
file_path = "/content/drive/My Drive/Portfolio/DataSciencePortfolio/Projects/Breast-Cancer/models"

# Helper: evaluate metrics across thresholds
def evaluate_thresholds(y_true, y_prob, thresholds):
    """Score predicted probabilities at each candidate decision threshold.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth binary labels.
    y_prob : array-like of float
        Predicted probability of the positive class (label 1).
    thresholds : iterable of float
        Cut-offs to evaluate; a sample is predicted positive when
        ``y_prob >= threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``threshold``, ``tn``, ``fp``,
        ``fn``, ``tp``, ``precision``, ``recall``, ``f1``, ``specificity``
        and ``roc_auc``. ``roc_auc`` is identical in every row because it
        does not depend on the threshold; it is NaN when ``y_true`` contains
        a single class (AUC is undefined in that case).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)  # allows plain lists; needed for the >= comparison

    # ROC-AUC is threshold-independent, so compute it once outside the loop.
    if np.unique(y_true).size > 1:
        roc_auc = roc_auc_score(y_true, y_prob)
    else:
        roc_auc = float("nan")

    records = []
    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        # labels=[0, 1] forces a 2x2 matrix even if only one label shows up,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', zero_division=0
        )
        records.append({
            "threshold": t,
            "tn": tn, "fp": fp, "fn": fn, "tp": tp,
            "precision": precision, "recall": recall, "f1": f1,
            "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0.0,
            "roc_auc": roc_auc
        })
    return pd.DataFrame(records)

# Stratified CV loop.
# Cross-validation and threshold tuning run on the TRAINING partition only
# (X_train / y_train from the hold-out split). Tuning on the full X, y would
# leak the saved hold-out rows into the chosen thresholds.
skf = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
cv_results = {}

# Make sure the output folder exists before any threshold is written.
os.makedirs(file_path, exist_ok=True)

# Only run for LR and GB
for name in ["lr", "gb"]:
    pipe = pipelines[name]
    fold_records = []
    # Fold-local names (X_fit / X_val) keep the hold-out variables
    # X_train / X_test / y_train / y_test from being overwritten.
    for fold_idx, (fit_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_fit, X_val = X_train.iloc[fit_idx], X_train.iloc[val_idx]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        # Fit a fresh clone so no state carries over between folds
        pipe_fold = clone(pipe)
        pipe_fold.fit(X_fit, y_fit)

        # Predicted probability of the positive class
        y_prob = pipe_fold.predict_proba(X_val)[:, 1]

        # Threshold tuning: keep the threshold maximizing F1 on this fold.
        # NOTE: the threshold is selected and scored on the same validation
        # fold, so the "best_*" metrics below are optimistic estimates.
        thres_df = evaluate_thresholds(y_val, y_prob, THRESHOLDS)
        best = thres_df.loc[thres_df["f1"].idxmax()]  # idxmax returns a label -> use .loc

        fold_records.append({
            "fold": fold_idx,
            "best_threshold": best["threshold"],
            "best_f1": best["f1"],
            "best_recall": best["recall"],
            "best_precision": best["precision"],
            "best_specificity": best["specificity"],
            "roc_auc": best["roc_auc"]
        })
    cv_results[name] = pd.DataFrame(fold_records)

    # Save mean threshold for this model
    mean_threshold = cv_results[name]["best_threshold"].mean()
    joblib.dump(mean_threshold, f"{file_path}/threshold_{name}.pkl")
    print(f" Saved {name.upper()} mean threshold: {mean_threshold:.2f}")

# Summaries
for name, df in cv_results.items():
    print(f"\nModel: {name}")
    print(df.describe()[[
        "best_threshold","best_f1","best_recall",
        "best_precision","best_specificity","roc_auc"
    ]])

 Saved LR mean threshold: 0.45
 Saved GB mean threshold: 0.39

Model: lr
       best_threshold   best_f1  best_recall  best_precision  \
count            5.00  5.000000     5.000000        5.000000   
mean             0.45  0.973712     0.962348        0.985825   
std              0.10  0.013148     0.026959        0.012942   
min              0.30  0.962963     0.928571        0.976190   
25%              0.40  0.964706     0.953488        0.976190   
50%              0.50  0.964706     0.953488        0.976744   
75%              0.50  0.987952     0.976190        1.000000   
max              0.55  0.988235     1.000000        1.000000   

       best_specificity   roc_auc  
count          5.000000  5.000000  
mean           0.991588  0.993993  
std            0.007679  0.007283  
min            0.985915  0.981481  
25%            0.985915  0.994104  
50%            0.986111  0.996725  
75%            1.000000  0.997988  
max            1.000000  0.999669  

Model: gb
       best_thr

## Ensemble Stacking with Threshold Save

## Save all models and thresholds

In [None]:
save_path = "/content/drive/My Drive/Portfolio/DataSciencePortfolio/Projects/Breast-Cancer/models/"
os.makedirs(save_path, exist_ok=True)

# Take the thresholds from the cross-validation results instead of
# hand-copied, rounded literals, so the saved values always match the CV run.
thresholds = {
    name: float(cv_results[name]["best_threshold"].mean())
    for name in ("lr", "gb")
}

# Re-derive the training partition with the same size, stratification and
# seed as the hold-out split earlier in the notebook. The final models must
# not be fitted on the 114 hold-out rows saved as X_test.pkl / y_test.pkl,
# otherwise evaluating them on that test set is leaked.
X_fit, _, y_fit, _ = train_test_split(
    X, y, test_size=114, stratify=y, random_state=42
)

for name, threshold in thresholds.items():
    print(f"Refitting {name} on training partition...")
    # Fit a clone so the shared pipeline definitions stay unfitted and reusable.
    model = clone(pipelines[name]).fit(X_fit, y_fit)

    # Save model
    joblib.dump(model, os.path.join(save_path, f"model_{name}.pkl"))

    # Save threshold
    joblib.dump(threshold, os.path.join(save_path, f"threshold_{name}.pkl"))

    print(f" Saved: model_{name}.pkl and threshold_{name}.pkl")

print("\nAll models and thresholds saved successfully.")

Refitting lr on full dataset...
 Saved: model_lr.pkl and threshold_lr.pkl
Refitting gb on full dataset...
 Saved: model_gb.pkl and threshold_gb.pkl

All models and thresholds saved successfully.
