In [19]:
import numpy as np
import pandas as pd
import time

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.feature_selection import VarianceThreshold, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier


In [20]:
# Load the Breast Cancer dataset into pandas objects, then hold out a stratified test split
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Target encoding is dataset-specific: 1 = benign, 0 = malignant
y = pd.Series(data=data.target, name="target")

# Fixed seed + stratification keep the split repeatable and class-balanced
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train shape: {X_train.shape}")
print(f"Test shape : {X_test.shape}")


Train shape: (426, 30)
Test shape : (143, 30)


In [21]:
#Helper: evaluate pipeline (Accuracy + AUC + Runtime)
def evaluate_model(name, model, Xtr, ytr, Xte, yte):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    name : str
        Label stored under the ``"Stage"`` key of the returned dict.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place, so the caller can inspect the fitted object afterwards.
    Xtr, ytr
        Training features and labels passed to ``fit``.
    Xte, yte
        Test features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Stage"``, ``"Accuracy"``, ``"AUC"`` and ``"Fit_time_sec"``
        (seconds spent inside ``fit`` only; prediction is not timed).
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump with
    # system clock adjustments and is too coarse for millisecond-scale fits.
    start = time.perf_counter()
    model.fit(Xtr, ytr)
    fit_time = time.perf_counter() - start

    # Second predict_proba column is taken as the positive-class score; the 0.5
    # cut-off yields integer 0/1 predictions, so labels are expected to be 0/1.
    proba = model.predict_proba(Xte)[:, 1]
    pred = (proba >= 0.5).astype(int)

    acc = accuracy_score(yte, pred)
    auc = roc_auc_score(yte, proba)

    return {
        "Stage": name,
        "Accuracy": acc,
        "AUC": auc,
        "Fit_time_sec": fit_time
    }


In [22]:
# Baseline: scaled logistic regression trained on every feature (no selection applied)
baseline_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="liblinear", max_iter=2000)),
]
baseline_model = Pipeline(steps=baseline_steps)

# Reference score that every later stage is compared against
baseline_result = evaluate_model(
    name="Baseline (All features)",
    model=baseline_model,
    Xtr=X_train, ytr=y_train,
    Xte=X_test, yte=y_test,
)

baseline_result


{'Stage': 'Baseline (All features)',
 'Accuracy': 0.986013986013986,
 'AUC': 0.9976939203354298,
 'Fit_time_sec': 0.012922525405883789}

<h2> Stage 1 — FILTER METHODS (fast noise removal)</h2>

In [25]:
# Stage 1A) Variance Threshold: with threshold=0.0 only constant (zero-variance) columns are removed
var_selector = VarianceThreshold(threshold=0.0)

# Learn the mask from the training split only, then apply it to both splits
var_selector.fit(X_train)
X_train_var = var_selector.transform(X_train)
X_test_var = var_selector.transform(X_test)

# Boolean mask over the original columns -> names of the surviving features
kept_var_mask = var_selector.get_support()
kept_var_features = X_train.columns[kept_var_mask]

print(f"After VarianceThreshold: {X_train_var.shape[1]} features kept")


After VarianceThreshold: 30 features kept


In [26]:
# Stage 1B) Correlation pruning (remove redundant features)
def correlation_prune(df, threshold=0.95):
    """Split the columns of ``df`` into kept and dropped by pairwise correlation.

    A column is dropped when its absolute Pearson correlation with any column
    that appears *earlier* in ``df`` is strictly greater than ``threshold``.
    The rule is greedy: the earlier column counts even if it is itself dropped.

    Correlations are computed over numeric/bool columns only; any other column
    (e.g. strings) is never a drop candidate and is always returned as kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be pruned. It is not modified.
    threshold : float, default 0.95
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    kept : list
        Column labels to retain, in the original column order.
    to_drop : list
        Column labels flagged as redundant, in the original column order.
    """
    # Restrict to dtypes corr() can handle, so a stray text column does not raise
    corr = df.select_dtypes(include=["number", "bool"]).corr().abs()

    # Keep only the strict upper triangle: each pair once, no self-correlation
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

    # NaN entries (masked cells, constant columns) compare False and are ignored
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]

    # Set lookup keeps the filter linear in the number of columns
    drop_lookup = set(to_drop)
    kept = [c for c in df.columns if c not in drop_lookup]
    return kept, to_drop

# VarianceThreshold returned bare arrays; re-attach column names and the original row index
X_train_var_df = pd.DataFrame(data=X_train_var, index=X_train.index, columns=kept_var_features)
X_test_var_df = pd.DataFrame(data=X_test_var, index=X_test.index, columns=kept_var_features)

# Decide what to drop from the training split only, then apply the same choice to the test split
pruning_outcome = correlation_prune(X_train_var_df, threshold=0.95)
kept_corr_features, dropped_corr_features = pruning_outcome

X_train_filter = X_train_var_df.loc[:, kept_corr_features]
X_test_filter = X_test_var_df.loc[:, kept_corr_features]

print(f"After correlation pruning: {X_train_filter.shape[1]} features kept")
print(f"Dropped due to high correlation: {len(dropped_corr_features)}")


After correlation pruning: 23 features kept
Dropped due to high correlation: 7


In [27]:
# Score the same scaled logistic regression on the filter-stage feature subset
filter_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="liblinear", max_iter=2000)),
]
filter_model = Pipeline(steps=filter_steps)

filter_result = evaluate_model(
    name="Stage 1: Filter (Variance + Corr prune)",
    model=filter_model,
    Xtr=X_train_filter, ytr=y_train,
    Xte=X_test_filter, yte=y_test,
)

filter_result


{'Stage': 'Stage 1: Filter (Variance + Corr prune)',
 'Accuracy': 0.9790209790209791,
 'AUC': 0.9962264150943396,
 'Fit_time_sec': 0.009938240051269531}

<h2> Stage 2 — WRAPPER METHOD (performance refinement)</h2>

In [28]:
#Sequential Forward Selection (SFS) with Logistic Regression.
# Scaling is part of the estimator that SFS cross-validates, not a separate pre-processing step.
wrapper_estimator = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, solver="liblinear"))
])

# pick a target number of features (you can tune this)
# SequentialFeatureSelector only accepts an integer in [1, n_features - 1], so cap the
# target one below the number of available columns (a plain min(10, n_features) would
# raise as soon as the filter stage leaves 10 or fewer features).
n_select = min(10, X_train_filter.shape[1] - 1)

sfs = SequentialFeatureSelector(
    estimator=wrapper_estimator,
    n_features_to_select=n_select,
    direction="forward",
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

# perf_counter is monotonic, so the elapsed time cannot be skewed by system clock changes
start = time.perf_counter()
sfs.fit(X_train_filter, y_train)
sfs_time = time.perf_counter() - start

# Translate the boolean support mask back into column names
sfs_mask = sfs.get_support()
sfs_features = X_train_filter.columns[sfs_mask]

X_train_wrap = X_train_filter[sfs_features]
X_test_wrap  = X_test_filter[sfs_features]

print("Wrapper selected features:", len(sfs_features))
print("SFS time (sec):", round(sfs_time, 3))
print("Selected features:\n", list(sfs_features))


Wrapper selected features: 10
SFS time (sec): 20.266
Selected features:
 ['mean radius', 'mean symmetry', 'radius error', 'compactness error', 'concave points error', 'fractal dimension error', 'worst texture', 'worst smoothness', 'worst concave points', 'worst symmetry']


In [29]:
# Score the scaled logistic regression on the features chosen by the wrapper stage
wrapper_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="liblinear", max_iter=2000)),
]
wrapper_model = Pipeline(steps=wrapper_steps)

n_wrapper_features = len(sfs_features)
wrapper_result = evaluate_model(
    name=f"Stage 2: Wrapper (SFS top {n_wrapper_features})",
    model=wrapper_model,
    Xtr=X_train_wrap, ytr=y_train,
    Xte=X_test_wrap, yte=y_test,
)

# The search itself dominates the cost of this stage, so record it next to the fit time
wrapper_result.update({"Selection_time_sec": sfs_time})
wrapper_result


{'Stage': 'Stage 2: Wrapper (SFS top 10)',
 'Accuracy': 0.972027972027972,
 'AUC': 0.9958071278825996,
 'Fit_time_sec': 0.007886648178100586,
 'Selection_time_sec': 20.265742540359497}

<h2>Stage 3 — EMBEDDED METHOD (final selection)</h2>

In [30]:
# Stage 3: L1-penalised logistic regression on the wrapper-selected features.
# The L1 penalty can drive coefficients to exactly zero, which acts as a final selection.
embedded_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=3000,
        solver="liblinear",   # supports L1
        penalty="l1",
        C=1.0,                # inverse of regularization strength (tune if needed)
        random_state=42       # liblinear shuffles the data; seed it so reruns give the same coefficients
    ))
])

embedded_result = evaluate_model(
    "Stage 3: Embedded (L1 Logistic Regression)",
    embedded_model,
    X_train_wrap, y_train,
    X_test_wrap, y_test
)

embedded_result


{'Stage': 'Stage 3: Embedded (L1 Logistic Regression)',
 'Accuracy': 0.965034965034965,
 'AUC': 0.9955974842767296,
 'Fit_time_sec': 0.0094451904296875}

In [31]:
#Inspect which features survived L1 (non-zero coefficients)
# evaluate_model() fits the pipeline in place, so embedded_model normally already holds the
# coefficients of the exact model whose metrics were reported. Reuse them instead of refitting
# (a second fit of an unseeded liblinear solver is not guaranteed to reproduce the same zeros).
# Fit here only if the pipeline has not been fitted yet.
if not hasattr(embedded_model.named_steps["clf"], "coef_"):
    embedded_model.fit(X_train_wrap, y_train)

# coef_ has shape (1, n_features) for a binary problem; flatten to align with the columns
coef = embedded_model.named_steps["clf"].coef_.ravel()
final_features = X_train_wrap.columns[coef != 0]
dropped_features = X_train_wrap.columns[coef == 0]

print("Final features kept by L1:", len(final_features))
print("Kept:", list(final_features))
print("Dropped:", list(dropped_features))


Final features kept by L1: 7
Kept: ['mean radius', 'radius error', 'compactness error', 'worst texture', 'worst smoothness', 'worst concave points', 'worst symmetry']
Dropped: ['mean symmetry', 'concave points error', 'fractal dimension error']


In [32]:
#Final comparison: one row per stage, tagged with the number of features behind that stage

stage_outcomes = [
    (baseline_result, X_train.shape[1]),
    (filter_result, X_train_filter.shape[1]),
    (wrapper_result, X_train_wrap.shape[1]),
    (embedded_result, len(final_features)),
]
results = [
    {**outcome, "Num_features": n_features}
    for outcome, n_features in stage_outcomes
]

df_results = pd.DataFrame(results)

# Headline columns first; stage-specific extras (e.g. selection time) trail behind
cols = ["Stage", "Num_features", "Accuracy", "AUC", "Fit_time_sec"]
extra_cols = [name for name in df_results.columns if name not in cols]
df_results = df_results.loc[:, cols + extra_cols]

df_results



Unnamed: 0,Stage,Num_features,Accuracy,AUC,Fit_time_sec,Selection_time_sec
0,Baseline (All features),30,0.986014,0.997694,0.012923,
1,Stage 1: Filter (Variance + Corr prune),23,0.979021,0.996226,0.009938,
2,Stage 2: Wrapper (SFS top 10),10,0.972028,0.995807,0.007887,20.265743
3,Stage 3: Embedded (L1 Logistic Regression),7,0.965035,0.995597,0.009445,
