In [5]:
"""
Evaluating Feature Selection: Accuracy vs Simplicity

We will:
1. Train a baseline model using ALL features
2. Train the SAME model using SELECTED features only
3. Compare:
   - Accuracy / AUC
   - Number of features (complexity)
   - Training time
"""

import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# ---------------------------------------------------
# 1) Recreate SAME dataset (Customer Churn)
# ---------------------------------------------------
# Every draw below consumes the one seeded generator, so the rng.* calls must
# stay in exactly this order for the rows to come out the same on each run.
rng = np.random.default_rng(42)
n = 1200

df = pd.DataFrame({
    # Numeric signals. rng.integers excludes its upper bound (tenure is 1..71);
    # the remaining draws are clipped into fixed ranges.
    "tenure_months": rng.integers(1, 72, size=n),
    "monthly_charges": np.clip(rng.normal(65, 18, size=n), 10, 150),
    "support_tickets_30d": np.clip(rng.poisson(1.2, size=n), 0, 10),
    "avg_weekly_app_minutes": np.clip(rng.normal(75, 30, size=n), 0, 300),
    "late_payments_6m": np.clip(rng.poisson(0.6, size=n), 0, 8),

    # Categorical attributes, sampled with fixed class probabilities.
    "contract_type": rng.choice(
        ["Month-to-month", "One year", "Two year"],
        size=n,
        p=[0.55, 0.25, 0.20],
    ),
    "payment_method": rng.choice(
        ["Card", "UPI", "NetBanking", "Cash"],
        size=n,
        p=[0.35, 0.35, 0.20, 0.10],
    ),
    "internet_service": rng.choice(
        ["Fiber", "DSL", "None"],
        size=n,
        p=[0.55, 0.35, 0.10],
    ),
})

# Target
# Churn follows a logistic model: a weighted sum of the numeric columns plus a
# fixed bump for one level of each categorical column.
numeric_weights = {
    "tenure_months": -0.04,
    "monthly_charges": 0.018,
    "support_tickets_30d": 0.35,
    "late_payments_6m": 0.22,
    "avg_weekly_app_minutes": -0.006,
}
category_bumps = [
    ("contract_type", "Month-to-month", 0.9),
    ("internet_service", "Fiber", 0.25),
    ("payment_method", "Cash", 0.35),
]

# Terms are accumulated strictly left to right (numeric first, then the
# categorical bumps) so the floating-point result does not depend on grouping.
logit = sum(weight * df[column] for column, weight in numeric_weights.items())
logit = sum(
    ((df[column] == level) * bump for column, level, bump in category_bumps),
    logit,
)

# Sigmoid turns the score into a churn probability; one Bernoulli draw per row.
prob = 1 / (1 + np.exp(-logit))
df["churn"] = rng.binomial(1, prob)

# ---------------------------------------------------
# 2) Train-test split
# ---------------------------------------------------
# Column groups, declared once so later preprocessing steps can refer to them.
numeric_features = [
    "tenure_months",
    "monthly_charges",
    "support_tickets_30d",
    "avg_weekly_app_minutes",
    "late_payments_6m",
]
categorical_features = [
    "contract_type",
    "payment_method",
    "internet_service",
]

# Separate the churn label from the predictor columns.
y = df["churn"]
X = df.drop(columns="churn")

# Hold out 25% of the rows for evaluation; stratifying on the label keeps the
# churn rate comparable in both partitions, and the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [6]:
# ---------------------------------------------------
# 3) BASELINE MODEL — ALL FEATURES
# ---------------------------------------------------
# Numeric columns are passed through unchanged; every categorical column is
# one-hot encoded. handle_unknown="ignore" encodes a category that was not
# seen during fit as all zeros instead of raising at predict time.
baseline_preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

baseline_model = Pipeline([
    ("prep", baseline_preprocess),
    ("clf", LogisticRegression(max_iter=800))
])

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted. The timed region covers the whole pipeline fit, i.e.
# fitting the encoder, transforming the training rows and fitting the classifier.
start = time.perf_counter()
baseline_model.fit(X_train, y_train)
baseline_time = time.perf_counter() - start

# Hard labels feed accuracy; the positive-class probability feeds AUC.
baseline_pred = baseline_model.predict(X_test)
baseline_proba = baseline_model.predict_proba(X_test)[:, 1]

baseline_acc = accuracy_score(y_test, baseline_pred)
baseline_auc = roc_auc_score(y_test, baseline_proba)



In [7]:
# ---------------------------------------------------
# 4) FEATURE-SELECTION MODEL — SELECTED FEATURES ONLY
# (Using top features from previous lesson)
# ---------------------------------------------------
# Names refer to columns of the one-hot-expanded frame built below: the five
# numeric columns plus one dummy column per categorical variable.
selected_features = [
    "tenure_months",
    "monthly_charges",
    "support_tickets_30d",
    "late_payments_6m",
    "avg_weekly_app_minutes",
    "contract_type_Month-to-month",
    "payment_method_Cash",
    "internet_service_Fiber"
]


def _attach_onehot(frame, encoded, onehot_names):
    """Return the numeric columns of `frame` joined with its one-hot matrix.

    `encoded` is the dense encoder output for `frame`'s categorical columns and
    `onehot_names` labels its columns. The original row index is reused so the
    two parts align row by row.
    """
    return pd.concat(
        [
            frame[numeric_features],
            pd.DataFrame(encoded, columns=onehot_names, index=frame.index),
        ],
        axis=1,
    )


# Encode categoricals manually
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
selected_model = LogisticRegression(max_iter=800)

# The timed region covers encoding, column assembly and the classifier fit so
# that it measures the same stages as timing a full Pipeline.fit would; timing
# only LogisticRegression.fit would leave the preprocessing cost out.
# perf_counter() is the monotonic clock meant for measuring durations.
start = time.perf_counter()
X_train_cat = encoder.fit_transform(X_train[categorical_features])
cat_feature_names = encoder.get_feature_names_out(categorical_features)
X_train_full = _attach_onehot(X_train, X_train_cat, cat_feature_names)
X_train_sel = X_train_full[selected_features]
selected_model.fit(X_train_sel, y_train)
selected_time = time.perf_counter() - start

# Test rows go through the already-fitted encoder (transform only, no refit).
X_test_cat = encoder.transform(X_test[categorical_features])
X_test_full = _attach_onehot(X_test, X_test_cat, cat_feature_names)
X_test_sel = X_test_full[selected_features]

# Hard labels feed accuracy; the positive-class probability feeds AUC.
selected_pred = selected_model.predict(X_test_sel)
selected_proba = selected_model.predict_proba(X_test_sel)[:, 1]

selected_acc = accuracy_score(y_test, selected_pred)
selected_auc = roc_auc_score(y_test, selected_proba)



In [8]:
# ---------------------------------------------------
# 5) COMPARISON SUMMARY
# ---------------------------------------------------
# Read each feature count from the fitted classifier itself (one coefficient
# per input column) rather than from a helper frame, so the figure always
# reflects what that model was actually trained on.
baseline_n_features = baseline_model.named_steps["clf"].coef_.shape[1]
selected_n_features = selected_model.coef_.shape[1]

print("\n=== MODEL COMPARISON: ACCURACY vs SIMPLICITY ===\n")

print("Baseline Model (All Features)")
print(f"Features used      : {baseline_n_features}")
print(f"Accuracy           : {baseline_acc:.4f}")
print(f"AUC                : {baseline_auc:.4f}")
print(f"Training time (s)  : {baseline_time:.4f}\n")

print("Selected-Feature Model")
print(f"Features used      : {selected_n_features}")
print(f"Accuracy           : {selected_acc:.4f}")
print(f"AUC                : {selected_auc:.4f}")
print(f"Training time (s)  : {selected_time:.4f}\n")

print("Reduction in features:",
      baseline_n_features - selected_n_features)

print("\nEvaluation complete.")



=== MODEL COMPARISON: ACCURACY vs SIMPLICITY ===

Baseline Model (All Features)
Features used      : 15
Accuracy           : 0.7100
AUC                : 0.7733
Training time (s)  : 0.3505

Selected-Feature Model
Features used      : 8
Accuracy           : 0.7233
AUC                : 0.7773
Training time (s)  : 0.1205

Reduction in features: 7

Evaluation complete.
