In [None]:
"""
WEEK 8 DEMO (Hotel Bookings Dataset): Filter + Wrapper + Embedded Feature Selection
Dataset: hotel_bookings.csv (GitHub raw)

Concepts covered this week:
1) Filter methods: VarianceThreshold, Mutual Information (MI)
2) Wrapper methods: Sequential Forward Selection (SFS), RFE
3) Embedded methods: L1 Logistic Regression (LASSO-style), Tree-based feature importance
4) Hybrid pipeline: Filter -> Wrapper -> Embedded
5) Compare accuracy/AUC + runtime + selected feature counts

Target used: 'is_canceled' (0/1)
"""

import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    mutual_info_classif,
    SequentialFeatureSelector,
    RFE
)

# -----------------------------
# 1) Load dataset
# -----------------------------
url = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/refs/heads/master/hotel_bookings.csv"
df = pd.read_csv(url)

# Basic cleaning: drop exact duplicate rows (optional).
df = df.drop_duplicates()

# Target
target = "is_canceled"
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found. Available columns: {df.columns.tolist()}")

# A supervised model cannot be trained on rows without a valid label, so
# report how many labels are missing and then drop those rows.
n_missing_target = int(df[target].isna().sum())
print("Rows with missing target:", n_missing_target)
df = df.dropna(subset=[target])

# Leakage guard: in the public hotel-bookings data these columns describe the
# final outcome of a booking (and when that outcome was recorded), i.e. they
# are only known after the target is. Leaving them in lets every model read
# the label directly -- a baseline with Accuracy = AUC = 1.0 is the symptom.
leakage_columns = [
    col for col in ("reservation_status", "reservation_status_date")
    if col in df.columns
]
if leakage_columns:
    print("Dropping leakage columns:", leakage_columns)

y = df[target].astype(int)
X = df.drop(columns=[target, *leakage_columns])

# -----------------------------
# 2) Identify numeric & categorical columns
# -----------------------------
# Split the predictor columns by dtype; each group gets its own preprocessing.
# Object columns that merely look like dates stay categorical in this demo.
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_features = list(X.select_dtypes(include=["object", "bool"]).columns)

for label, n_items in (
    ("Numeric features:", len(numeric_features)),
    ("Categorical features:", len(categorical_features)),
    ("Rows:", len(df)),
):
    print(label, n_items)

# -----------------------------
# 3) Train/Test split
# -----------------------------
# 75/25 split, stratified on the target so both parts keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# -----------------------------
# 4) Preprocessor (impute + encode + scale)
#    - We create a "model-ready matrix" from raw dataframe.
# -----------------------------
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode. Levels unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pipe = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline; columns in neither list are dropped.
column_blocks = [
    ("num", numeric_pipe, numeric_features),
    ("cat", categorical_pipe, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_blocks, remainder="drop")

# -----------------------------
# Utility: feature names after preprocessing
# -----------------------------
def get_feature_names(preprocessor, numeric_features, categorical_features):
    """
    Return the output column names of an already-fitted preprocessor.

    The numeric block keeps one output column per input column, so its names
    are reused unchanged; the categorical names are read from the fitted
    one-hot encoder.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer-like object
        Must expose ``named_transformers_["cat"].named_steps["onehot"]``.
    numeric_features : sequence of str
        Numeric input column names (list, tuple, pandas Index, ...).
    categorical_features : sequence of str
        Categorical input column names, as given to the one-hot encoder.

    Returns
    -------
    list of str
        Numeric names followed by the one-hot names, as a new list.
    """
    # Copy into a plain list so that `+` below is always list concatenation,
    # whatever sequence type the caller passed, and the input is never aliased.
    num_names = list(numeric_features)
    # For categorical columns, ask the fitted encoder for the expanded names.
    ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    cat_names = ohe.get_feature_names_out(categorical_features).tolist()
    return num_names + cat_names

# -----------------------------
# 5) Helper: Evaluate any fitted pipeline/model (Accuracy + AUC + Runtime)
# -----------------------------
def evaluate_pipeline(name, pipe, Xtr, ytr, Xte, yte):
    """
    Fit `pipe` on the training data and score it on the test data.

    Parameters
    ----------
    name : str
        Label stored under "Stage" in the result.
    pipe : estimator or Pipeline
        Must implement ``fit`` and ``predict_proba``.
    Xtr, ytr : training features and labels passed to ``pipe.fit``.
    Xte, yte : test features and labels used for scoring.

    Returns
    -------
    dict
        Keys "Stage", "Accuracy", "AUC" and "Fit_time_sec". Only the call to
        ``fit`` is timed; prediction and scoring are excluded.
    """
    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be distorted by system clock adjustments (unlike time.time()).
    start = time.perf_counter()
    pipe.fit(Xtr, ytr)
    fit_time = time.perf_counter() - start

    # Column 1 of predict_proba is the score of the second class, which is
    # the positive class for a 0/1 target.
    proba = pipe.predict_proba(Xte)[:, 1]
    # Hard labels use a fixed 0.5 cut-off on that score.
    pred = (proba >= 0.5).astype(int)

    return {
        "Stage": name,
        "Accuracy": accuracy_score(yte, pred),
        "AUC": roc_auc_score(yte, proba),
        "Fit_time_sec": fit_time
    }

# ============================================================
# A) BASELINE (no feature selection)
# ============================================================
# Reference model: every preprocessed column goes straight into the classifier.
baseline_steps = [
    ("prep", preprocessor),
    ("clf", LogisticRegression(max_iter=4000, solver="liblinear")),
]
baseline = Pipeline(steps=baseline_steps)

baseline_res = evaluate_pipeline(
    "Baseline (All features)",
    baseline,
    X_train,
    y_train,
    X_test,
    y_test,
)
print("\nBASELINE:", baseline_res)

# ============================================================
# B) FILTER METHODS
#    We demonstrate TWO filter ideas:
#    B1) VarianceThreshold (post-preprocessing)
#    B2) Mutual Information (SelectKBest) (post-preprocessing)
# ============================================================

# --- B1) VarianceThreshold ---
# The variance filter sits AFTER the preprocessor: raw categorical strings
# have no variance until they have been encoded.
filter_var_steps = [
    ("prep", preprocessor),
    # A 0.01 cut-off is meaningful on scaled / one-hot columns.
    ("var", VarianceThreshold(threshold=0.01)),
    ("clf", LogisticRegression(max_iter=4000, solver="liblinear")),
]
filter_var = Pipeline(steps=filter_var_steps)

filter_var_res = evaluate_pipeline(
    "Filter: VarianceThreshold(0.01)",
    filter_var,
    X_train,
    y_train,
    X_test,
    y_test,
)
print("\nFILTER VAR:", filter_var_res)

# --- B2) Mutual Information (MI) SelectKBest ---
# Select top-K features by MI score with the target.
K_MI = 40  # you can tune this (e.g., 20/40/60). Keep moderate for demo.
filter_mi = Pipeline([
    ("prep", preprocessor),
    ("mi", SelectKBest(score_func=mutual_info_classif, k=K_MI)),
    ("clf", LogisticRegression(max_iter=4000, solver="liblinear"))
])

filter_mi_res = evaluate_pipeline(f"Filter: MutualInfo top {K_MI}", filter_mi, X_train, y_train, X_test, y_test)
print("\nFILTER MI:", filter_mi_res)

# Reuse the steps that evaluate_pipeline just fitted instead of fitting a
# second, independent MI selector:
#   * MI scoring is the expensive part of this stage, so it is not repeated;
#   * MI estimates are randomised (no random_state is set), so a second fit
#     could pick a different top-K than the pipeline that was evaluated.
# The pipeline holds its steps by reference, so `fitted_prep` is the same,
# now-fitted object as `preprocessor`.
fitted_prep = filter_mi.named_steps["prep"]
mi_selector = filter_mi.named_steps["mi"]

all_feature_names = get_feature_names(fitted_prep, numeric_features, categorical_features)

# Preprocessed training matrix; the wrapper stage starts from it.
Xtr_mat = fitted_prep.transform(X_train)

# Map the selector's boolean mask back to readable column names.
mi_mask = mi_selector.get_support()
mi_selected_features = [f for f, keep in zip(all_feature_names, mi_mask) if keep]
print("\nTop MI-selected features (sample):", mi_selected_features[:15], "...")

# ============================================================
# C) WRAPPER METHODS
#    C1) Sequential Forward Selection (SFS)
#    C2) RFE (Recursive Feature Elimination)
#
# Note: Wrapper methods can be expensive. We'll use manageable feature sizes.
# Strategy: start from a filtered representation (MI) to reduce dimensionality first.
# ============================================================

# Project train and test onto the MI-selected columns, so the wrappers search
# over K_MI candidates rather than over every one-hot column.
Xtr_mi = mi_selector.transform(Xtr_mat)
Xte_mi = mi_selector.transform(preprocessor.transform(X_test))

# --- C1) SFS ---
# Forward selection adds one feature at a time, keeping the one that gives the
# best cross-validated ROC-AUC, until N_SFS features are chosen.
N_SFS = min(15, Xtr_mi.shape[1])

sfs_estimator = LogisticRegression(max_iter=4000, solver="liblinear")
sfs_settings = {
    "estimator": sfs_estimator,
    "n_features_to_select": N_SFS,
    "direction": "forward",
    "scoring": "roc_auc",
    "cv": 5,
    "n_jobs": -1,
}
sfs = SequentialFeatureSelector(**sfs_settings)

# Time the search itself (selection cost).
start = time.time()
sfs.fit(Xtr_mi, y_train)
sfs_time = time.time() - start

sfs_mask = sfs.get_support()
sfs_selected = [
    feature_name
    for feature_name, is_kept in zip(mi_selected_features, sfs_mask)
    if is_kept
]
print("\nSFS selected features:", sfs_selected)

# Restrict both matrices to the SFS columns and train the final model on them.
Xtr_sfs = Xtr_mi[:, sfs_mask]
Xte_sfs = Xte_mi[:, sfs_mask]

# Time the final model fit separately from the selection above.
start = time.time()
sfs_estimator.fit(Xtr_sfs, y_train)
sfs_fit_time = time.time() - start

proba = sfs_estimator.predict_proba(Xte_sfs)[:, 1]
pred = (proba >= 0.5).astype(int)

sfs_accuracy = accuracy_score(y_test, pred)
sfs_auc = roc_auc_score(y_test, proba)
wrapper_sfs_res = {
    "Stage": f"Wrapper: SFS (from MI-{K_MI} -> {N_SFS})",
    "Accuracy": sfs_accuracy,
    "AUC": sfs_auc,
    "Fit_time_sec": sfs_fit_time,
    "Selection_time_sec": sfs_time,
}
print("\nWRAPPER SFS:", wrapper_sfs_res)

# --- C2) RFE ---
# Recursive elimination: fit, drop the weakest feature, repeat (step=1) until
# only N_RFE of the MI-selected features remain.
N_RFE = min(20, Xtr_mi.shape[1])
rfe_estimator = LogisticRegression(max_iter=4000, solver="liblinear")

rfe = RFE(
    estimator=rfe_estimator,
    n_features_to_select=N_RFE,
    step=1,
)

# Time the elimination itself (selection cost).
start = time.time()
rfe.fit(Xtr_mi, y_train)
rfe_time = time.time() - start

rfe_mask = rfe.get_support()
rfe_selected = [
    feature_name
    for feature_name, is_kept in zip(mi_selected_features, rfe_mask)
    if is_kept
]
print("\nRFE selected features:", rfe_selected)

# Restrict both matrices to the surviving columns.
Xtr_rfe = Xtr_mi[:, rfe_mask]
Xte_rfe = Xte_mi[:, rfe_mask]

# Time the final model fit separately from the selection above.
start = time.time()
rfe_estimator.fit(Xtr_rfe, y_train)
rfe_fit_time = time.time() - start

proba = rfe_estimator.predict_proba(Xte_rfe)[:, 1]
pred = (proba >= 0.5).astype(int)

rfe_accuracy = accuracy_score(y_test, pred)
rfe_auc = roc_auc_score(y_test, proba)
wrapper_rfe_res = {
    "Stage": f"Wrapper: RFE (from MI-{K_MI} -> {N_RFE})",
    "Accuracy": rfe_accuracy,
    "AUC": rfe_auc,
    "Fit_time_sec": rfe_fit_time,
    "Selection_time_sec": rfe_time,
}
print("\nWRAPPER RFE:", wrapper_rfe_res)

# ============================================================
# D) EMBEDDED METHODS
#    D1) L1 Logistic Regression (LASSO-like)
#    D2) RandomForest feature importance
#
# We'll apply embedded methods on the WRAPPER-selected sets (SFS or RFE),
# which is the "Hybrid pipeline" idea: Filter -> Wrapper -> Embedded.
# ============================================================

# --- D1) L1 Logistic Regression on SFS-selected features ---
# Step 1: fit an L1-penalised model on all SFS columns. The penalty can push
# coefficients to exactly zero, which is what performs the selection.
l1 = LogisticRegression(max_iter=6000, solver="liblinear", penalty="l1", C=1.0)

start = time.time()
l1.fit(Xtr_sfs, y_train)
l1_fit_time = time.time() - start

# Step 2: keep only the columns whose coefficient survived the penalty.
coef = l1.coef_.ravel()
final_mask = (coef != 0)
if not final_mask.any():
    # Every coefficient was zeroed: a model cannot be fitted on zero columns,
    # so fall back to the full SFS set rather than crash.
    print("L1 removed every feature; keeping all SFS-selected features instead.")
    final_mask = np.ones_like(final_mask, dtype=bool)
final_features = [f for f, keep in zip(sfs_selected, final_mask) if keep]

Xtr_final = Xtr_sfs[:, final_mask]
Xte_final = Xte_sfs[:, final_mask]

# Step 3: refit on the reduced column set and score that model. `l1` itself
# must not be used on the reduced matrix: it was fitted on all SFS columns and
# would reject an input with fewer features.
l1_final = LogisticRegression(max_iter=6000, solver="liblinear", penalty="l1", C=1.0)
l1_final.fit(Xtr_final, y_train)
proba = l1_final.predict_proba(Xte_final)[:, 1]
pred = (proba >= 0.5).astype(int)

embedded_l1_res = {
    "Stage": f"Embedded: L1 Logistic (Hybrid: MI-{K_MI} -> SFS-{N_SFS} -> L1)",
    "Accuracy": accuracy_score(y_test, pred),
    "AUC": roc_auc_score(y_test, proba),
    # Time of the selecting fit on all SFS columns (step 1).
    "Fit_time_sec": l1_fit_time,
    "Final_features_count": int(final_mask.sum())
}
print("\nEMBEDDED L1:", embedded_l1_res)
print("Final features kept by L1:", final_features)

# --- D2) RandomForest importance on SFS-selected features ---
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
)

start = time.time()
rf.fit(Xtr_sfs, y_train)
rf_time = time.time() - start

# Label each importance with its feature name and rank from most to least
# important; keep the first 15 for display.
rf_importances = pd.Series(rf.feature_importances_, index=sfs_selected)
rf_importances = rf_importances.sort_values(ascending=False)
top_rf = rf_importances.iloc[:15]

# Score the forest itself on the test split (non-linear embedded model).
proba = rf.predict_proba(Xte_sfs)[:, 1]
pred = (proba >= 0.5).astype(int)

rf_accuracy = accuracy_score(y_test, pred)
rf_auc = roc_auc_score(y_test, proba)
embedded_rf_res = {
    "Stage": f"Embedded: RandomForest (on SFS-{N_SFS})",
    "Accuracy": rf_accuracy,
    "AUC": rf_auc,
    "Fit_time_sec": rf_time,
}
print("\nEMBEDDED RF:", embedded_rf_res)
print("\nTop RF important features:\n", top_rf)

# ============================================================
# E) Summary Table (Compare everything)
# ============================================================
# One row per stage: the stage's metrics plus a human-readable feature count.
rows = [
    {**baseline_res, "Num_features": "All (post-prep)"},
    {**filter_var_res, "Num_features": "Reduced (var filter)"},
    {**filter_mi_res, "Num_features": f"Top-{K_MI} (MI)"},
    {**wrapper_sfs_res, "Num_features": f"{N_SFS} (SFS)"},
    {**wrapper_rfe_res, "Num_features": f"{N_RFE} (RFE)"},
    {**embedded_l1_res, "Num_features": embedded_l1_res.get("Final_features_count", "NA")},
    {**embedded_rf_res, "Num_features": f"{N_SFS} (SFS input)"},
]

summary = pd.DataFrame(rows)

# Put the shared columns first; stage-specific extras follow in the order
# they already appear in the frame.
base_cols = ["Stage", "Num_features", "Accuracy", "AUC", "Fit_time_sec"]
extra_cols = [col for col in summary.columns if col not in base_cols]
ordered_cols = base_cols + extra_cols
summary = summary[ordered_cols]

print("\n\n===== FINAL COMPARISON =====")
print(summary)


Numeric features: 20
Categorical features: 12
Rows: 119380

BASELINE: {'Stage': 'Baseline (All features)', 'Accuracy': 1.0, 'AUC': 1.0, 'Fit_time_sec': 2.8037643432617188}

FILTER VAR: {'Stage': 'Filter: VarianceThreshold(0.01)', 'Accuracy': 1.0, 'AUC': 1.0, 'Fit_time_sec': 3.5674855709075928}
