In [11]:
"""
Practical Demo: Filter-Based Feature Selection using scikit-learn

Steps demonstrated:
1. Load dataset (Customer Churn)
2. Split into train/test (avoid data leakage)
3. Apply filter methods:
   - Correlation (numeric)
   - Chi-square (categorical)
   - Mutual Information (general)
4. Rank features
5. Select top features
6. Prepare clean feature set for modeling
"""

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import (
    SelectKBest,
    chi2,
    mutual_info_classif
)

In [12]:
# -----------------------------
# 1) Create a simple "Customer Churn" dataset (synthetic but realistic)
# -----------------------------
rng = np.random.default_rng(42)
n = 1200

df = pd.DataFrame({
    # numeric
    "tenure_months": rng.integers(1, 72, size=n),
    "monthly_charges": rng.normal(65, 18, size=n).clip(10, 150),
    "support_tickets_30d": rng.poisson(1.2, size=n).clip(0, 10),
    "avg_weekly_app_minutes": rng.normal(75, 30, size=n).clip(0, 300),
    "late_payments_6m": rng.poisson(0.6, size=n).clip(0, 8),

    # categorical
    "contract_type": rng.choice(["Month-to-month", "One year", "Two year"], size=n, p=[0.55, 0.25, 0.20]),
    "payment_method": rng.choice(["Card", "UPI", "NetBanking", "Cash"], size=n, p=[0.35, 0.35, 0.20, 0.10]),
    "internet_service": rng.choice(["Fiber", "DSL", "None"], size=n, p=[0.55, 0.35, 0.10]),
})

# Create churn probability (ground-truth signal) â€” higher churn for:
# - short tenure, high charges, more tickets, month-to-month contracts, more late payments, low app usage
logit = (
    -0.04 * df["tenure_months"]
    + 0.018 * df["monthly_charges"]
    + 0.35 * df["support_tickets_30d"]
    + 0.22 * df["late_payments_6m"]
    - 0.006 * df["avg_weekly_app_minutes"]
)

logit += (df["contract_type"] == "Month-to-month") * 0.9
logit += (df["internet_service"] == "Fiber") * 0.25
logit += (df["payment_method"] == "Cash") * 0.35

prob = 1 / (1 + np.exp(-logit))
df["churn"] = rng.binomial(1, prob)

In [13]:
# -----------------------------
# 2) Separate features & target
# -----------------------------
X = df.drop(columns=["churn"])
y = df["churn"]

numeric_features = [
    "tenure_months",
    "monthly_charges",
    "support_tickets_30d",
    "avg_weekly_app_minutes",
    "late_payments_6m"
]

categorical_features = [
    "contract_type",
    "payment_method",
    "internet_service"
]


In [14]:
# -----------------------------
# 3) Train-test split (IMPORTANT)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [15]:
# -----------------------------
# 4) Correlation (Numeric features)
# -----------------------------
corr_df = pd.concat([X_train[numeric_features], y_train], axis=1)
corr_scores = corr_df.corr()["churn"].drop("churn").abs()

print("\nCorrelation with Target (Numeric Features):")
print(corr_scores.sort_values(ascending=False))


Correlation with Target (Numeric Features):
tenure_months             0.355018
support_tickets_30d       0.193508
monthly_charges           0.087473
avg_weekly_app_minutes    0.085323
late_payments_6m          0.056418
Name: churn, dtype: float64


In [16]:
# -----------------------------
# 5) Encode categorical features
#    (Required for Chi-square & MI)
# -----------------------------
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_cat_encoded = encoder.fit_transform(X_train[categorical_features])
cat_feature_names = encoder.get_feature_names_out(categorical_features)

X_cat_encoded = pd.DataFrame(
    X_cat_encoded, columns=cat_feature_names, index=X_train.index
)


In [18]:
# -----------------------------
# 6) Chi-Square Test (Categorical)
# -----------------------------
chi_selector = SelectKBest(score_func=chi2, k="all")
chi_selector.fit(X_cat_encoded, y_train)

chi_scores = pd.Series(
    chi_selector.scores_, index=cat_feature_names
).sort_values(ascending=False)

print("\nChi-Square Scores (Categorical Features):")
print(chi_scores)



Chi-Square Scores (Categorical Features):
contract_type_Month-to-month    7.386843
payment_method_Cash             5.945496
contract_type_One year          4.977368
contract_type_Two year          3.733026
payment_method_NetBanking       2.642784
internet_service_None           2.341569
internet_service_DSL            1.982481
internet_service_Fiber          0.215464
payment_method_UPI              0.056575
payment_method_Card             0.013851
dtype: float64


In [19]:
# -----------------------------
# 7) Mutual Information (All features)
# -----------------------------
X_all = pd.concat(
    [X_train[numeric_features], X_cat_encoded], axis=1
)

mi_scores = mutual_info_classif(
    X_all, y_train, random_state=42
)

mi_scores = pd.Series(mi_scores, index=X_all.columns).sort_values(ascending=False)

print("\nMutual Information Scores (All Features):")
print(mi_scores)


Mutual Information Scores (All Features):
tenure_months                   0.083746
monthly_charges                 0.026168
internet_service_DSL            0.025982
support_tickets_30d             0.023648
payment_method_Card             0.010198
contract_type_Month-to-month    0.009576
payment_method_NetBanking       0.006327
payment_method_Cash             0.005914
internet_service_Fiber          0.002763
payment_method_UPI              0.001646
contract_type_Two year          0.001325
contract_type_One year          0.000000
late_payments_6m                0.000000
avg_weekly_app_minutes          0.000000
internet_service_None           0.000000
dtype: float64


In [20]:
# -----------------------------
# 8) Select Top-K Features (Filter step)
# -----------------------------
TOP_K = 10
selected_features = mi_scores.head(TOP_K).index.tolist()

print(f"\nTop {TOP_K} Selected Features:")
for i, f in enumerate(selected_features, 1):
    print(f"{i:2d}. {f}")


Top 10 Selected Features:
 1. tenure_months
 2. monthly_charges
 3. internet_service_DSL
 4. support_tickets_30d
 5. payment_method_Card
 6. contract_type_Month-to-month
 7. payment_method_NetBanking
 8. payment_method_Cash
 9. internet_service_Fiber
10. payment_method_UPI


In [21]:
# -----------------------------
# 9) Prepare Clean Feature Set
# -----------------------------
X_train_selected = X_all[selected_features]
X_test_selected = pd.concat(
    [
        X_test[numeric_features],
        pd.DataFrame(
            encoder.transform(X_test[categorical_features]),
            columns=cat_feature_names,
            index=X_test.index
        )
    ],
    axis=1
)[selected_features]

print("\nShape before selection:", X_all.shape)
print("Shape after selection:", X_train_selected.shape)

print("\nFeature selection complete.")
print("Clean input is now ready for modeling.")


Shape before selection: (900, 15)
Shape after selection: (900, 10)

Feature selection complete.
Clean input is now ready for modeling.
