In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2
from scipy import sparse

# Load the dataset with every column read as a string (dtype=str), so pandas
# does no per-column type inference.
df = pd.read_csv("Dataset.csv", dtype=str, low_memory=False)

# Rows without a target cannot be used; drop them, then make the label an int.
df = df.dropna(subset=["Default"])
df["Default"] = df["Default"].astype(int)

# Optional: work on a reproducible random subsample for a faster chi2 test.
# The size is capped at the number of available rows because DataFrame.sample
# raises ValueError when n exceeds the row count (sampling without replacement).
df_sample = df.sample(n=min(15000, len(df)), random_state=42)  # reduce if needed

# Response vector for the chi² test.
y = df_sample["Default"]

# Columns left out of the test (optional optimization for known heavy columns).
exclude_cols = ["Population_Region_Relative", "Type_Organization"]

# Candidate categorical features: object-dtype columns minus the exclusions.
# NOTE(review): the CSV is loaded with dtype=str, so this presumably selects
# every non-target column, numeric ones included — confirm that is intended.
cat_cols = [
    col
    for col in df_sample.select_dtypes(include="object").columns.tolist()
    if col not in exclude_cols
]

# Treat missing values as their own "missing" category, then one-hot encode
# into a sparse matrix.
X_cat = df_sample[cat_cols].fillna("missing").astype(str)
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
X_encoded_sparse = encoder.fit_transform(X_cat)

# Chi² statistic and p-value of every one-hot column against the target,
# plus the generated name of each of those columns.
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
chi2_scores, p_values = chi2(X_encoded_sparse, y)

# Assemble chi² results: one row per one-hot encoded column.
chi2_df = pd.DataFrame({
    "encoded_feature": encoded_feature_names,
    "chi2_score": chi2_scores,
    "p_value": p_values,
})

# Map every encoded column back to the feature it came from.
# Parsing the encoded name (dropping the last "_" token) mis-groups any
# feature whose category value itself contains an underscore, so the mapping
# is taken from the encoder instead: it emits one output column per entry of
# encoder.categories_, feature by feature, in the order of cat_cols.
chi2_df["feature"] = [
    col
    for col, categories in zip(cat_cols, encoder.categories_)
    for _ in categories
]

# Per-feature summary: number of encoded categories, how many of them are
# individually significant at p < 0.05, and the mean chi² score.
# NOTE(review): the 0.05 threshold is applied per category with no
# multiple-comparison correction — confirm this is acceptable for
# high-cardinality features.
feature_summary = chi2_df.groupby("feature").agg(
    num_categories=("encoded_feature", "count"),
    num_significant=("p_value", lambda x: (x < 0.05).sum()),
    avg_chi2_score=("chi2_score", "mean"),
).reset_index()

# A feature is kept as soon as at least one of its categories is significant.
feature_summary["action"] = feature_summary["num_significant"].apply(
    lambda x: "Keep" if x > 0 else "Drop"
)

# Report: full per-feature table, strongest average chi² score first.
print("\n=== Full Categorical Feature Signal Summary ===")
print(
    feature_summary.sort_values(
        "avg_chi2_score",
        ascending=False,
    )
)

# Features with no significant category at all.
drop_features = feature_summary.loc[
    feature_summary["action"].eq("Drop"), "feature"
].tolist()
print("\n❌ Recommended to DROP (No significant predictive power):")
for feat in drop_features:
    print(f" - {feat}")

# Features with at least one significant category.
keep_features = feature_summary.loc[
    feature_summary["action"].eq("Keep"), "feature"
].tolist()
print("\n✅ Recommended to KEEP (Has predictive signal):")
for feat in keep_features:
    print(f" - {feat}")