In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# ===== Selection thresholds (adjust to taste) =====
# A feature becomes a candidate only when |corr(feature, Target)| >= this value.
target_corr_threshold = 0.3
# A candidate is discarded when its |corr| with an already-selected feature is > this value.
feature_corr_threshold = 0.95

# ===== File locations =====
# Input table; edit the path if the data lives elsewhere.
in_path = "data/dropoutgraduate.csv"
# Destination CSV; Target is written as the last (right-most) column.
out_path = "data/ExtractedP.csv"

In [2]:
# ===== Load =====
# Read the semicolon-delimited input table into a DataFrame.
df = pd.read_csv(in_path, delimiter=";")

# Message buffer for the preprocessing log; later steps append to it only
# when they actually change something.
logs = list()

In [3]:
# ===== Step 1: Locate / sanitize Target =====
# Find the target column by name, ignoring case and surrounding whitespace.
target_col = next((c for c in df.columns if c.strip().lower() == "target"), None)
if target_col is None:
    raise KeyError("Couldn't find a 'Target' column (case-insensitive).")

y_raw = df[target_col]
# Anything that cannot be parsed as a number becomes NaN here.
y_num = pd.to_numeric(y_raw, errors="coerce")

# Map non-numeric labels to numeric codes if present
if y_num.isna().any():
    label_to_code = {"dropout": 0, "graduate": 1, "enrolled": 2}
    mapped = y_raw.astype(str).str.strip().str.lower().map(label_to_code)
    # Only rows that failed numeric parsing are filled from the label map.
    # Replacing y_num wholesale would turn already-numeric rows into NaN
    # whenever the column mixes numbers and labels.
    newly_mapped = y_num.isna() & mapped.notna()
    n_mapped = int(newly_mapped.sum())
    if n_mapped > 0:
        logs.append(f"[Step 1] Detected non-numeric Target values. Mapped labels to codes for {int(n_mapped)} rows.")
        y_num = y_num.fillna(mapped)

# Fail if unknown remain
if y_num.isna().any():
    unknown = sorted(pd.Series(y_raw[y_num.isna()].unique()).astype(str).tolist())
    raise ValueError(f"[Step 1] Unrecognized Target values: {unknown}. Please clean them first.")

# Refuse fractional codes instead of letting astype(int) silently truncate
# them (e.g. 0.7 -> 0), which would relabel rows without any trace.
non_integral = y_num[(y_num % 1) != 0]
if not non_integral.empty:
    bad_values = sorted(non_integral.unique().tolist())
    raise ValueError(f"[Step 1] Non-integer Target values: {bad_values}. Please clean them first.")

df[target_col] = y_num.astype(int)

# Keep only binary {0,1} for point-biserial context; every other code is dropped
n_before = len(df)
is_binary = df[target_col].isin([0, 1])
# Report the codes that were actually removed rather than assuming they are all 2.
dropped_codes = sorted(df.loc[~is_binary, target_col].unique().tolist())
df = df[is_binary].copy()
removed = n_before - len(df)
if removed > 0:
    dropped_label = ", ".join(str(code) for code in dropped_codes)
    logs.append(f"[Step 1] Removed {removed} rows with Target={dropped_label} to keep a binary target (0/1).")

y = df[target_col].astype(int)

In [4]:
# ===== Step 2: Feature encoding & cleaning =====
# Everything except the target column is treated as a feature.
X = df.drop(columns=[target_col])

# Split column names into numeric and non-numeric groups (used for logging only).
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [name for name in X.columns if name not in num_cols]

# One-hot encode, dropping the first level of each encoded column.
X_enc = pd.get_dummies(X, drop_first=True)

# Log how many columns the encoding added on top of the numeric ones (if any).
enc_total = len(X_enc.columns)
orig_num = len(num_cols)
dummy_added = enc_total - orig_num
if dummy_added > 0:
    logs.append(f"[Step 2] One-hot encoded {len(cat_cols)} categorical column(s), added {dummy_added} dummy feature(s).")

# Count +/-inf entries among the numeric columns and turn them into NaN.
if enc_total > 0:
    numeric_values = X_enc.select_dtypes(include=[np.number]).to_numpy()
    inf_count = int(np.isinf(numeric_values).sum())
else:
    inf_count = 0
if inf_count > 0:
    logs.append(f"[Step 2] Found {inf_count} ±inf value(s); replaced with NaN.")
    X_enc = X_enc.replace([np.inf, -np.inf], np.nan)

# Impute remaining gaps with each column's median.
nan_before = int(X_enc.isna().sum().sum())
if nan_before > 0:
    logs.append(f"[Step 2] Detected {nan_before} missing value(s); filled with column medians.")
    column_medians = X_enc.median(numeric_only=True)
    X_enc = X_enc.fillna(column_medians)

# Columns with at most one distinct value carry no signal; remove them.
is_constant = X_enc.nunique() <= 1
const_cols = X_enc.columns[is_constant].tolist()
if const_cols:
    # Show at most ten names in the log, with an ellipsis when truncated.
    preview = ", ".join(const_cols[:10])
    if len(const_cols) > 10:
        preview += "…"
    logs.append(f"[Step 2] Dropped {len(const_cols)} constant column(s): {preview}")
    X_enc = X_enc.drop(columns=const_cols)

# Nothing left to correlate: stop here with an explicit error.
if len(X_enc.columns) == 0:
    raise ValueError("[Step 2] No usable (non-constant) features after encoding/cleaning.")

In [5]:
# ===== Print preprocessing log =====
# Emit every recorded message on its own line, or a placeholder when
# Steps 1 and 2 recorded nothing.
print("=== Preprocessing Log ===")
if not logs:
    print("No actions were performed in Step 1 and Step 2.")
else:
    print(*logs, sep="\n")
print("=== End of Log ===\n")

=== Preprocessing Log ===
No actions were performed in Step 1 and Step 2.
=== End of Log ===



In [6]:
# ===== Step 3: Pearson correlation vs Target =====
# Pearson r of each feature column against y (for a binary y this is the
# point-biserial correlation).
corr_with_target = X_enc.apply(lambda column: column.corr(y))

# One row per feature, ranked by absolute correlation, strongest first.
corr_df = pd.DataFrame({"feature": X_enc.columns, "r": corr_with_target.values})
corr_df["abs_r"] = corr_df["r"].abs()
corr_df = corr_df.sort_values("abs_r", ascending=False)

print(f"Total usable features: {X_enc.shape[1]}")
print("Top 15 by |r(Target)|:")
top_rows = corr_df.head(15)[["feature", "r"]]
print(top_rows.to_string(index=False))

Total usable features: 36
Top 15 by |r(Target)|:
                               feature         r
   Curricular units 2nd sem (approved)  0.653995
      Curricular units 2nd sem (grade)  0.605350
   Curricular units 1st sem (approved)  0.554881
      Curricular units 1st sem (grade)  0.519927
               Tuition fees up to date  0.442138
                    Scholarship holder  0.313018
                     Age at enrollment -0.267229
                                Debtor -0.267207
                                Gender -0.251955
                      Application mode -0.244507
   Curricular units 2nd sem (enrolled)  0.182897
   Curricular units 1st sem (enrolled)  0.161074
                       Admission grade  0.128058
                             Displaced  0.126113
Curricular units 2nd sem (evaluations)  0.119239


In [7]:
# ===== Step 4: Select by |r(Target)| and remove redundancy =====
# Candidates are the features whose |r| with Target reaches the threshold,
# kept in the ranking order of corr_df.
passes_target = corr_df["abs_r"] >= target_corr_threshold
candidates = corr_df.loc[passes_target, "feature"].tolist()
if not candidates:
    raise ValueError(f"No features pass |r(Target)| >= {target_corr_threshold}. "
                     "Lower target_corr_threshold or check preprocessing.")

# Absolute pairwise correlations among the candidates only.
X_cand = X_enc[candidates]
corr_mat = X_cand.corr().abs()

# Greedy pass in ranking order: keep a feature unless it clashes with one
# that was already kept.
selected = []
ranked_candidates = corr_df.loc[corr_df["feature"].isin(candidates), "feature"]
for feat in ranked_candidates:
    # Written as "not (<=)" rather than ">" so that a NaN correlation still
    # counts as a clash, exactly like an all(... <= threshold) check would.
    clashes = any(not (corr_mat.loc[feat, s] <= feature_corr_threshold) for s in selected)
    if not clashes:
        selected.append(feat)

# Defensive guard: the first ranked candidate is always accepted because
# `selected` is empty at that point, so this is not expected to trigger.
if not selected:
    raise ValueError(f"All candidates were removed by inter-feature correlation > {feature_corr_threshold}. "
                     "Increase feature_corr_threshold.")

In [8]:
# ===== Step 5: Save selected features + Target AS LAST COLUMN =====
# Copy the selected feature columns so the new column does not touch X_enc.
df_out = X_enc[selected].copy()
# Assigning the raw values appends Target positionally, without index alignment.
df_out[target_col] = y.values  # append Target to the right-most position
# to_csv does not create missing folders; make sure the destination exists
# so a changed out_path does not fail at the very last step.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(out_path, sep=";", index=False, encoding="utf-8")

In [10]:
# ===== Summary =====
# Recap of the selection: candidate count, final count, output path, and a
# preview of the chosen feature names.
print("\n=== Selection Summary ===")
print(f"Candidates by |r(Target)| ≥ {target_corr_threshold}: {len(candidates)}")
print(f"Selected after redundancy filter (|corr| ≤ {feature_corr_threshold}): {len(selected)}")
print(f"Saved CSV (features + Target as last column) -> {out_path}")
# The preview is capped at 20 names, so the header reports the number that is
# actually listed rather than the total number of selected features.
shown = selected[:20]
print(f"First {len(shown)} selected features:")
print(pd.Series(shown).to_string(index=False))


=== Selection Summary ===
Candidates by |r(Target)| ≥ 0.3: 6
Selected after redundancy filter (|corr| ≤ 0.95): 6
Saved CSV (features + Target as last column) -> data/ExtractedP.csv
First 6 selected features:
Curricular units 2nd sem (approved)
   Curricular units 2nd sem (grade)
Curricular units 1st sem (approved)
   Curricular units 1st sem (grade)
            Tuition fees up to date
                 Scholarship holder
