In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif

# Location of the input dataset, given relative to the working directory the
# notebook is run from. The file is read as a semicolon-separated CSV.
in_path = "data/dropoutgraduate.csv"

In [2]:
# Load the dataset; fields in this file are separated by semicolons
df_in = pd.read_csv(in_path, sep=";")

# Locate the label column: the first header that equals "target" once
# surrounding whitespace is stripped and letter case is ignored
target_col = None
for column_name in df_in.columns:
    if column_name.strip().lower() == "target":
        target_col = column_name
        break

# Without a label column nothing downstream can run, so fail loudly here
if target_col is None:
    raise KeyError("Couldn't find a 'Target' column (case-insensitive) in the encoded CSV.")

In [3]:
# Labels: parse the target column as numbers (any unparseable value raises)
# and store them as integers
target_numeric = pd.to_numeric(df_in[target_col], errors="raise")
y = target_numeric.astype(int)

# Features: every column of the input frame other than the target
X = df_in.drop(columns=[target_col])

In [4]:
# One-hot encode non-numeric features so every column is numeric.
# drop_first=True drops one level per categorical feature to avoid
# perfectly collinear dummy columns.
X_enc = pd.get_dummies(X, drop_first=True)

# Clean the data in two explicit steps. Infinite values must become NaN
# *before* the medians are computed; otherwise a column containing +/-inf can
# have an infinite median, and that inf would be written into the NaN cells.
X_enc = X_enc.replace([np.inf, -np.inf], np.nan)

# Medians skip NaN, so each fill value is the median of the finite entries.
X_enc = X_enc.fillna(X_enc.median(numeric_only=True))

In [5]:
# --- ANOVA F-test and effect sizes (eta2, omega2) ---
# One-way ANOVA of every encoded feature against the class labels.
F, p = f_classif(X_enc, y)

# Degrees of freedom for the effect-size formulas
k = y.nunique()  # Number of groups/classes
n = len(y)       # Total number of samples
df_between = k - 1
df_within  = n - k

# F is +inf when a feature has zero within-class variance but differs between
# classes (perfect separation). Both formulas below would then evaluate
# inf/inf = NaN, and the strongest possible feature would silently fail the
# later "> threshold" filter. Remember those positions and assign the limiting
# value 1.0 instead.
perfect_separation = np.isinf(F)

with np.errstate(invalid="ignore"):  # silence the inf/inf warning handled below
    # eta^2 = SS_between / SS_total, rewritten in terms of F
    eta2 = (F * df_between) / (F * df_between + df_within)
    # omega^2 = (SS_between - df_between * MS_within) / (SS_total + MS_within),
    # rewritten in terms of F
    omega2 = (df_between * (F - 1)) / (df_between * F + df_within + 1)

eta2 = np.where(perfect_separation, 1.0, eta2)
omega2 = np.where(perfect_separation, 1.0, omega2)

# omega^2 goes negative whenever F < 1; clip those values to 0
omega2 = np.clip(omega2, a_min=0, a_max=None)

# Store metrics in a DataFrame for easy filtering (one row per encoded column).
# NaN entries (e.g. from a NaN F statistic) are kept and fail any ">" filter.
metrics = pd.DataFrame({
    "feature": X_enc.columns,
    "eta2": eta2,
    "omega2": omega2,
})

In [6]:
# Keep a feature only when both effect-size measures exceed 0.06
# (rows with NaN metrics compare False and are therefore excluded)
passes_eta2 = metrics["eta2"] > 0.06
passes_omega2 = metrics["omega2"] > 0.06
selected = metrics.loc[passes_eta2 & passes_omega2, "feature"].tolist()

# An empty selection would produce a file with no features, so stop here
if not selected:
    raise ValueError("No features passed eta2>0.06 and omega2>0.06. "
                     "Lower the thresholds or review preprocessing.")

In [7]:
# Create the output DataFrame with only the selected features
df_out = X_enc[selected].copy()

# --- Find where the Target column should be re-inserted ---
# Original index of the Target column in the input file
orig_target_idx = df_in.columns.get_loc(target_col)
# Features that were originally to the left of the Target
left_of_target_features = list(df_in.columns[:orig_target_idx])

# Map each original feature to the encoded column(s) derived from it.
# Every encoded column is assigned to exactly ONE owner:
#   * a column whose name is also an original feature name passed through
#     get_dummies unchanged and belongs to that feature;
#   * any other column is a dummy named "<feature>_<level>" and belongs to the
#     encoded feature with the longest matching "<feature>_" prefix.
# A plain startswith() scan per feature would attribute one column to several
# features whenever one feature name is a prefix of another (e.g. "Age" and
# "Age_group"), double-counting it below and possibly pushing `pos` past the
# number of columns in df_out, which makes DataFrame.insert raise.
enc_cols = X_enc.columns.tolist()
passthrough_cols = set(X.columns) & set(enc_cols)
encoded_features = [feat for feat in X.columns if feat not in passthrough_cols]

feature_to_encoded = {feat: [] for feat in X.columns}
for col in enc_cols:
    if col in passthrough_cols:
        feature_to_encoded[col].append(col)
        continue
    owners = [feat for feat in encoded_features if col.startswith(f"{feat}_")]
    if owners:
        feature_to_encoded[max(owners, key=len)].append(col)

# Count how many of the *selected* encoded columns came from features to the
# left of Target; that count is the insertion index for the Target column.
selected_set = set(selected)
pos = sum(
    1
    for feat in left_of_target_features
    for col in feature_to_encoded.get(feat, [])
    if col in selected_set
)

In [8]:
# Insert the Target column back into the DataFrame at the determined position.
# DataFrame.insert mutates df_out in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run without
# first re-executing the cell that rebuilds df_out.
if target_col not in df_out.columns:
    df_out.insert(pos, target_col, y.values)

# Write the reduced dataset using the same semicolon separator as the input
out_path = "data/ExtractedA.csv"
df_out.to_csv(out_path, sep=";", index=False, encoding="utf-8")

print(f"Selected {len(selected)} features. Inserted Target at index {pos}. New file -> {out_path}")

Selected 9 features. Inserted Target at index 9. New file -> data/ExtractedA.csv


In [9]:
# Show the names of the encoded columns that passed both effect-size thresholds
print(selected)

['Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)']
