In [2]:
# ==============================================================================
# Cell 1: Setup, Imports, and Global Configuration
# ==============================================================================

# Step 0: Install necessary packages quietly
!pip install pandas openpyxl xgboost tensorflow > /dev/null
print("✅ Packages installed successfully.")

# Step 1: Import all required libraries
import pandas as pd
import numpy as np
import os
import sys
import time
import joblib
import warnings
import random

# Scikit-learn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import f_classif, chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    log_loss,
)

# Model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# TensorFlow / Keras imports
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, regularizers
except ImportError:
    print("TensorFlow is not installed. The MLP model will be skipped.")
    tf = None

# Global settings for reproducibility: one seed shared by Python, NumPy and TensorFlow.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects processes spawned later -- confirm if hash order matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
if tf is not None:
    tf.random.set_seed(SEED)
    # Op-level determinism is left disabled; the call is kept below for reference.
    # NOTE(review): per the TensorFlow docs this is the API for requesting deterministic
    # ops, and seeding alone may not guarantee bit-identical runs -- confirm for the
    # installed TF version before relying on exact reproducibility.
    # tf.config.experimental.enable_op_determinism()

# Suppress warnings for cleaner output (this silences every Python warning globally)
warnings.filterwarnings('ignore')

# Create directories to store results ("models" is not written to by the code visible here)
os.makedirs("data", exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("feature_selected_csvs", exist_ok=True)

print("✅ Libraries imported and global settings configured.")

The system cannot find the path specified.


✅ Packages installed successfully.
✅ Libraries imported and global settings configured.


In [3]:
# ==============================================================================
# Cell 2: Data Preprocessing Function
# ==============================================================================
def preprocess_dataframe(df_in):
    """
    Performs essential preprocessing: one-hot encoding for categoricals
    and median imputation for missing numerical values.

    Args:
        df_in: DataFrame containing the features plus a column whose name,
            ignoring case and surrounding whitespace, is "target".

    Returns:
        A new DataFrame with the encoded/imputed features followed by the
        target column (under its original name), or None if any step fails
        (the error is printed rather than raised).
    """
    print("--- Running initial data preprocessing ---")
    try:
        # Find the target column, ignoring case and whitespace
        target_col = next((c for c in df_in.columns if c.strip().lower() == "target"), None)
        if target_col is None:
            raise KeyError("Couldn't find 'Target' column for preprocessing.")

        X = df_in.drop(columns=[target_col])
        y = df_in[target_col]

        # One-hot encode categorical features (if any exist)
        X_enc = pd.get_dummies(X, drop_first=True)

        # Turn +/-inf into NaN FIRST, then take the medians of the cleaned frame.
        # Computing the medians before the replacement lets infinite values skew
        # (or become) the fill value.
        X_enc = X_enc.replace([np.inf, -np.inf], np.nan)
        X_enc = X_enc.fillna(X_enc.median(numeric_only=True))

        # Recombine features and target
        df_processed = X_enc.copy()
        df_processed[target_col] = y.values
        print(f"Preprocessing complete. Shape changed from {df_in.shape} to {df_processed.shape}.")
        return df_processed
    except Exception as e:
        # Best-effort by design: the caller checks for None and halts.
        print(f"❌ Error during preprocessing: {e}")
        return None

In [4]:
# ==============================================================================
# Cell 3: Feature Selection Functions
# ==============================================================================
def run_anova_selection(df_in, eta2_thresh, omega2_thresh):
    """Select features by ANOVA effect size and write them to a CSV.

    Eta-squared and omega-squared are derived from the per-feature F statistic
    returned by `f_classif`; a feature is kept when both exceed their
    thresholds.

    Returns:
        (path of the written CSV, number of selected features), or (None, 0)
        when nothing is selected or any step raises (the error is printed).
    """
    out_path = f"feature_selected_csvs/FS_ANOVA_eta2_{eta2_thresh}_omega2_{omega2_thresh}.csv"
    print(f"\n--- Running ANOVA (eta2>{eta2_thresh}, omega2>{omega2_thresh}) ---")
    try:
        # Locate the target column, ignoring case and surrounding whitespace.
        target_col = None
        for name in df_in.columns:
            if name.strip().lower() == "target":
                target_col = name
                break
        if target_col is None:
            raise KeyError("Couldn't find 'Target' column.")

        features = df_in.drop(columns=[target_col])  # data is already encoded
        labels = pd.to_numeric(df_in[target_col], errors="raise").astype(int)

        f_stat, _p_values = f_classif(features, labels)

        n_classes = labels.nunique()
        n_samples = len(labels)
        dof_between = n_classes - 1
        dof_within = n_samples - n_classes

        eta2 = (f_stat * dof_between) / (f_stat * dof_between + dof_within)
        omega2_raw = (dof_between * (f_stat - 1)) / (dof_between * f_stat + dof_within + 1)
        omega2 = np.clip(omega2_raw, 0, None)  # negative estimates are floored at 0

        effect_sizes = pd.DataFrame({"feature": features.columns, "eta2": eta2, "omega2": omega2})
        keep = (effect_sizes["eta2"] > eta2_thresh) & (effect_sizes["omega2"] > omega2_thresh)
        selected = effect_sizes.loc[keep, "feature"].tolist()

        if len(selected) == 0:
            print("Result: No features selected. Skipping file generation.")
            return None, 0

        subset = features[selected].copy()
        subset[target_col] = labels.values
        subset.to_csv(out_path, sep=";", index=False)
        print(f"Result: Selected {len(selected)} features. Saved to {out_path}")
        return out_path, len(selected)
    except Exception as e:
        print(f"Error during ANOVA selection: {e}")
        return None, 0

def run_pearson_selection(df_in, target_corr_thresh):
    """Select features by absolute Pearson correlation with the target.

    Rows whose target is not 0 or 1 (after numeric coercion) are dropped,
    constant features are ignored, and a feature is kept when its absolute
    correlation with the target is at least `target_corr_thresh`.

    Returns:
        (path of the written CSV, number of selected features), or (None, 0)
        when nothing is selected or any step raises (the error is printed).
    """
    out_path = f"feature_selected_csvs/FS_Pearson_targetCorr_{target_corr_thresh}.csv"
    print(f"\n--- Running Pearson's (|r|>{target_corr_thresh}) ---")
    try:
        work = df_in.copy()

        # Locate the target column, ignoring case and surrounding whitespace.
        target_col = None
        for name in work.columns:
            if name.strip().lower() == "target":
                target_col = name
                break
        if target_col is None:
            raise KeyError("Couldn't find 'Target' column.")

        # Unparseable targets become -1 and are removed by the 0/1 filter below.
        numeric_target = pd.to_numeric(work[target_col], errors='coerce')
        work[target_col] = numeric_target.fillna(-1).astype(int)
        work = work[work[target_col].isin([0, 1])].copy()

        labels = work[target_col]
        features = work.drop(columns=[target_col])  # data is already encoded
        varying = features.nunique() > 1
        features = features.loc[:, varying]

        abs_corr = features.corrwith(labels).abs()
        selected = [col for col, r in abs_corr.items() if r >= target_corr_thresh]

        if len(selected) == 0:
            print("Result: No features selected. Skipping file generation.")
            return None, 0

        subset = features[selected].copy()
        subset[target_col] = labels.values
        subset.to_csv(out_path, sep=";", index=False)
        print(f"Result: Selected {len(selected)} features. Saved to {out_path}")
        return out_path, len(selected)
    except Exception as e:
        print(f"Error during Pearson selection: {e}")
        return None, 0

def run_chi2_selection(df_in, p_value_thresh):
    """Select features whose Chi-Square p-value is below `p_value_thresh`.

    Features are min-max scaled before the test; the CSV that is written
    contains the selected columns in their original, unscaled form.

    Returns:
        (path of the written CSV, number of selected features), or (None, 0)
        when nothing is selected or any step raises (the error is printed).
    """
    out_path = f"feature_selected_csvs/FS_ChiSquare_p_{p_value_thresh}.csv"
    print(f"\n--- Running Chi-Square (p<{p_value_thresh}) ---")
    try:
        # Locate the target column, ignoring case and surrounding whitespace.
        target_col = None
        for name in df_in.columns:
            if name.strip().lower() == "target":
                target_col = name
                break
        if target_col is None:
            raise KeyError("Couldn't find 'Target' column.")

        features = df_in.drop(columns=[target_col])  # data is already encoded/numeric
        labels = df_in[target_col]

        scaled_values = MinMaxScaler().fit_transform(features)
        features_scaled = pd.DataFrame(scaled_values, columns=features.columns)

        p_values = chi2(features_scaled, labels)[1]
        table = pd.DataFrame({"Feature": features.columns, "p-value": p_values})
        selected = table.loc[table["p-value"] < p_value_thresh, "Feature"].tolist()

        if len(selected) == 0:
            print("Result: No features selected. Skipping file generation.")
            return None, 0

        subset = features[selected].copy()
        subset[target_col] = labels.values
        subset.to_csv(out_path, sep=";", index=False)
        print(f"Result: Selected {len(selected)} features. Saved to {out_path}")
        return out_path, len(selected)
    except Exception as e:
        print(f"Error during Chi-Square selection: {e}")
        return None, 0

In [8]:
# ==============================================================================
# Cell 4: Machine Learning Model Functions (Scikit-learn & XGBoost)
# ==============================================================================

def _probability_metrics(model, X, y):
    """Return (log_loss, roc_auc) for one split; each entry is None if it cannot be computed."""
    try:
        proba = model.predict_proba(X)
    except Exception:
        # Model exposes no usable probabilities: neither metric is available.
        return None, None
    # The two metrics are guarded separately so that a failure in one
    # does not discard a valid value of the other.
    try:
        split_log_loss = log_loss(y, proba)
    except Exception:
        split_log_loss = None
    try:
        split_roc_auc = roc_auc_score(y, proba[:, 1])
    except Exception:
        split_roc_auc = None
    return split_log_loss, split_roc_auc


def calculate_sklearn_metrics(model, X_train, y_train, X_test, y_test):
    """Compute train/test metrics for a fitted scikit-learn style classifier.

    Args:
        model: fitted estimator exposing `predict` and, optionally, `predict_proba`.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        dict with the keys 'Train_Accuracy', 'Train_LogLoss', 'Train_ROC_AUC',
        'Test_Accuracy', 'Test_LogLoss', 'Test_ROC_AUC',
        'Test_Classification_Report' (the dict produced by
        `classification_report(..., output_dict=True)`) and
        'Test_Confusion_Matrix' (the matrix as a string of nested lists).
        Log-loss / ROC-AUC entries are None when they cannot be computed.
    """
    results = {}

    y_train_pred = model.predict(X_train)
    results['Train_Accuracy'] = accuracy_score(y_train, y_train_pred)
    results['Train_LogLoss'], results['Train_ROC_AUC'] = _probability_metrics(model, X_train, y_train)

    y_test_pred = model.predict(X_test)
    results['Test_Accuracy'] = accuracy_score(y_test, y_test_pred)
    results['Test_LogLoss'], results['Test_ROC_AUC'] = _probability_metrics(model, X_test, y_test)

    results['Test_Classification_Report'] = classification_report(y_test, y_test_pred, output_dict=True)
    results['Test_Confusion_Matrix'] = str(confusion_matrix(y_test, y_test_pred).tolist())
    return results

def train_evaluate_rf(X_train, y_train, X_test, y_test):
    """Fit a regularised RandomForestClassifier and return its train/test metrics.

    Depth and leaf-size limits are set to curb overfitting; classes are
    weighted with "balanced". The metrics dict comes from
    `calculate_sklearn_metrics`.
    """
    print("Training RandomForest...")
    forest_params = dict(
        n_estimators=200,
        max_depth=8,
        min_samples_leaf=5,
        min_samples_split=10,
        max_features='sqrt',
        class_weight="balanced",
        random_state=SEED,
        n_jobs=-1,
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_train)
    return calculate_sklearn_metrics(forest, X_train, y_train, X_test, y_test)

def train_evaluate_dt(X_train, y_train, X_test, y_test):
    """Tune a scaled DecisionTreeClassifier with a 3-fold grid search and score the best one.

    The grid covers tree depth, minimum split size and class weighting, ranked
    by weighted F1. The metrics dict comes from `calculate_sklearn_metrics`.
    """
    print("Training Decision Tree with GridSearchCV (this may take a while)...")
    tree_pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", DecisionTreeClassifier(random_state=SEED)),
    ])
    search_space = {
        "clf__max_depth": [5, 10],
        "clf__min_samples_split": [2, 10],
        "clf__class_weight": [None, "balanced"],
    }
    search = GridSearchCV(tree_pipeline, search_space, scoring="f1_weighted", cv=3, n_jobs=-1)
    search.fit(X_train, y_train)
    best_pipeline = search.best_estimator_
    return calculate_sklearn_metrics(best_pipeline, X_train, y_train, X_test, y_test)

def train_evaluate_lr(X_train, y_train, X_test, y_test):
    """Fit a standardised, class-balanced LogisticRegression and return its train/test metrics."""
    print("Training Logistic Regression...")
    classifier = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=SEED)
    lr_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', classifier),
    ])
    lr_pipeline.fit(X_train, y_train)
    return calculate_sklearn_metrics(lr_pipeline, X_train, y_train, X_test, y_test)

def train_evaluate_xgb(X_train, y_train, X_test, y_test):
    """Trains and evaluates an XGBClassifier with robust anti-overfitting parameters.

    Class imbalance is handled through `scale_pos_weight`, set to the ratio of
    negative (0) to positive (1) training labels, or 1 when there are no
    positives.

    Returns:
        The metrics dict produced by `calculate_sklearn_metrics`.
    """
    print("Training XGBoost...")
    neg, pos = np.sum(y_train == 0), np.sum(y_train == 1)
    scale_pos_weight = neg/pos if pos > 0 else 1
    # `use_label_encoder` is intentionally not passed: it is deprecated, and
    # recent XGBoost releases report it as an unused parameter on every fit.
    model = XGBClassifier(
        objective="binary:logistic", eval_metric='logloss',
        n_estimators=300, learning_rate=0.05, max_depth=5, subsample=0.8,
        colsample_bytree=0.8, gamma=0.1, reg_alpha=0.1, reg_lambda=0.1,
        random_state=SEED, n_jobs=-1, scale_pos_weight=scale_pos_weight
    )
    model.fit(X_train, y_train)
    return calculate_sklearn_metrics(model, X_train, y_train, X_test, y_test)

def train_evaluate_svm(X_train, y_train, X_test, y_test):
    """Fit a standardised, class-balanced SVC (probabilities enabled) and return its train/test metrics."""
    print("Training SVM...")
    classifier = SVC(class_weight='balanced', probability=True, random_state=SEED)
    svm_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', classifier),
    ])
    svm_pipeline.fit(X_train, y_train)
    return calculate_sklearn_metrics(svm_pipeline, X_train, y_train, X_test, y_test)

In [5]:
# ==============================================================================
# Cell 5: Machine Learning Model Function (TensorFlow/Keras)
# ==============================================================================

def train_evaluate_mlp(X_train, y_train, X_test, y_test, val_fraction=0.2):
    """Trains and evaluates a seeded Multi-Layer Perceptron (TensorFlow/Keras).

    Early stopping is driven by a validation subset carved out of the training
    data, so the test split is used only for the final evaluation. (Monitoring
    the test split and restoring the best weights would select the model on
    the test data and inflate the reported test metrics.)

    Args:
        X_train, y_train: training features and binary (0/1) labels.
        X_test, y_test: held-out features and labels, used for evaluation only.
        val_fraction: share of the training data reserved for early stopping.

    Returns:
        dict with 'Train_*' / 'Test_*' accuracy, log-loss and ROC-AUC, plus
        'Test_Classification_Report' and 'Test_Confusion_Matrix'. Train metrics
        cover the whole training split (fit + validation rows). Returns an
        empty dict when TensorFlow is not available.
    """
    if tf is None:
        return {}
    print("Training MLP (TensorFlow) with deterministic settings...")

    # Reserve a validation subset for early stopping; stratify when possible.
    try:
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=val_fraction, random_state=SEED, stratify=y_train)
    except ValueError:
        # Stratification can fail when a class has too few samples.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=val_fraction, random_state=SEED)

    # Scaling statistics come from the rows the network is actually fitted on.
    scaler = StandardScaler().fit(X_fit)
    X_fit_scaled = scaler.transform(X_fit)
    X_val_scaled = scaler.transform(X_val)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    y_fit_arr, y_val_arr = np.asarray(y_fit), np.asarray(y_val)
    y_train_arr, y_test_arr = np.asarray(y_train), np.asarray(y_test)

    # Re-set seed specifically for Keras model construction
    tf.random.set_seed(SEED)

    model = models.Sequential([
        layers.Input(shape=(X_fit_scaled.shape[1],)),
        layers.Dense(128, activation='relu'), layers.Dropout(0.5),
        layers.Dense(64, activation='relu'), layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    early_stopping = callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=10, restore_best_weights=True)

    model.fit(X_fit_scaled, y_fit_arr, validation_data=(X_val_scaled, y_val_arr), epochs=100,
              batch_size=64, callbacks=[early_stopping], verbose=0)

    results = {}
    # evaluate() returns [loss, accuracy, auc], following the compile() order above.
    train_loss, train_acc, train_auc = model.evaluate(X_train_scaled, y_train_arr, verbose=0)
    results['Train_Accuracy'], results['Train_LogLoss'], results['Train_ROC_AUC'] = train_acc, train_loss, train_auc
    test_loss, test_acc, test_auc = model.evaluate(X_test_scaled, y_test_arr, verbose=0)
    results['Test_Accuracy'], results['Test_LogLoss'], results['Test_ROC_AUC'] = test_acc, test_loss, test_auc

    y_test_pred_proba = model.predict(X_test_scaled, verbose=0)
    # Flatten the (n, 1) sigmoid output to 1-D labels before scoring.
    y_test_pred_labels = (y_test_pred_proba > 0.5).astype("int32").ravel()
    results['Test_Classification_Report'] = classification_report(y_test_arr, y_test_pred_labels, output_dict=True)
    results['Test_Confusion_Matrix'] = str(confusion_matrix(y_test_arr, y_test_pred_labels).tolist())
    return results

In [11]:
# ==============================================================================
# Cell 6: Main Execution Workflow
# ==============================================================================

def _read_dataset(path, separators=(";", ",")):
    """Read a delimited text file, preferring the first separator that yields more than one column."""
    fallback, last_error = None, None
    for sep in separators:
        try:
            frame = pd.read_csv(path, sep=sep)
        except Exception as exc:
            last_error = exc
            continue
        # A wrong separator usually does not raise: it parses into a single
        # column. Keep trying until a separator actually splits the columns.
        if frame.shape[1] > 1:
            return frame
        if fallback is None:
            fallback = frame
    if fallback is not None:
        return fallback
    if last_error is None:
        raise ValueError("No separators were provided.")
    raise last_error


def _flatten_classification_report(report):
    """Flatten a classification_report dict into 'Test_Report_<label>[_<metric>]' columns."""
    flat = {}
    for class_label, scores in report.items():
        if isinstance(scores, dict):
            for metric, value in scores.items():
                flat[f'Test_Report_{class_label}_{metric}'] = value
        else:
            flat[f'Test_Report_{class_label}'] = scores
    return flat


def main(input_filename=r"data/dropoutgraduate.csv"):
    """Main function to orchestrate the entire workflow.

    Steps: load the CSV, preprocess it once, build the baseline and the
    ANOVA / Pearson / Chi-Square feature-selected datasets, train every model
    on every dataset, and write all metrics to an Excel workbook.

    Args:
        input_filename: path of the ';'- or ','-separated input CSV.

    Returns:
        None. Progress and errors are printed; results are saved to
        "data/ML_Evaluation_Results.xlsx" when at least one model succeeds.
    """
    print("="*60)
    print("      COMPREHENSIVE FEATURE SELECTION & ML PIPELINE")
    print("="*60)
    print("\nSTEP 1: LOADING YOUR DATASET")
    print("-" * 60)
    print(f"Reading dataset from '{input_filename}'.")

    try:
        df_original = _read_dataset(input_filename)
    except Exception as e:
        print(f"\n❌ Error: Could not read the CSV file. Details: {e}")
        return
    # Report success only once the file has actually been parsed.
    print(f"\n✅ Successfully loaded '{input_filename}'.")

    # Preprocess data ONCE at the beginning
    df_processed = preprocess_dataframe(df_original.copy())
    if df_processed is None:
        print("Halting execution due to preprocessing error.")
        return

    print("\nSTEP 2: SETTING UP DATASETS FOR TRAINING")
    print("-" * 60)

    generated_files = []
    try:
        original_data_path = "feature_selected_csvs/FS_None_All_Features.csv"
        # Save the PROCESSED dataframe for the baseline run
        df_processed.to_csv(original_data_path, sep=";", index=False)
        num_processed_features = len(df_processed.columns) - 1
        generated_files.append({
            "path": original_data_path, "method": "None",
            "thresholds": "All Features", "num_features": num_processed_features
        })
        print("\n--- Baseline Added: Using all preprocessed features ---")
    except Exception as e:
        print(f"❌ Error setting up the 'no feature selection' baseline: {e}")

    anova_thresholds = [(0.01, 0.01), (0.04, 0.04), (0.06, 0.06), (0.08, 0.08), (0.14, 0.14)]
    pearson_thresholds = [0.05, 0.08, 0.10, 0.15, 0.30]
    chi2_thresholds = [0.1, 0.07, 0.05, 0.03, 0.01]

    # Run feature selection on the preprocessed dataframe
    for eta2, omega2 in anova_thresholds:
        path, count = run_anova_selection(df_processed.copy(), eta2, omega2)
        if path:
            generated_files.append({"path": path, "method": "ANOVA",
                                    "thresholds": f"eta2>{eta2}, omega2>{omega2}", "num_features": count})
    for thresh in pearson_thresholds:
        path, count = run_pearson_selection(df_processed.copy(), thresh)
        if path:
            generated_files.append({"path": path, "method": "Pearson",
                                    "thresholds": f"|r|>{thresh}", "num_features": count})
    for thresh in chi2_thresholds:
        path, count = run_chi2_selection(df_processed.copy(), thresh)
        if path:
            generated_files.append({"path": path, "method": "Chi-Square",
                                    "thresholds": f"p<{thresh}", "num_features": count})

    if not generated_files:
        print("\n❌ Error: No datasets to process. Halting.")
        return

    print(f"\n✅ Setup complete. Processing {len(generated_files)} unique datasets (including baseline).")

    print("\nSTEP 3: TRAINING AND EVALUATING ML MODELS")
    print("-" * 60)
    all_results = []
    models_to_run = {
        "RandomForest": train_evaluate_rf, "DecisionTree_GridSearchCV": train_evaluate_dt,
        "LogisticRegression": train_evaluate_lr, "XGBoost": train_evaluate_xgb,
        "SVM": train_evaluate_svm, "MLP_TensorFlow": train_evaluate_mlp,
    }
    total_tasks = len(generated_files) * len(models_to_run)
    tasks_completed = 0
    overall_start_time = time.time()
    for i, file_info in enumerate(generated_files):
        print(f"\nProcessing Dataset {i+1}/{len(generated_files)}: [Method: {file_info['method']}, Thresholds: {file_info['thresholds']}]")
        df_fs = pd.read_csv(file_info['path'], sep=";")
        target_col = next((c for c in df_fs.columns if c.strip().lower() == "target"), None)
        X = df_fs.drop(columns=[target_col])
        # Unparseable targets become -1 and are dropped by the 0/1 filter below.
        y = pd.to_numeric(df_fs[target_col], errors='coerce').fillna(-1).astype(int)
        valid_indices = y.isin([0, 1])
        X, y = X[valid_indices], y[valid_indices]
        if len(y.unique()) < 2:
            print(f"  Skipping this dataset as it contains only one class after filtering.")
            tasks_completed += len(models_to_run)
            continue
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
        for model_name, train_func in models_to_run.items():
            start_time = time.time()
            try:
                metrics = train_func(X_train.copy(), y_train.copy(), X_test.copy(), y_test.copy())
                if metrics:
                    report = metrics.pop('Test_Classification_Report', {})
                    metrics.update(_flatten_classification_report(report))
                    result_row = {"FeatureSelectionMethod": file_info["method"], "Thresholds": file_info["thresholds"],
                                  "NumSelectedFeatures": file_info["num_features"], "ML_Algorithm": model_name, **metrics}
                    all_results.append(result_row)
                else:
                    # e.g. the MLP when TensorFlow is unavailable; no result row is recorded.
                    print(f"  Skipped {model_name}: no metrics returned.")
            except Exception as e:
                print(f"  ❌ ERROR running {model_name} on {file_info['path']}: {e}")
            finally:
                # The task counter is advanced here only, so every model counts
                # exactly once whether it succeeded, was skipped, or failed.
                tasks_completed += 1
                elapsed_time = time.time() - overall_start_time
                avg_time_per_task = elapsed_time / tasks_completed if tasks_completed > 0 else 0
                remaining_tasks = total_tasks - tasks_completed
                eta_seconds = remaining_tasks * avg_time_per_task
                eta_minutes, eta_sec_rem = divmod(eta_seconds, 60)
                eta_str = f"{int(eta_minutes)}m {int(eta_sec_rem)}s"
                model_time = time.time() - start_time
                print(f"  [{tasks_completed}/{total_tasks}] Finished {model_name} in {model_time:.2f}s. | ETA: {eta_str}")

    print("\nSTEP 4: SAVING FINAL RESULTS")
    print("-" * 60)
    if all_results:
        results_df = pd.DataFrame(all_results)
        cols_order = ["FeatureSelectionMethod", "Thresholds", "NumSelectedFeatures", "ML_Algorithm",
                      "Train_Accuracy", "Test_Accuracy", "Train_LogLoss", "Test_LogLoss",
                      "Train_ROC_AUC", "Test_ROC_AUC", "Test_Confusion_Matrix"]
        report_cols = sorted([c for c in results_df.columns if c.startswith('Test_Report_')])
        final_cols = cols_order + report_cols
        results_df = results_df.reindex(columns=final_cols).fillna('')
        output_excel_path = "data/ML_Evaluation_Results.xlsx"
        results_df.to_excel(output_excel_path, index=False)
        print(f"\n✅ All processing complete! Results saved to '{output_excel_path}'.")
    else:
        print("\nNo models were successfully trained. No results file generated.")

# Entry point: run the whole pipeline when this cell (or the exported script) is
# executed directly; nothing runs if the code is imported as a module.
if __name__ == '__main__':
    main()

      COMPREHENSIVE FEATURE SELECTION & ML PIPELINE

STEP 1: UPLOAD YOUR DATASET
------------------------------------------------------------
Please use the button below to upload your CSV file.

✅ Successfully uploaded 'data/dropoutgraduate.csv'.
--- Running initial data preprocessing ---
Preprocessing complete. Shape changed from (3630, 37) to (3630, 37).

STEP 2: SETTING UP DATASETS FOR TRAINING
------------------------------------------------------------

--- Baseline Added: Using all preprocessed features ---

--- Running ANOVA (eta2>0.01, omega2>0.01) ---
Result: Selected 17 features. Saved to feature_selected_csvs/FS_ANOVA_eta2_0.01_omega2_0.01.csv

--- Running ANOVA (eta2>0.04, omega2>0.04) ---
Result: Selected 10 features. Saved to feature_selected_csvs/FS_ANOVA_eta2_0.04_omega2_0.04.csv

--- Running ANOVA (eta2>0.06, omega2>0.06) ---
Result: Selected 9 features. Saved to feature_selected_csvs/FS_ANOVA_eta2_0.06_omega2_0.06.csv

--- Running ANOVA (eta2>0.08, omega2>0.08) ---
R