# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import time
from shap_select import shap_select
from skfeature.function.information_theoretical_based import MRMR
from shap_selection import feature_selection

# One seed drives NumPy's global RNG and is reused in the XGBoost parameters below
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Shared XGBoost settings so every model in this file is configured identically
XGB_PARAMS = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    verbosity=0,
    seed=RANDOM_SEED,
    nthread=1,
)

# Shared XGBoost training routine used by every experiment
def train_xgboost(X_train, y_train):
    """Fit a booster on the given features and labels.

    The data is wrapped in an ``xgb.DMatrix`` and trained for 100 boosting
    rounds with the module-level ``XGB_PARAMS``.

    Returns:
        The object returned by ``xgb.train``.
    """
    training_matrix = xgb.DMatrix(X_train, label=y_train)
    return xgb.train(XGB_PARAMS, training_matrix, num_boost_round=100)

def predict_xgboost(xgb_model, X_val):
    """Return hard 0/1 predictions for ``X_val``.

    The model's raw ``predict`` output is compared against 0.5: values
    strictly greater than 0.5 become 1, everything else becomes 0.
    """
    scores = xgb_model.predict(xgb.DMatrix(X_val))
    labels = scores > 0.5
    return labels.astype(int)

# HISEL feature selection using MRMR
def hisel_feature_selection(X_train, y_train, num_features):
    """Pick ``num_features`` column names from ``X_train`` with skfeature's MRMR.

    Args:
        X_train: DataFrame of candidate features (its ``.values`` and
            ``.columns`` are used).
        y_train: Target object exposing ``.values`` (e.g. a pandas Series).
        num_features: Number of features requested from MRMR.

    Returns:
        List of the selected column names, in the order MRMR returned
        their positional indices.
    """
    mrmr_output = MRMR.mrmr(X_train.values, y_train.values, n_selected_features=num_features)
    # Some scikit-feature releases return only the index array, others a
    # tuple (indices, J_CMI, MIy). Indexing the columns with the whole tuple
    # would fail, so take the indices in either case.
    selected_idx = mrmr_output[0] if isinstance(mrmr_output, tuple) else mrmr_output
    return X_train.columns[selected_idx].tolist()

# Retrain on a feature subset and build one row of the results table
def _evaluate_feature_subset(method, features, runtime, X_train, y_train, X_test, y_test):
    """Train XGBoost on ``features`` only and score it on the test split.

    Args:
        method: Label stored under 'Method'.
        features: Column names to keep; used to index both frames.
        runtime: Seconds spent on feature selection, stored unchanged.
        X_train, y_train: Split used to fit the model.
        X_test, y_test: Split used to compute accuracy and F1.

    Returns:
        Dict with keys 'Method', 'Selected Features', 'Accuracy',
        'F1 Score' and 'Runtime (s)'.
    """
    model = train_xgboost(X_train[features], y_train)
    y_pred = predict_xgboost(model, X_test[features])
    return {
        'Method': method,
        'Selected Features': len(features),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'Runtime (s)': runtime
    }


# Run experiments with different feature selection methods and shap-select p-values
def run_experiments():
    """Benchmark several feature-selection methods with a shared XGBoost model.

    Reads 'creditcard.csv' (target column 'Class'), splits it 60/20/20 into
    train/validation/test, and for each method retrains XGBoost on the
    selected columns and scores it on the test split. The methods are:
    no selection, shap-select at several p-value thresholds, SHAP Selection,
    RFE, HISEL (MRMR) and Boruta.

    Side effects:
        Prints progress and the final table, and writes
        'shapely_select_p_value_benchmark.png'.

    Returns:
        DataFrame with one row per method and the columns 'Method',
        'Selected Features', 'Accuracy', 'F1 Score', 'Runtime (s)'.
        For the baseline, the runtime covers training plus prediction;
        for every other method it covers feature selection only.
    """
    print("Loading dataset...")
    df = pd.read_csv('creditcard.csv')
    X = df.drop(columns=['Class'])
    y = df['Class']

    # Perform a 60-20-20 split for train, validation, and test sets
    # (0.25 of the remaining 80% is 20% of the full data).
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=RANDOM_SEED)

    # Splits every subset evaluation trains and scores on.
    eval_splits = (X_train, y_train, X_test, y_test)
    # Feature budget shared by the methods that need a fixed count.
    num_selected = 15

    results = []
    shapely_select_results = []

    # No Feature Selection
    print("\n--- No Feature Selection ---")
    start_time = time.time()
    # Kept inline: this full-feature model is reused below by the SHAP-based selectors.
    xgb_model = train_xgboost(X_train, y_train)
    y_pred = predict_xgboost(xgb_model, X_test)
    runtime = time.time() - start_time
    print(f"No Feature Selection completed in {runtime:.2f} seconds with {len(X_train.columns)} features.")
    results.append({
        'Method': 'No Feature Selection',
        'Selected Features': len(X_train.columns),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'Runtime (s)': runtime
    })

    # Shapely Select with different p-values
    for p_value in [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1]:
        print(f"\n--- Shapely Select with p-value={p_value} ---")
        start_time = time.time()
        shap_features, _ = shap_select(xgb_model, X_val, y_val, task="binary", alpha=1e-6, threshold=p_value, return_extended_data=True)
        selected_features = shap_features[shap_features["selected"] == 1]["feature name"].tolist()
        runtime = time.time() - start_time
        print(f"Shapely Select with p-value {p_value} completed in {runtime:.2f} seconds with {len(selected_features)} selected features: {selected_features}")

        # round() rather than int(): p_value * 100 can land just below the
        # intended integer in floating point, and int() would truncate it.
        shap_row = _evaluate_feature_subset(
            f'Shap-Select (p={round(p_value * 100)}%)', selected_features, runtime, *eval_splits
        )
        shapely_select_results.append({'p_value': p_value, 'F1 Score': shap_row['F1 Score']})
        results.append(shap_row)

    # Plot F1 Score vs p-value for Shapely Select
    plt.figure(figsize=(8, 6))
    p_values = [entry['p_value'] for entry in shapely_select_results]
    f1_scores = [entry['F1 Score'] for entry in shapely_select_results]
    plt.scatter(p_values, f1_scores, color='blue')
    plt.plot(p_values, f1_scores, linestyle='-', color='blue')
    plt.xlabel("p-value Threshold")
    plt.ylabel("F1 Score")
    plt.title("Shap-Select: F1 Score vs p-value Threshold")
    plt.savefig("shapely_select_p_value_benchmark.png")
    print("Saved plot for Shapely Select p-value benchmarking.")

    # SHAP Selection
    print("\n--- SHAP Selection ---")
    start_time = time.time()
    selected_shap_selection, _ = feature_selection.shap_select(xgb_model, X_train, X_val, X_train.columns, agnostic=False)
    # Keep only the first entries so the feature count matches the other fixed-budget methods.
    selected_shap_selection = selected_shap_selection[:num_selected]
    shap_selection_time = time.time() - start_time
    print(f"SHAP Selection completed in {shap_selection_time:.2f} seconds with {len(selected_shap_selection)} selected features: {selected_shap_selection}")
    results.append(_evaluate_feature_subset('SHAP Selection', selected_shap_selection, shap_selection_time, *eval_splits))

    # RFE
    print("\n--- RFE ---")
    rfe_start_time = time.time()
    rfe = RFE(xgb.XGBClassifier(**XGB_PARAMS, use_label_encoder=False), n_features_to_select=num_selected)
    rfe.fit(X_train, y_train)
    # Plain list, consistent with how every other method reports its selection.
    selected_rfe = X_train.columns[rfe.support_].tolist()
    rfe_time = time.time() - rfe_start_time
    print(f"RFE completed in {rfe_time:.2f} seconds with {len(selected_rfe)} selected features: {selected_rfe}")
    results.append(_evaluate_feature_subset('RFE', selected_rfe, rfe_time, *eval_splits))

    # HISEL
    print("\n--- HISEL ---")
    hisel_start_time = time.time()
    selected_hisel = hisel_feature_selection(X_train, y_train, num_features=num_selected)
    hisel_time = time.time() - hisel_start_time
    print(f"HISEL completed in {hisel_time:.2f} seconds with {len(selected_hisel)} selected features: {selected_hisel}")
    results.append(_evaluate_feature_subset('HISEL', selected_hisel, hisel_time, *eval_splits))

    # Boruta
    print("\n--- Boruta ---")
    boruta_start_time = time.time()
    boruta_estimator = xgb.XGBClassifier(**XGB_PARAMS, use_label_encoder=False)
    boruta_selector = BorutaPy(boruta_estimator, n_estimators=100, random_state=RANDOM_SEED)
    boruta_selector.fit(X_train.values, y_train.values)
    selected_boruta = X_train.columns[boruta_selector.support_].tolist()
    boruta_time = time.time() - boruta_start_time
    print(f"Boruta completed in {boruta_time:.2f} seconds with {len(selected_boruta)} selected features: {selected_boruta}")
    results.append(_evaluate_feature_subset('Boruta', selected_boruta, boruta_time, *eval_splits))

    # Sanity check as a subset test, not equality: the HISEL selection is
    # deliberately smaller than the full column set, so an equality check
    # would abort the run before any results are reported.
    assert set(selected_hisel) <= set(X_train.columns), "Feature sets differ!"

    results_df = pd.DataFrame(results)
    print("\n--- Experiment Results ---")
    print(results_df)
    return results_df

   
# Run the whole benchmark as soon as this cell/script executes; the returned
# DataFrame is not assigned to a name here.
run_experiments()