In [3]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import time
from shap_select import shap_select
from skfeature.function.information_theoretical_based import MRMR
from shap_selection import feature_selection

# Define common XGBoost model
def train_xgboost(X_train, y_train):
    """Fit the shared XGBoost booster used by every experiment.

    Args:
        X_train: training features, passed straight to ``xgb.DMatrix``.
        y_train: training labels, attached to the matrix as ``label``.

    Returns:
        The object returned by ``xgb.train`` after 100 boosting rounds with a
        ``binary:logistic`` objective and ``logloss`` evaluation metric.
    """
    training_matrix = xgb.DMatrix(X_train, label=y_train)
    booster_config = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "verbosity": 0,  # keep the library quiet during training
    }
    return xgb.train(booster_config, training_matrix, num_boost_round=100)

def predict_xgboost(xgb_model, X_val, threshold=0.5):
    """Turn the booster's predicted scores into hard 0/1 labels.

    Args:
        xgb_model: trained model exposing ``predict(DMatrix)``; here the object
            returned by ``train_xgboost``.
        X_val: features to score, wrapped in an ``xgb.DMatrix`` before predicting.
        threshold: scores strictly greater than this value are labelled 1.
            Defaults to 0.5, which reproduces the previous hard-coded cut-off.

    Returns:
        Integer array of 0/1 predictions, one per row of ``X_val``.
    """
    dval = xgb.DMatrix(X_val)
    # Strict ">" is kept so a score exactly at the threshold maps to class 0.
    return (xgb_model.predict(dval) > threshold).astype(int)

# HISEL feature selection using MRMR
def hisel_feature_selection(X_train, y_train, num_features):
    """Pick ``num_features`` columns of ``X_train`` using skfeature's MRMR.

    NOTE(review): although this is labelled "HISEL" throughout the notebook,
    the selection is delegated to ``MRMR.mrmr``; confirm that MRMR is the
    intended stand-in for HISEL.

    Args:
        X_train: pandas DataFrame of training features (its ``.values`` and
            ``.columns`` are used).
        y_train: pandas Series of training labels (its ``.values`` is used).
        num_features: number of features requested from MRMR.

    Returns:
        List of selected column names, in the order MRMR returned them.
    """
    mrmr_output = MRMR.mrmr(X_train.values, y_train.values, n_selected_features=num_features)
    # Depending on the installed skfeature release, mrmr() returns either the
    # index array alone or a tuple whose first element is the index array
    # (followed by per-feature scores). Indexing columns with the whole tuple
    # would fail, so keep only the indices.
    if isinstance(mrmr_output, tuple):
        mrmr_output = mrmr_output[0]
    return X_train.columns[mrmr_output].tolist()

# Run experiments
def _score_feature_subset(X_train, y_train, X_val, y_val, features):
    """Retrain the common XGBoost model on ``features`` and score it on the validation split.

    Returns:
        ``(accuracy, f1)`` computed on ``X_val[features]``. If ``features`` is
        empty there is nothing to train on, so ``(nan, nan)`` is returned
        instead of letting one method abort the whole comparison.
    """
    features = list(features)
    if not features:
        return float("nan"), float("nan")
    model = train_xgboost(X_train[features], y_train)
    y_pred = predict_xgboost(model, X_val[features])
    return accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)


def run_experiments(data_path='creditcard.csv', num_features=15, test_size=0.3, random_state=42):
    """Compare several feature-selection methods on the credit-card dataset.

    For each method the function selects a feature subset, retrains the common
    XGBoost model on that subset and records accuracy and F1 on the validation
    split, together with the wall-clock time of the selection step.

    Caveats visible in the code (NOTE(review): confirm these are acceptable):
      * "Shapely Select" and "SHAP Selection" are handed the validation split,
        which is also the split every model is scored on, so their scores may
        be optimistic relative to the other methods.
      * The runtime of "No Feature Selection" covers training plus prediction,
        whereas for every other method it covers only the selection step.

    Args:
        data_path: CSV file to load; it must contain a ``Class`` target column.
        num_features: number of features kept by SHAP Selection, RFE and HISEL.
        test_size: fraction of rows held out as the validation split.
        random_state: seed passed to ``train_test_split``.

    Returns:
        pandas DataFrame with one row per method and the columns ``Method``,
        ``Selected Features``, ``Accuracy``, ``F1 Score`` and ``Runtime (s)``.
    """
    print("Loading dataset...")
    df = pd.read_csv(data_path)
    X = df.drop(columns=['Class'])
    y = df['Class']
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # One (method, n_features, accuracy, f1, runtime) tuple per method, in run order.
    rows = []

    def record(method, features, runtime):
        """Score ``features`` with the common model and append the result row."""
        accuracy, f1 = _score_feature_subset(X_train, y_train, X_val, y_val, features)
        rows.append((method, len(features), accuracy, f1, runtime))

    # No Feature Selection (baseline): the timed section covers fit + predict,
    # and the fitted model is reused below by the two SHAP-based selectors.
    print("\n--- No Feature Selection ---")
    start_time = time.time()
    xgb_model = train_xgboost(X_train, y_train)
    y_pred = predict_xgboost(xgb_model, X_val)
    no_fs_time = time.time() - start_time
    rows.append((
        'No Feature Selection',
        len(X_train.columns),
        accuracy_score(y_val, y_pred),
        f1_score(y_val, y_pred),
        no_fs_time,
    ))
    print(f"No Feature Selection completed in {no_fs_time:.2f} seconds with {len(X_train.columns)} features.")

    # Shapely Select
    print("\n--- Shapely Select ---")
    start_time = time.time()
    shap_features, _ = shap_select(xgb_model, X_val, y_val, task="binary", alpha=1e-6, threshold=0.05, return_extended_data=True)
    selected_shapely = shap_features[shap_features["selected"] == 1]["feature name"].tolist()
    shap_time = time.time() - start_time
    print(f"Shapely Select completed in {shap_time:.2f} seconds with {len(selected_shapely)} selected features.")
    record('Shapely Select', selected_shapely, shap_time)

    # SHAP Selection: keep only the first `num_features` entries of the returned list.
    print("\n--- SHAP Selection ---")
    start_time = time.time()
    selected_shap_selection, _ = feature_selection.shap_select(xgb_model, X_train, X_val, X_train.columns, agnostic=False)
    selected_shap_selection = list(selected_shap_selection[:num_features])
    shap_selection_time = time.time() - start_time
    print(f"SHAP Selection completed in {shap_selection_time:.2f} seconds with {len(selected_shap_selection)} selected features.")
    record('SHAP Selection', selected_shap_selection, shap_selection_time)

    # RFE
    print("\n--- RFE ---")
    rfe_start_time = time.time()
    rfe = RFE(xgb.XGBClassifier(use_label_encoder=False), n_features_to_select=num_features)
    rfe.fit(X_train, y_train)
    # .tolist() keeps the selection a plain list like every other method.
    selected_rfe = X_train.columns[rfe.support_].tolist()
    rfe_time = time.time() - rfe_start_time
    print(f"RFE completed in {rfe_time:.2f} seconds with {len(selected_rfe)} selected features.")
    record('RFE', selected_rfe, rfe_time)

    # HISEL
    print("\n--- HISEL ---")
    hisel_start_time = time.time()
    selected_hisel = hisel_feature_selection(X_train, y_train, num_features=num_features)
    hisel_time = time.time() - hisel_start_time
    print(f"HISEL completed in {hisel_time:.2f} seconds with {len(selected_hisel)} selected features.")
    record('HISEL', selected_hisel, hisel_time)

    # Boruta (wrapped around an XGBoost classifier, fed plain NumPy arrays)
    print("\n--- Boruta ---")
    boruta_start_time = time.time()
    boruta_estimator = xgb.XGBClassifier(use_label_encoder=False)
    boruta_selector = BorutaPy(boruta_estimator, n_estimators='auto', random_state=1)
    boruta_selector.fit(X_train.values, y_train.values)
    selected_boruta = X_train.columns[boruta_selector.support_].tolist()
    boruta_time = time.time() - boruta_start_time
    print(f"Boruta completed in {boruta_time:.2f} seconds with {len(selected_boruta)} selected features.")
    record('Boruta', selected_boruta, boruta_time)

    # Results Summary
    results_df = pd.DataFrame(
        rows,
        columns=['Method', 'Selected Features', 'Accuracy', 'F1 Score', 'Runtime (s)'],
    )
    print("\n--- Experiment Results ---")
    print(results_df)
    return results_df

# Run the full comparison: run_experiments() prints the summary table and
# also returns it as a DataFrame.
run_experiments()


Loading dataset...

--- No Feature Selection ---
No Feature Selection completed in 0.67 seconds with 30 features.

--- Shapely Select with p-value=0.01 ---


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or swi

Shapely Select with p-value 0.01 completed in 41.28 seconds with 6 selected features: ['V14', 'V17', 'V4', 'V8', 'V10', 'V26']

--- Shapely Select with p-value=0.02 ---


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or swi

Shapely Select with p-value 0.02 completed in 29.96 seconds with 6 selected features: ['V14', 'V17', 'V4', 'V8', 'V10', 'V26']

--- Shapely Select with p-value=0.03 ---


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers
Try increasing solver accuracy or number of iterations, decreasing alpha, or swi

In [1]:
!pip3 install shap-select==0.1.0

Collecting shap-select==0.1.0
  Downloading shap-select-0.1.0.tar.gz (11 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: shap-select
  Building wheel for shap-select (pyproject.toml) ... [?25ldone
[?25h  Created wheel for shap-select: filename=shap_select-0.1.0-py3-none-any.whl size=10499 sha256=1d879e1b717c1c63c7c95bb7a605965618199e393a4291bc6f841172e9923bb4
  Stored in directory: /Users/baran.koseoglu/Library/Caches/pip/wheels/db/8e/89/b9036b114c0d8320ba07cd4296d266bf8f81fd8bd8b3d21d23
Successfully built shap-select
Installing collected packages: shap-select
Successfully installed shap-select-0.1.0


In [1]:
!pip3 install shap-selection


Collecting shap-selection
  Downloading shap_selection-0.1.6-py3-none-any.whl.metadata (2.7 kB)
Downloading shap_selection-0.1.6-py3-none-any.whl (4.1 kB)
Installing collected packages: shap-selection
Successfully installed shap-selection-0.1.6
