In [None]:
import os
import pandas as pd
import argparse
import glob
import time
import re  # To help extract version numbers from filenames
from sklearn.model_selection import train_test_split

# Generate a timestamp for this specific model run
timestamp = int(time.time())

# Get the current working directory
current_dir = os.getcwd()

# Define the output directory
output_dir = os.path.join(current_dir, 'Train_Result_RFE')

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Create a subdirectory for the current model run
run_output_dir = os.path.join(output_dir, f'Model_Run_{timestamp}')
os.makedirs(run_output_dir, exist_ok=True)

# Find CSV or Excel file in the current directory
data_file = None
for ext in ['csv', 'xlsx', 'xls']:
    files = glob.glob(os.path.join(current_dir, f'*.{ext}'))
    if files:
        data_file = files[0]  # Take the first file found
        break

# Raise an error if no data file is found
if not data_file:
    raise FileNotFoundError("No CSV or Excel file found in the current directory.")

# Read the data file
if data_file.endswith('.csv'):
    data = pd.read_csv(data_file)
elif data_file.endswith(('.xlsx', '.xls')):
    data = pd.read_excel(data_file)
else:
    raise ValueError("Unsupported file format. Only CSV or Excel files are supported.")

# Display basic information about the data
print(f"Data loaded successfully with shape: {data.shape}")
print(data.head())

# Handle missing values
if data.isnull().values.any():
    print("Warning: There are missing values in the dataset. Dropping missing values.")
    data = data.dropna()



# Split the entire DataFrame
cur_test_size = 0.3
random_state = 42  # Adjust as needed

# Stratify by 'group_1' to maintain label proportions
train_data, val_data = train_test_split(
    data, test_size=cur_test_size, stratify=data['group_1'], random_state=random_state
)

# Add 'Set' column to indicate the dataset
train_data = train_data.copy()
train_data['Set'] = 'Discovery'

val_data = val_data.copy()
val_data['Set'] = 'Validation'

# Rearrange columns so 'Set' and 'label' are first
cols = ['Set'] + [col for col in train_data.columns if col not in ['Set']]
train_data = train_data[cols]
val_data = val_data[cols]
# Move 'Set' column to the front
cols = ['Set'] + [col for col in train_data.columns if col != 'Set']
train_data = train_data[cols]
val_data = val_data[cols]

used_data =pd.concat([train_data,val_data])
used_data = used_data.sort_values(by='study_no')
# Define the file path
combined_file_path = os.path.join(run_output_dir, 'used_data.xlsx')

# Save to Excel
used_data.to_excel(combined_file_path, index=True)

print(f"Combined data saved to {combined_file_path}")

feature_columns = data.columns[3:]  # Adjust as needed
label_column = 'group_1'

# Extract features and labels for training set
X_train = train_data[feature_columns].astype(float)
y_train = train_data[label_column].astype(float)

# Extract features and labels for validation set
X_val = val_data[feature_columns].astype(float)
y_val = val_data[label_column].astype(float)

Data loaded successfully with shape: (476, 133)
   study_no   group  group_1  Abiotrophia  Acetatifactor  \
0  H1700097  Cancer        1          0.0            0.0   
1  H1700105  Cancer        1          0.0            0.0   
2  H1700084  Cancer        1          0.0            0.0   
3  H1700109  Cancer        1          0.0            0.0   
4  H1700098  Cancer        1          0.0            0.0   

   Acholeplasmataceae_uc  Actinobacillus  Actinobaculum  Actinomyces  \
0                    0.0             0.0            0.0          3.7   
1                    0.0             0.0            0.0          0.7   
2                    0.0             0.0            0.0          1.3   
3                    0.0             0.0            0.1          0.8   
4                    0.0             0.0            0.0          2.5   

   Aggregatibacter  ...  Xanthomonas  Cloacibacterium  Legionella  Variovorax  \
0              0.0  ...          0.0              0.0         0.0         0.0

In [2]:
train_data

Unnamed: 0,Set,study_no,group,group_1,Abiotrophia,Acetatifactor,Acholeplasmataceae_uc,Actinobacillus,Actinobaculum,Actinomyces,...,Xanthomonas,Cloacibacterium,Legionella,Variovorax,Bergeriella,Ethanoligenens,Lentimicrobiaceae_uc,Moraxella,Mycoplasma_g13,Streptobacillus
350,Discovery,H1900608,Control,0,0.00,0.0,0.0,0.00,0.02,1.30,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
64,Discovery,H1700032,Cancer,1,0.01,0.0,0.0,0.00,0.08,3.70,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
412,Discovery,H1900723,Control,0,0.00,0.0,0.0,0.11,0.00,1.01,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
166,Discovery,H1900166,Control,0,0.01,0.0,0.0,0.00,0.00,0.82,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
128,Discovery,H1900078,Control,0,0.00,0.0,0.0,0.00,0.02,1.39,...,0.0,0.0,0.0,0.0,0.03,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,Discovery,H1900354,Control,0,0.00,0.0,0.0,0.00,0.00,2.56,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
63,Discovery,H1700031,Cancer,1,0.00,0.0,0.0,0.00,0.00,3.86,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
267,Discovery,H1900400,Control,0,0.00,0.0,0.0,0.00,0.02,1.78,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
424,Discovery,H1900747,Control,0,0.00,0.0,0.0,0.00,0.33,2.49,...,0.0,0.0,0.0,0.0,0.00,0.01,0.0,0.0,0.0,0.0


### Check the multicolinearity

In [3]:
feature_columns

Index(['Abiotrophia', 'Acetatifactor', 'Acholeplasmataceae_uc',
       'Actinobacillus', 'Actinobaculum', 'Actinomyces', 'Aggregatibacter',
       'Akkermansia', 'Alistipes', 'Alloprevotella',
       ...
       'Xanthomonas', 'Cloacibacterium', 'Legionella', 'Variovorax',
       'Bergeriella', 'Ethanoligenens', 'Lentimicrobiaceae_uc', 'Moraxella',
       'Mycoplasma_g13', 'Streptobacillus'],
      dtype='object', length=130)

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
features = data[feature_columns]  # replace 'data' with your actual DataFrame if necessary

vif_data = pd.DataFrame()
vif_data['feature'] = features.columns
vif_data['VIF'] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]

vif_data_mul = vif_data[vif_data['VIF']>10]
output_file = os.path.join(output_dir, "possible_multicollinearity_features.xlsx")

# Save the DataFrame to an Excel file
vif_data_mul.to_excel(output_file, index=False)

print(f"VIF data saved to {output_file}")


VIF data saved to /home/user/Desktop/PC_Analysis/Train_Result_RFE/possible_multicollinearity_features.xlsx


### Choose the Discovery, Validation Set size

### 모델 선택 및 모델 deploying

In [5]:
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import time
import shap
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from catboost import CatBoostClassifier
import lightgbm as lgb

# 사용성 증가를 위해 widget추가
import ipywidgets as widgets
from IPython.display import display


# Function to calculate F-Score
def f_score(y_test, preds, beta=1):
    tp = np.sum((preds == 1) & (y_test == 1))
    fp = np.sum((preds == 1) & (y_test == 0))
    fn = np.sum((preds == 0) & (y_test == 1))

    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    fscore = (1 + beta ** 2) * (precision * recall) / ((beta ** 2 * precision) + recall + 1e-8)

    return fscore

# Argument parser definition for model parameters
parser = argparse.ArgumentParser(description='Train and evaluate a binary classification model.')
parser.add_argument('--model', type=str, default='xgb', choices=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
                    help='Choose a model: xgb (XGBoost), rf (Random Forest), catboost, gbm, lgbm (LightGBM).')
parser.add_argument('--random_state', type=int, default=42, help='Random state for reproducibility.')
parser.add_argument('--n_estimators', type=int, default=100, help='Number of trees/estimators for the model.')
parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate for boosting models.')
parser.add_argument('--max_depth', type=int, default=6, help='Maximum depth for tree-based models.')
args = parser.parse_args([])

# Function to choose and run the model
def run_model(model_choice):
    # Create output directory for current model run
    run_output_dir = os.path.join('Train_Result_RFE', f'Model_Run_{model_choice}_{int(time.time())}')
    os.makedirs(run_output_dir, exist_ok=True)

    # Initialize model based on the user's choice
    if model_choice == 'xgb':
        model = xgb.XGBClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, max_depth=args.max_depth, random_state=args.random_state,tree_method='gpu_hist')
    elif model_choice == 'rf':
        model = RandomForestClassifier(n_estimators=args.n_estimators, max_depth=args.max_depth, random_state=args.random_state)
    elif model_choice == 'catboost':
        model = CatBoostClassifier(iterations=args.n_estimators, depth=args.max_depth, learning_rate=args.learning_rate, random_state=args.random_state, verbose=0)
    elif model_choice == 'gbm':
        model = GradientBoostingClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, max_depth=args.max_depth, random_state=args.random_state)
    elif model_choice == 'lgbm':
        model = lgb.LGBMClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, max_depth=args.max_depth, random_state=args.random_state)
    else:
        raise ValueError(f"Unsupported model type: {model_choice}")

    # Fit the model
    model.fit(X_train, y_train)

    # Predictions and evaluations
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f_score(y_val, y_pred)

    # Print results
    print(f"Model: {model_choice}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

    # Save results
    result_path = os.path.join(run_output_dir, f'Results_{model_choice}.txt')
    with open(result_path, 'w') as f:
        f.write(f"Model: {model_choice}\n")
        f.write(f"Accuracy: {accuracy}\n")
        f.write(f"F1 Score: {f1}\n")

    print(f"Results saved to {result_path}")

    # Ensure 'study_no' column exists in data and assign Discovery/Validation
    train_indices = X_train.index
    val_indices = X_val.index

    if 'study_no' in data.columns:
        # Assign 'Discovery' to rows in data where study_no is in X_train
        data.loc[data.index.isin(train_indices), 'group_2'] = 'Discovery'

        # Assign 'Validation' to rows in data where study_no is in X_val
        data.loc[data.index.isin(val_indices), 'group_2'] = 'Validation'

        # Save the modified data to an Excel file
        modified_data_path = os.path.join(run_output_dir, 'Modified_Data_File.xlsx')
        data.to_excel(modified_data_path, index=False)
        print(f"Modified data file saved to {modified_data_path}")
    else:
        print("'study_no' column not found in data.")

    return model

# Model selection widget and execution button
model_selector = widgets.Dropdown(
    options=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
    value='xgb',
    description='Choose Model:',
)

run_button = widgets.Button(description="Run Model")

def on_button_click(b):
    run_model(model_selector.value)

run_button.on_click(on_button_click)

# Display widgets
display(model_selector)
display(run_button)

Dropdown(description='Choose Model:', options=('xgb', 'rf', 'catboost', 'gbm', 'lgbm'), value='xgb')

Button(description='Run Model', style=ButtonStyle())

RFE feature selection

In [6]:
from sklearn.feature_selection import RFE
import optuna

# FNR evaluation function
def FNR_eval(y_pred_proba, y_true):
    # Convert predicted probabilities to binary labels (threshold of 0.5)
    y_pred_label = (y_pred_proba >= 0.5).astype(int)
    
    # Compute the confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_label).ravel()
    
    # Calculate FNR (False Negative Rate)
    fnr = fn / (fn + tp)  # Correct formula for FNR is FN / (FN + TP)
    
    return fnr



# Create output directory for current model run
run_output_dir = os.path.join('Train_Result_RFE', f'Model_Run_{int(time.time())}')
os.makedirs(run_output_dir, exist_ok=True)


# Set up output widget to display results
output_widget = widgets.Output()


# Function to choose and run the model
def run_model(model_choice):
    # Initialize model based on the user's choice
    run_output_dir = os.path.join('Train_Result_RFE', f'Model_Run_{model_choice}_{int(time.time())}')
    os.makedirs(run_output_dir, exist_ok=True)
    # Initialize model based on the user's choice
    if model_choice == 'xgb':
        model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, tree_method = 'hist',device = "cuda")
    elif model_choice == 'rf':
        model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
    elif model_choice == 'catboost':
        model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_state=42, verbose=0)
    elif model_choice == 'gbm':
        model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
    elif model_choice == 'lgbm':
        model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
    else:
        raise ValueError(f"Unsupported model type: {model_choice}")

    # Apply RFE to the model to select top 10 features
    rfe = RFE(model, n_features_to_select=20)
    rfe.fit(X_train, y_train)

    # Predictions and evaluations
    train_pred = rfe.predict(X_train)
    val_pred = rfe.predict(X_val)
    y_pred_proba = rfe.predict_proba(X_val)[:, 1]  # Get predicted probabilities for class 1
    accuracy = accuracy_score(y_val, val_pred)
    fnr = FNR_eval(y_pred_proba, y_val)
    print(f"Model: {model_choice}")
    print(f"Accuracy: {accuracy}")

    # Define the result directory and result file paths
    result_file = os.path.join(run_output_dir, f'Results_{model_choice}.txt')

    # Save results to text file
    with open(result_file, 'w') as f:
        f.write(f"Model: {model_choice}\n")
        f.write(f"Accuracy: {accuracy}\n")
        f.write(f"FNR: {fnr}\n")

    # 혼동 행렬 계산
    conf_matrix = confusion_matrix(y_val, val_pred)

    # 혼동 행렬 시각화
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Control', 'Cancer'], yticklabels=['Control', 'Cancer'], annot_kws={"size": 14})
    plt.title('Confusion Matrix on Validation Set ( Split)', fontsize=15)
    plt.xlabel('Predicted Label', fontsize=13)
    plt.ylabel('True Label', fontsize=13)
    plt.tight_layout()
    plt.savefig(os.path.join(run_output_dir, f'confusion_matrix.png'))
    plt.close()



    # Save feature importance and evaluation
    important_features = X_train.columns[rfe.support_]
    print(f"Top Features Selected by RFE: {important_features}")

    # Save feature importance
    importance_df = pd.DataFrame({
        'Feature': important_features,
        'Importance': rfe.estimator_.feature_importances_
    })
    top_featuers = importance_df.sort_values(by='Importance', ascending=False).head(20)
    excel_file = os.path.join(run_output_dir, f'feature_importances_{model_choice}.xlsx')
    importance_df.to_excel(excel_file, index=False)
    # Plot feature importance
    colors = cm.rainbow(np.linspace(0, 1, len(top_featuers)))
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=top_featuers, palette=colors)
    plt.title('Top 20 Important Features Selected by RFE')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig(os.path.join(run_output_dir, 'feature_importance.png'))  # Save to result_dir
    plt.close()


    # SHAP explanation (on the selected features)
    explainer = shap.TreeExplainer(rfe.estimator_)
    shap_values = explainer.shap_values(X_val[important_features])
    if model_choice == 'rf':
        # For RandomForest, shap_values has shape (n_samples, n_features, n_classes)
        # We need to select shap_values for the positive class (e.g., class 1 for binary classification)
        # SHAP summary plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values[:,:,1], X_val[important_features], plot_type='dot', show=False)
        plt.tight_layout()
        plt.savefig(os.path.join(run_output_dir, 'shap_feature_importance.png'))  # Save to result_dir
        plt.close()
    else:
        # For other models (like XGB, LGBM, etc.), we can use shap_values directly
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_val[important_features], show=False)
        plt.tight_layout()
        plt.savefig(os.path.join(run_output_dir, 'shap_feature_importance.png'))  # Save to result_dir
        plt.close()

    
    # ROC curve and AUC score
    y_pred_proba = rfe.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    auc_score = roc_auc_score(y_val, y_pred_proba)

    # ROC curve plot
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc_score:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=16)
    plt.legend(loc='lower right', fontsize=12)
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(run_output_dir, 'roc_curve.png'))  # Save to result_dir
    plt.close()

    print(f"All results are saved in {run_output_dir}")

    return model

model_selector = widgets.Dropdown(
    options=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
    value='xgb',
    description='Choose Model:',
)

run_button = widgets.Button(description="Run Model")
# Label to indicate model status
status_label = widgets.Label(value='')


def on_button_click(b):
    # Update status to "Running..."
    status_label.value = "Running..."
    run_model(model_selector.value)
    # After running, update status to "Done!"
    status_label.value = "Done!"

run_button.on_click(on_button_click)

# Display widgets
display(model_selector)
display(run_button)
display(status_label)


Dropdown(description='Choose Model:', options=('xgb', 'rf', 'catboost', 'gbm', 'lgbm'), value='xgb')

Button(description='Run Model', style=ButtonStyle())

Label(value='')

### RFE with bootstraping

In [7]:
# !pip uninstall xgboost -y
# !pip install xgboost --upgrade --no-cache-dir

In [8]:
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import time
import shap
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
# 사용성 증가를 위해 widget추가
import ipywidgets as widgets
from IPython.display import display
from sklearn.utils import resample  # Import resample for bootstrap
from sklearn.feature_selection import RFE
# Aggregate selected features
from collections import Counter
from sklearn.base import clone

def run_model(model_choice, n_bootstraps=100):

    # Initialize model based on the user's choice
    if model_choice == 'xgb':
        model_init = xgb.XGBClassifier(
            n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, tree_method='hist',device='cuda'
        )
    elif model_choice == 'rf':
        model_init = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
    elif model_choice == 'catboost':
        model_init = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_state=42, verbose=0)
    elif model_choice == 'gbm':
        model_init = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
    elif model_choice == 'lgbm':
        model_init = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
    else:
        raise ValueError(f"Unsupported model type: {model_choice}")

    # Initialize lists to store performance metrics
    accuracies = []
    auc_scores = []
    selected_features_list = []
    feature_importances_dict = {}


    for i in range(n_bootstraps):
        # Resample the training data with replacement
        X_train_bootstrap, y_train_bootstrap = resample(X_train, y_train, replace=True)

        # Initialize a new model instance for each bootstrap
        model = clone(model_init)

        # Apply RFE on the bootstrapped data
        rfe = RFE(model, n_features_to_select=20)
        rfe.fit(X_train_bootstrap, y_train_bootstrap)

        # Get the selected features for this bootstrap
        selected_features = X_train.columns[rfe.support_]
        selected_features_list.append(selected_features)

        # Reduce the bootstrapped data to only the selected features
        X_train_bootstrap_selected = X_train_bootstrap[selected_features]
        X_val_selected = X_val[selected_features]  # Adjust validation set accordingly

        # Fit the model on the bootstrap sample with selected features
        model.fit(X_train_bootstrap_selected, y_train_bootstrap)

        # Predictions and evaluations on validation set
        y_pred = model.predict(X_val_selected)
        accuracies.append(accuracy_score(y_val, y_pred))

        y_pred_proba = model.predict_proba(X_val_selected)[:, 1]
        auc_scores.append(roc_auc_score(y_val, y_pred_proba))

        # Collect feature importances
        for feature, importance in zip(selected_features, model.feature_importances_):
            if feature in feature_importances_dict:
                feature_importances_dict[feature].append(importance)
            else:
                feature_importances_dict[feature] = [importance]

    # Aggregate selected features
    feature_frequency = Counter([feat for sublist in selected_features_list for feat in sublist])
    # Determine features selected most frequently
    most_common_features = feature_frequency.most_common()
    # Convert the list of tuples into a DataFrame
    feature_counts_df = pd.DataFrame(most_common_features, columns=['Feature', 'Count'])
    # Sort by 'Count' in descending order
    feature_counts_df = feature_counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

    # Save feature counts to Excel
    result_dir = 'results'  # Define the result directory
    os.makedirs(result_dir, exist_ok=True)  # Ensure the directory exists
    output_excel_file = os.path.join(result_dir, 'feature_selection_counts.xlsx')
    feature_counts_df.to_excel(output_excel_file, index=False)
    print(f"Feature selection counts saved to {output_excel_file}")

    # Calculate mean and confidence intervals
    accuracy_mean = np.mean(accuracies)
    accuracy_ci = np.percentile(accuracies, [2.5, 97.5])

    auc_mean = np.mean(auc_scores)
    auc_ci = np.percentile(auc_scores, [2.5, 97.5])

    # Calculate mean feature importances
    mean_feature_importances = {feature: np.mean(importances) for feature, importances in feature_importances_dict.items()}
    importance_df = pd.DataFrame(list(mean_feature_importances.items()), columns=['Feature', 'Importance'])
    # Sort by 'Importance' in descending order
    importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

    # Print results
    print(f"Model: {model_choice}")
    print(f"Accuracy: {accuracy_mean:.4f} (95% CI: {accuracy_ci[0]:.4f}, {accuracy_ci[1]:.4f})")
    print(f"AUC: {auc_mean:.4f} (95% CI: {auc_ci[0]:.4f}, {auc_ci[1]:.4f})")

    result_file = os.path.join(result_dir, f'Results_{model_choice}.txt')

    # Save results to text file
    with open(result_file, 'w') as f:
        f.write(f"Model: {model_choice}\n")
        f.write(f"Accuracy: {accuracy_mean:.4f} (95% CI: {accuracy_ci[0]:.4f}, {accuracy_ci[1]:.4f})\n")
        f.write(f"AUC: {auc_mean:.4f} (95% CI: {auc_ci[0]:.4f}, {auc_ci[1]:.4f})\n")

    # Plot feature importance
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df_sorted.head(20))
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, 'feature_importance.png'))
    plt.close()

    # Retrain model on the entire training data for final evaluation
    # Use the top 20 features based on importance
    top_features = importance_df_sorted['Feature'].tolist()[:20]
    X_train_selected = X_train[top_features]
    X_val_selected = X_val[top_features]
    model = clone(model_init)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_val_selected)

    # Compute confusion matrix
    conf_matrix = confusion_matrix(y_val, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Control', 'Cancer'],  # Adjust labels as needed
        yticklabels=['Control', 'Cancer'],
        annot_kws={"size": 14}
    )
    plt.title('Confusion Matrix on Validation Set', fontsize=15)
    plt.xlabel('Predicted Label', fontsize=13)
    plt.ylabel('True Label', fontsize=13)
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, 'confusion_matrix.png'))
    plt.close()

    # ROC curve and AUC score for the final model
    y_pred_proba = model.predict_proba(X_val_selected)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    auc_score = roc_auc_score(y_val, y_pred_proba)

    # ROC curve plot
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc_score:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=16)
    plt.legend(loc='lower right', fontsize=12)
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, 'roc_curve.png'))
    plt.close()

    # SHAP explanation (on the selected features)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_val_selected)

    # SHAP summary plot
    shap.summary_plot(shap_values, X_val_selected, show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, 'shap_feature_importance.png'))
    plt.close()

    print(f"All results are saved in {result_dir}")

    return model


import time
import ipywidgets as widgets
from IPython.display import display

# Number of bootstrap samples widget
bootstrap_selector = widgets.IntText(
    value=100,
    description='Bootstraps:',
    disabled=False
)

# Model selection widget
model_selector = widgets.Dropdown(
    options=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
    value='xgb',
    description='Choose Model:',
)

run_button = widgets.Button(description="Run Model")

# Label to indicate model status
status_label = widgets.Label(value='')


def on_button_click(b):
    # Update status to "Running..."
    status_label.value = "Running..."
    result = run_model(model_selector.value, n_bootstraps=bootstrap_selector.value)
    # After running, update status to "Done!"
    status_label.value = "Done!"
    
run_button.on_click(on_button_click)

# Display widgets
display(model_selector)
display(bootstrap_selector)
display(run_button)
display(status_label)


Dropdown(description='Choose Model:', options=('xgb', 'rf', 'catboost', 'gbm', 'lgbm'), value='xgb')

IntText(value=100, description='Bootstraps:')

Button(description='Run Model', style=ButtonStyle())

Label(value='')

### comparing discovery set and validation set

In [10]:
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import time
import shap
import os
import matplotlib.cm as cm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
from sklearn.feature_selection import RFE
from eli5.sklearn import PermutationImportance
from collections import Counter
import os
path = os.getcwd()
data = pd.read_excel(path+"/PC_normalized_PC92_HC384_130 genus_ML.xlsx")
data.set_index('study_no',inplace=True)
label_column = 'group_1'

# Define feature columns (adjust the column indices as needed)
feature_columns = data.columns[3:]  # Assuming features start from the 4th column

# Extract features and labels
features = data[feature_columns]
label = data[label_column]


# Convert features and label to appropriate data types
features = features.astype(float)
label = label.astype(int)  # Or float, depending on your label encoding


# FNR evaluation function
def FNR_eval(y_pred_proba, y_true):
    y_pred_label = (y_pred_proba >= 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_label).ravel()
    return fn / (fn + tp)  # FN / (FN + TP)

# Argument parser
parser = argparse.ArgumentParser(description='Train and evaluate a binary classification model.')
parser.add_argument('--model', type=str, default='xgb', choices=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
                    help='Choose a model: xgb (XGBoost), rf (Random Forest), catboost, gbm, lgbm (LightGBM).')
parser.add_argument('--random_state', type=int, default=42, help='Random state for reproducibility.')
parser.add_argument('--n_estimators', type=int, default=100, help='Number of trees/estimators for the model.')
parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate for boosting models.')
parser.add_argument('--max_depth', type=int, default=6, help='Maximum depth for tree-based models.')
args = parser.parse_args([])

# Initialize model
def initialize_model(model_choice,args):
    if model_choice == 'xgb':
        return xgb.XGBClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, 
                                 max_depth=args.max_depth, random_state=args.random_state,tree_method ='hist', device = 'cuda')
    elif model_choice == 'rf':
        return RandomForestClassifier(n_estimators=args.n_estimators, max_depth=args.max_depth, 
                                      random_state=args.random_state,n_jobs=-1)
    elif model_choice == 'catboost':
        return CatBoostClassifier(iterations=args.n_estimators, depth=args.max_depth, 
                                  learning_rate=args.learning_rate, random_state=args.random_state, 
                                  verbose=0,task_type='GPU')
    elif model_choice == 'gbm':
        return GradientBoostingClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, 
                                          max_depth=args.max_depth, random_state=args.random_state)
    elif model_choice == 'lgbm':
        return lgb.LGBMClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate, 
                                  max_depth=args.max_depth, random_state=args.random_state, device='gpu')
    else:
        raise ValueError(f"Unsupported model type: {model_choice}")
    

def create_output_dir(model_choice):
    # Use model's short name, not the full representation
    timestamp = time.strftime('%Y%m%d-%H%M%S')  # Short timestamp
    dir_name = f"{model_choice}_{timestamp}"  # Simple name based on model type and timestamp
    run_output_dir = os.path.join('Train_Result_RFE', dir_name)
    os.makedirs(run_output_dir, exist_ok=True)
    return run_output_dir

# RFE Feature Selection
def rfe_feature_selection(model, X, y, n_features):
    rfe = RFE(model, n_features_to_select=n_features)
    rfe.fit(X, y)
    selected_features_mask = rfe.support_  # Boolean mask of selected features
    return X.columns[selected_features_mask], rfe

def refit_model(model, X_selected, y):
    model.fit(X_selected, y)  # Just fit the passed-in model
    return model  # Return the refitted model

# Model evaluation (accuracy, FNR, confusion matrix)
def evaluate_model(model_refit, X_val_selected, y_val, model_choice, run_output_dir, fold_number):
    val_pred = model_refit.predict(X_val_selected)
    y_pred_proba = model_refit.predict_proba(X_val_selected)[:, 1]
    accuracy = accuracy_score(y_val, val_pred)
    fnr = FNR_eval(y_pred_proba, y_val)
    
    # Confusion matrix
    conf_matrix = confusion_matrix(y_val, val_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Control', 'Cancer'], 
                yticklabels=['Control', 'Cancer'], annot_kws={"size": 14})
    plt.title('Confusion Matrix on Validation Set', fontsize=15)
    plt.xlabel('Predicted Label', fontsize=13)
    plt.ylabel('True Label', fontsize=13)
    plt.tight_layout()

    plot_file = os.path.join(run_output_dir, f'confusion_matrix_fold_{fold_number}.png')
    plt.savefig(plot_file)
    plt.close()
    print(f"Saved confusion matrix for fold {fold_number} at {plot_file}")

    return accuracy, fnr

# Permutation importance
def compute_permutation_importance(model_refit, X_selected, y, random_state):
    perm_importance = PermutationImportance(model_refit, random_state=random_state).fit(X_selected, y)
    return perm_importance.feature_importances_

# Function to save permutation importance
def save_permutation_importance(importances, features, run_output_dir, model_choice, mode):
    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    
    # Save to CSV or Excel (adjust according to your preference)
    importance_file = os.path.join(run_output_dir, f'permutation_importance_{mode}_{model_choice}.csv')
    importance_df.to_csv(importance_file, index=False)
    print(f"Permutation importance saved to {importance_file}")
          
# SHAP values and plots
def plot_shap_values(model_refit, X_selected, run_output_dir, fold_number, dataset_type, model_choice):
    explainer = shap.TreeExplainer(model_refit)
    shap_values = explainer.shap_values(X_selected)
    
    if model_choice == 'rf':
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values[:, :, 1], X_selected, plot_type='dot', show=False)
    else:
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_selected, show=False)

    plt.tight_layout()
    plt.savefig(os.path.join(run_output_dir, f'shap_feature_importance_{fold_number}.png'))
    plt.close()

from sklearn.metrics import roc_curve, auc
# Plot ROC curve across cross-validation folds
def plot_cv_roc_curves(model, X, y, skf, model_choice, run_output_dir):
    plt.figure(figsize=(10,8))
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the model
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate ROC and AUC for this fold
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        
        # Interpolate TPR for mean ROC curve
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(roc_auc)

        # Plot each fold's ROC curve
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label=f'Fold {i+1} AUC: {roc_auc:.4f}')

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    
    plt.plot(mean_fpr, mean_tpr, color='orange', lw=2, alpha=0.8,
             label=r'Mean ROC (AUC = %0.4f ± %0.4f)' % (mean_auc, std_auc))
    # Plot standard deviation (shading)
    std_tpr = np.std(tprs, axis=0)
    tpr_upper = np.minimum(mean_tpr + std_tpr, 1)
    tpr_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tpr_lower, tpr_upper, color='grey', alpha=0.2,
                     label=r'± 1 std. dev.')

    # Plot settings
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Cross-Validation ROC Curves - {model_choice}')
    plt.legend(loc="lower right")
    plt.tight_layout()

    # Save plot
    plot_file = os.path.join(run_output_dir, 'cv_roc_curves.png')
    plt.savefig(plot_file)
    plt.close()
    print(f"Saved ROC curves plot at {plot_file}")


def plot_feature_importance(importances, feature_names, dataset_type, fold_number, run_output_dir):
    # Create a DataFrame for plotting
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))

    cmap = sns.color_palette("rainbow", as_cmap = True)
    colors = [cmap(i) for i in np.linspace(0, 1, len(importance_df))]
    
    sns.barplot(x='Importance', y='Feature', data=importance_df, palette=colors, hue='Feature', dodge=False, legend=False)
    plt.title(f'Feature Importances ({dataset_type.capitalize()} - Fold {fold_number})')
    plt.tight_layout()

    # Save the plot
    plot_file = os.path.join(run_output_dir, f'feature_importance_{dataset_type}_fold_{fold_number}.png')
    plt.savefig(plot_file)
    plt.close()
    print(f"Saved feature importance plot for {dataset_type} at {plot_file}")



def run_model(model_choice,n_splits = 5):
    """
    Runs the machine learning model with cross-validation.

    Parameters:
    - model_choice (str): The chosen model type.
    - features (DataFrame): The feature set.
    - label (Series): The target variable.
    - n_splits (int): Number of cross-validation splits.

    Returns:
    None
    """
        
    # Output directory using model choice, not full model object
    run_output_dir = create_output_dir(model_choice)

    # Stratified K-Fold to preserve the class balance
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=args.random_state)
    

    accuracy_list = []
    fnr_list = []
    train_importances_list = []
    val_importances_list = []
    common_features_list = []

    # Initialize a Counter to keep track of feature selections
    feature_counter = Counter()
    # Initialize Counters to keep track of important features
    train_feature_importance_counter = Counter()
    val_feature_importance_counter = Counter()
   
    for fold, (train_idx, val_idx) in enumerate(skf.split(features, label)):
        print(f"Running fold {fold+1}/{n_splits}")

        # Data for current fold
        X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
        y_train, y_val = label.iloc[train_idx], label.iloc[val_idx]

        # Merge X_train and y_train, and X_val and y_val, and save them for inspection
        train_data = X_train.copy()
        train_data['label'] = y_train
        train_data['set'] = 'Discovery'

        val_data = X_val.copy()
        val_data['label'] = y_val
        val_data['set'] = 'Validation'

        # Combine train and validation data
        combined_data = pd.concat([train_data, val_data])

        # Save combined data to an Excel file for this fold
        combined_file_path = os.path.join(run_output_dir, f'combined_train_val_fold_{fold+1}.xlsx')
        combined_data.to_excel(combined_file_path, index=True)

        print(f"Saved combined train/validation set for fold {fold+1} at {combined_file_path}")
        
        # Step 1: Initialize model for training
        model = initialize_model(model_choice,args=args)

        # Step 2: RFE for training set
        selected_features, rfe_all = rfe_feature_selection(model, X_train, y_train, n_features=20)
        X_train_selected = X_train[selected_features]


        # Update feature_counter with the selected features from this fold
        feature_counter.update(selected_features)


        # Convert the counter to a DataFrame
        feature_counts_df = pd.DataFrame.from_dict(feature_counter, orient='index', columns=['count'])
        feature_counts_df = feature_counts_df.sort_values(by='count', ascending=False)

        # Save to Excel or CSV
        feature_counts_file = os.path.join(run_output_dir, 'feature_selection_counts.xlsx')
        feature_counts_df.to_excel(feature_counts_file)
        print(f"Saved feature selection counts at {feature_counts_file}")

        # Train model on training set with selected features
        model_refit_train = refit_model(model, X_train_selected, y_train)

        # Apply the same selected features to validation set
        X_val_selected = X_val[selected_features]

        # Step 4: Evaluate model on validation set
        accuracy, fnr = evaluate_model(model_refit_train, X_val_selected, y_val, model_choice, run_output_dir, fold_number=fold+1)

        # Append accuracy and FNR for this fold
        accuracy_list.append(accuracy)
        fnr_list.append(fnr)

        # Step 5: Permutation importance for both datasets using the trained model
        train_importances = compute_permutation_importance(model_refit_train, X_train_selected, y_train, args.random_state)
        val_importances = compute_permutation_importance(model_refit_train, X_val_selected, y_val, args.random_state)

        # Store feature importances for comparison
        train_importances_list.append(train_importances)
        val_importances_list.append(val_importances)

        # Save permutation importance results for both training and validation sets
        save_permutation_importance(train_importances, X_train_selected.columns, run_output_dir, model_choice, f"train_fold_{fold+1}")
        save_permutation_importance(val_importances, X_val_selected.columns, run_output_dir, model_choice, f"val_fold_{fold+1}")

        plot_feature_importance(train_importances,X_train_selected.columns, dataset_type='train',fold_number ={fold},run_output_dir=run_output_dir)
        plot_feature_importance(val_importances, X_val_selected.columns, dataset_type='valid',fold_number = {fold},run_output_dir=run_output_dir)
        
        # After computing permutation importances
        # For training data
        # For training data
        plot_shap_values(model_refit_train, X_train_selected, run_output_dir, fold+1, 'train', model_choice)

        # For validation data
        plot_shap_values(model_refit_train, X_val_selected, run_output_dir, fold+1, 'validation', model_choice)


        # Step 6: Find common features between training and validation sets
        train_important_features = X_train_selected.columns[train_importances > 0]
        train_feature_importance_counter.update(train_important_features)

        # For validation data
        val_important_features = X_val_selected.columns[val_importances > 0]
        val_feature_importance_counter.update(val_important_features)

        # [Optional] Save important features for this fold
        # Save training important features
        train_features_file = os.path.join(run_output_dir, f'train_important_features_fold_{fold+1}.txt')
        with open(train_features_file, 'w') as f:
            for feature in train_important_features:
                f.write(f"{feature}\n")
        print(f"Saved training important features for fold {fold+1} at {train_features_file}")

        # Save validation important features
        val_features_file = os.path.join(run_output_dir, f'val_important_features_fold_{fold+1}.txt')
        with open(val_features_file, 'w') as f:
            for feature in val_important_features:
                f.write(f"{feature}\n")
        print(f"Saved validation important features for fold {fold+1} at {val_features_file}")

        # Intersection of selected features in train and validation
        common_features = train_important_features.intersection(val_important_features)
        common_features_list.append(common_features)

        # Save the common features for this fold
        common_features_file = os.path.join(run_output_dir, f'common_features_fold_{fold+1}.txt')
        with open(common_features_file, 'w') as f:
            for feature in common_features:
                f.write(f"{feature}\n")
        print(f"Saved common features for fold {fold+1} at {common_features_file}")

        

    # After the CV loop, process the feature importance counters

    # Convert the counters to DataFrames
    train_feature_counts_df = pd.DataFrame.from_dict(train_feature_importance_counter, orient='index', columns=['count'])
    train_feature_counts_df = train_feature_counts_df.sort_values(by='count', ascending=False)

    val_feature_counts_df = pd.DataFrame.from_dict(val_feature_importance_counter, orient='index', columns=['count'])
    val_feature_counts_df = val_feature_counts_df.sort_values(by='count', ascending=False)

    # Save the feature counts to Excel files
    train_feature_counts_file = os.path.join(run_output_dir, 'train_feature_importance_counts.xlsx')
    train_feature_counts_df.to_excel(train_feature_counts_file)
    print(f"Saved training feature importance counts at {train_feature_counts_file}")

    val_feature_counts_file = os.path.join(run_output_dir, 'val_feature_importance_counts.xlsx')
    val_feature_counts_df.to_excel(val_feature_counts_file)
    print(f"Saved validation feature importance counts at {val_feature_counts_file}")

    # [Optional] Save common features across all folds
    # Flatten the list of common features and count them
    all_common_features = [feature for fold_features in common_features_list for feature in fold_features]
    common_feature_counter = Counter(all_common_features)
    common_features_df = pd.DataFrame.from_dict(common_feature_counter, orient='index', columns=['count'])
    common_features_df = common_features_df.sort_values(by='count', ascending=False)

    common_features_file = os.path.join(run_output_dir, 'common_feature_counts.xlsx')
    common_features_df.to_excel(common_features_file)
    print(f"Saved common feature counts across all folds at {common_features_file}")

    # After all folds, compute mean performance metrics
    mean_accuracy = np.mean(accuracy_list)
    mean_fnr = np.mean(fnr_list)

    # Print cross-validation results
    print(f"\nCross-validation results for {model_choice}:")
    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Mean FNR: {mean_fnr:.4f}")

    result_file = os.path.join(run_output_dir, f'CV_Results_{model_choice}.txt')
    with open(result_file, 'w') as f:
        f.write(f"Model: {model_choice}\n")
        f.write(f"Mean Accuracy: {mean_accuracy:.4f}\n")
        f.write(f"Mean FNR: {mean_fnr:.4f}\n")
    print(f"All cross-validation results saved in {run_output_dir}")


    # Plot ROC curves
    plot_cv_roc_curves(model, features, label, skf, model_choice, run_output_dir)
    print("Model training and evaluation completed.")

# Widgets for model selection
model_selector = widgets.Dropdown(
    options=['xgb', 'rf', 'catboost', 'gbm', 'lgbm'],
    value='xgb',
    description='Choose Model:',
)

run_button = widgets.Button(description="Run Model")
status_label = widgets.Label(value='')

def on_button_click(b):
    status_label.value = "Running..."
    run_model(model_selector.value,n_splits=5)  # Pass model type (e.g., 'xgb') instead of initializing the model
    status_label.value = "Done!"

run_button.on_click(on_button_click)

# Display widgets
display(model_selector)
display(run_button)
display(status_label)


Dropdown(description='Choose Model:', options=('xgb', 'rf', 'catboost', 'gbm', 'lgbm'), value='xgb')

Button(description='Run Model', style=ButtonStyle())

Label(value='')

Running fold 1/5
Saved combined train/validation set for fold 1 at Train_Result_RFE/xgb_20241107-143049/combined_train_val_fold_1.xlsx
Saved feature selection counts at Train_Result_RFE/xgb_20241107-143049/feature_selection_counts.xlsx
Saved confusion matrix for fold 1 at Train_Result_RFE/xgb_20241107-143049/confusion_matrix_fold_1.png
Permutation importance saved to Train_Result_RFE/xgb_20241107-143049/permutation_importance_train_fold_1_xgb.csv
Permutation importance saved to Train_Result_RFE/xgb_20241107-143049/permutation_importance_val_fold_1_xgb.csv
Saved feature importance plot for train at Train_Result_RFE/xgb_20241107-143049/feature_importance_train_fold_{0}.png
Saved feature importance plot for valid at Train_Result_RFE/xgb_20241107-143049/feature_importance_valid_fold_{0}.png
Saved training important features for fold 1 at Train_Result_RFE/xgb_20241107-143049/train_important_features_fold_1.txt
Saved validation important features for fold 1 at Train_Result_RFE/xgb_20241107-1