## Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## ML Model Results Storage Framework

In [2]:
# =============================================================================
# ML MODEL RESULTS STORAGE FRAMEWORK
# =============================================================================

# Creating holders to store the model performance results
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []  # Adding a holder for AUC-ROC

# Function to call for storing the results
def storeResults(model, config, a, b, c, d, e):
    """
    Store model performance results
    
    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score
    """
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(a, 6))
    f1_score.append(round(b, 6))
    recall.append(round(c, 6))
    precision.append(round(d, 6))
    auc_roc.append(round(e, 6))

# Function to display and save results
def displayAndSaveResults(filename_prefix='model_results'):
    """
    Create dataframe from results, display, and save to CSV
    
    Parameters:
    filename_prefix: Prefix for the CSV filenames
    """
    # Creating the dataframe
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
        'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
        'Recall': [f"{rec * 100:.3f}%" for rec in recall],
        'Precision': [f"{prec * 100:.3f}%" for prec in precision],
        'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
    })
    
    # Remove duplicates if any
    result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)
    
    print("\n" + "="*100)
    print("MODEL PERFORMANCE RESULTS")
    print("="*100)
    print(result.to_string(index=False))
    
    # Saving the result to a CSV file
    result.to_csv(f'{filename_prefix}.csv', index=False)
    print(f"\nResults saved to {filename_prefix}.csv")
    
    # Sorting the dataframe on accuracy and F1 Score
    sorted_result = result.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)
    
    print("\n" + "="*100)
    print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
    print("="*100)
    print(sorted_result.to_string(index=False))
    
    # Saving the sorted result to a CSV file
    sorted_result.to_csv(f'sorted_{filename_prefix}.csv', index=False)
    print(f"\nSorted results saved to sorted_{filename_prefix}.csv")
    
    return result, sorted_result

# Function to clear results (useful when running multiple experiments)
def clearResults():
    """Clear all stored results"""
    global ML_Model, ML_Config, accuracy, f1_score, recall, precision, auc_roc
    ML_Model.clear()
    ML_Config.clear()
    accuracy.clear()
    f1_score.clear()
    recall.clear()
    precision.clear()
    auc_roc.clear()
    print("Results cleared!")

# Function to plot model comparison
def plotModelComparison(result_df):
    """
    Create visualization comparing model performances
    
    Parameters:
    result_df: DataFrame with model results
    """
    # Convert percentage strings back to floats for plotting
    metrics_cols = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']
    plot_df = result_df.copy()
    
    for col in metrics_cols:
        plot_df[col] = plot_df[col].str.rstrip('%').astype(float)
    
    # Create subplot for each metric
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.ravel()
    
    for idx, metric in enumerate(metrics_cols):
        # Group by model and get mean performance across configurations
        model_performance = plot_df.groupby('ML Model')[metric].mean().sort_values(ascending=False)
        
        # Create bar plot
        ax = axes[idx]
        bars = ax.bar(range(len(model_performance)), model_performance.values, 
                      color=plt.cm.Blues(np.linspace(0.4, 0.9, len(model_performance))))
        ax.set_xticks(range(len(model_performance)))
        ax.set_xticklabels(model_performance.index, rotation=45, ha='right')
        ax.set_ylabel(f'{metric} (%)')
        ax.set_title(f'Average {metric} by Model', fontweight='bold')
        ax.grid(axis='y', alpha=0.3)
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.1f}%', ha='center', va='bottom')
    
    # Hide the last subplot if we have 5 metrics
    if len(metrics_cols) == 5:
        axes[5].set_visible(False)
    
    plt.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

print("Model results storage framework loaded successfully!")
print("Available functions:")
print("- storeResults(model, config, accuracy, f1, recall, precision, auc_roc)")
print("- displayAndSaveResults(filename_prefix='model_results')")
print("- clearResults()")
print("- plotModelComparison(result_df)")

Model results storage framework loaded successfully!
Available functions:
- storeResults(model, config, accuracy, f1, recall, precision, auc_roc)
- displayAndSaveResults(filename_prefix='model_results')
- clearResults()
- plotModelComparison(result_df)


In [3]:
# Load the dataset
df = pd.read_csv('data/StressLevelDataset.csv')

# Load your dataset
X = df.drop(columns=["stress_level"])
y = df["stress_level"]

# Step 1: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

---

# SVM

### SVM with PCA 99

In [4]:
# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(SVC(kernel='linear'), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with SVM ===")
svm_estimator = SVC(kernel='linear')

rfecv = RFECV(estimator=svm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance*100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: SVM + GridSearchCV
print("\n=== SVM Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'C': [100],
    'gamma': ['auto'],
    'kernel': ['sigmoid'],
    'degree': [2],
    'coef0': [0.0]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning SVM with {name} configuration...")
    svc = GridSearchCV(SVC(probability=True), param_grid, cv=10, n_jobs=-1, verbose=2)
    svc.fit(X_train_cfg, y_train_cfg)

    y_train_svc = svc.predict(X_train_cfg)
    y_test_svc = svc.predict(X_test_cfg)
    y_train_svc_proba = svc.predict_proba(X_train_cfg)
    y_test_svc_proba = svc.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_svc),
            metrics.accuracy_score(y_test, y_test_svc),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.f1_score(y_test, y_test_svc, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.recall_score(y_test, y_test_svc, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.precision_score(y_test, y_test_svc, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_svc_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nSupport Vector Machine Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro')
    storeResults(
        'Support Vector Machine 99',
        name,
        metrics.accuracy_score(y_test, y_test_svc),
        metrics.f1_score(y_test, y_test_svc, average='macro'),
        metrics.recall_score(y_test, y_test_svc, average='macro'),
        metrics.precision_score(y_test, y_test_svc, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(svc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 19

=== RFECV Feature Selection with SVM ===
Optimal number of features selected by RFECV: 19

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 18

=== SVM Model Performance with Hyperparameter Tuning ===

Running SVM with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Support Vector Machine Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.340606  0.169379 0.333333   0.113535 0.520273
    Test  0.320000  0.161616 0.333333   0.106667 0.532895
Best hyperparameters found by GridSearchCV:
{'C': 100, 'coef0': 0.0, 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid'}

Running SVM with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Support Vector Machine Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precisi

---

# Random Forest

### Random Forest with PCA 99

In [5]:
# Written by Ovi, 2025-07-07, Random Forest classification with preprocessing and result logging


# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100],
    'max_depth': [120],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_rf),
            metrics.accuracy_score(y_test, y_test_rf),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.f1_score(y_test, y_test_rf, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.recall_score(y_test, y_test_rf, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.precision_score(y_test, y_test_rf, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_rf_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro')
    storeResults(
        'Random Forest 99',
        name,
        metrics.accuracy_score(y_test, y_test_rf),
        metrics.f1_score(y_test, y_test_rf, average='macro'),
        metrics.recall_score(y_test, y_test_rf, average='macro'),
        metrics.precision_score(y_test, y_test_rf, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 3

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 3

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 3

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Random Forest Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000    1.0000 1.000000   1.000000 1.000000
    Test  0.887273    0.8874 0.887293   0.887987 0.986663
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 120, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Running Random Forest with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totallin

---

# Gradient Boosting

### Gradient Boosting with PCA 90

In [6]:
# Written by Ovi, 2025-07-07, Gradient Boosting classification with preprocessing and result logging

# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(GradientBoostingClassifier(), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Gradient Boosting ===")
gbc_estimator = GradientBoostingClassifier()

rfecv = RFECV(estimator=gbc_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gbc_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'learning_rate': [0.05],
    'n_estimators': [200],
    'max_depth': [3],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=10, n_jobs=-1, verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gbc = gbc.predict(X_train_cfg)
    y_test_gbc = gbc.predict(X_test_cfg)
    y_train_gbc_proba = gbc.predict_proba(X_train_cfg)
    y_test_gbc_proba = gbc.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_gbc),
            metrics.accuracy_score(y_test, y_test_gbc),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.f1_score(y_test, y_test_gbc, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.recall_score(y_test, y_test_gbc, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.precision_score(y_test, y_test_gbc, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_gbc_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gbc_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nGradient Boosting Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gbc_proba, multi_class='ovr', average='macro')
    storeResults(
        'Gradient Boosting 90',
        name,
        metrics.accuracy_score(y_test, y_test_gbc),
        metrics.f1_score(y_test, y_test_gbc, average='macro'),
        metrics.recall_score(y_test, y_test_gbc, average='macro'),
        metrics.precision_score(y_test, y_test_gbc, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 4

=== RFECV Feature Selection with Gradient Boosting ===
Optimal number of features selected by RFECV: 4

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 3

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Gradient Boosting Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.890909  0.891302 0.891525   0.891228 0.986313
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.05, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.8}

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 1 cand

---

# Adaboost

### Adaboost with PCA 95

In [7]:
# Written by Ovi, 2025-07-07, AdaBoost classification with preprocessing and result logging

# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(AdaBoostClassifier(), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with AdaBoost ===")
ab_estimator = AdaBoostClassifier()

rfecv = RFECV(estimator=ab_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ab_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [50],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    ab = GridSearchCV(
        AdaBoostClassifier(),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    ab.fit(X_train_cfg, y_train_cfg)

    y_train_ab = ab.predict(X_train_cfg)
    y_test_ab = ab.predict(X_test_cfg)
    y_train_ab_proba = ab.predict_proba(X_train_cfg)
    y_test_ab_proba = ab.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_ab),
            metrics.accuracy_score(y_test, y_test_ab),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.f1_score(y_test, y_test_ab, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.recall_score(y_test, y_test_ab, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.precision_score(y_test, y_test_ab, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_ab_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro')
    storeResults(
        'AdaBoost 95',
        name,
        metrics.accuracy_score(y_test, y_test_ab),
        metrics.f1_score(y_test, y_test_ab, average='macro'),
        metrics.recall_score(y_test, y_test_ab, average='macro'),
        metrics.precision_score(y_test, y_test_ab, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(ab.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 2

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 2

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 2

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training      1.00  1.000000 1.000000   1.000000 1.000000
    Test      0.88  0.880017 0.880554   0.880037 0.986105
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=5), 'learning_rate': 1, 'n_estimators': 50}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  

---

# XGBoost

### XGBoost with PCA 90

In [8]:
# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), 
                            X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with XGBoost ===")
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

rfecv = RFECV(estimator=xgb_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.01],
    'max_depth': [5],
    'subsample': [1.0],
    'min_child_weight': [5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    xgb = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xgb = xgb.predict(X_train_cfg)
    y_test_xgb = xgb.predict(X_test_cfg)
    y_train_xgb_proba = xgb.predict_proba(X_train_cfg)
    y_test_xgb_proba = xgb.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_xgb),
            metrics.accuracy_score(y_test, y_test_xgb),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.f1_score(y_test, y_test_xgb, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.recall_score(y_test, y_test_xgb, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.precision_score(y_test, y_test_xgb, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xgb_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro')
    storeResults(
        'XGBoost 90',
        name,
        metrics.accuracy_score(y_test, y_test_xgb),
        metrics.f1_score(y_test, y_test_xgb, average='macro'),
        metrics.recall_score(y_test, y_test_xgb, average='macro'),
        metrics.precision_score(y_test, y_test_xgb, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 2

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 2

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 2

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.949091  0.949146 0.948862   0.949699 0.995563
    Test  0.880000  0.879962 0.880161   0.880432 0.985874
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 1.0}

Running XGBoost with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall

---

# LightGBM

### LightGBM with PCA 99

In [9]:
# Written by Ovi, 2025-07-07, LightGBM classification with preprocessing and result logging


# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(LGBMClassifier(), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with LightGBM ===")
lgbm_estimator = LGBMClassifier()

rfecv = RFECV(estimator=lgbm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=lgbm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: LightGBM + GridSearchCV
print("\n=== LightGBM Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [200],
    'learning_rate': [0.1],
    'max_depth': [-1],
    'num_leaves': [15]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning LightGBM with {name} configuration...")
    lgbm = GridSearchCV(
        LGBMClassifier(),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    lgbm.fit(X_train_cfg, y_train_cfg)

    y_train_lgbm = lgbm.predict(X_train_cfg)
    y_test_lgbm = lgbm.predict(X_test_cfg)
    y_train_lgbm_proba = lgbm.predict_proba(X_train_cfg)
    y_test_lgbm_proba = lgbm.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_lgbm),
            metrics.accuracy_score(y_test, y_test_lgbm),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_lgbm, average='macro'),
            metrics.f1_score(y_test, y_test_lgbm, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_lgbm, average='macro'),
            metrics.recall_score(y_test, y_test_lgbm, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_lgbm, average='macro'),
            metrics.precision_score(y_test, y_test_lgbm, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_lgbm_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lgbm_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nLightGBM Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lgbm_proba, multi_class='ovr', average='macro')
    storeResults(
        'LightGBM 99',
        name,
        metrics.accuracy_score(y_test, y_test_lgbm),
        metrics.f1_score(y_test, y_test_lgbm, average='macro'),
        metrics.recall_score(y_test, y_test_lgbm, average='macro'),
        metrics.precision_score(y_test, y_test_lgbm, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(lgbm.best_params_)



=== SelectKBest Feature Selection ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 660, number of used features: 1
[LightGBM] [Info] Start training from score -1.089562
[LightGBM] [Info] Start training from score -1.130948
[LightGBM] [Info] Start training from score -1.076139
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 660, number of used features: 1
[LightGBM] [Info] Start training from score -1.089562
[LightGBM] [Info] Start training from score -1.130948
[LightGBM] [Info] Start training from score -1.076139
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 

---

# Bagging

### Bagging classification with PCA 99

In [10]:
# Written by Ovi, 2025-07-07, Bagging classification with preprocessing and result logging

# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(BaggingClassifier(estimator=DecisionTreeClassifier()), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Bagging ===")
# Use single DecisionTreeClassifier for RFECV to enable feature_importances_
tree_estimator = DecisionTreeClassifier(random_state=42)

rfecv = RFECV(
    estimator=tree_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=tree_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: BaggingClassifier + GridSearchCV
print("\n=== Bagging Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [200],
    'max_samples': [0.6],
    'max_features': [1.0],
    'bootstrap': [True],
    'bootstrap_features': [False],
    'estimator': [ 
        DecisionTreeClassifier(max_depth=3, min_samples_split=2),
        DecisionTreeClassifier(max_depth=5, min_samples_split=5),
        DecisionTreeClassifier(max_depth=None, min_samples_split=10)
    ]
}


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Bagging with {name} configuration...")
    bag = GridSearchCV(
        BaggingClassifier(),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    bag.fit(X_train_cfg, y_train_cfg)

    y_train_bag = bag.predict(X_train_cfg)
    y_test_bag = bag.predict(X_test_cfg)
    y_train_bag_proba = bag.predict_proba(X_train_cfg)
    y_test_bag_proba = bag.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_bag),
            metrics.accuracy_score(y_test, y_test_bag),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.f1_score(y_test, y_test_bag, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.recall_score(y_test, y_test_bag, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.precision_score(y_test, y_test_bag, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_bag_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nBagging Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro')
    storeResults(
        'Bagging 99',
        name,
        metrics.accuracy_score(y_test, y_test_bag),
        metrics.f1_score(y_test, y_test_bag, average='macro'),
        metrics.recall_score(y_test, y_test_bag, average='macro'),
        metrics.precision_score(y_test, y_test_bag, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(bag.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 5

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 5

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with Original Data configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.990303  0.990353 0.990217   0.990547 0.999817
    Test  0.883636  0.883897 0.884342   0.883881 0.986029
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(max_depth=5, min_samples_split=5), 'max_features': 1.0, 'max_samples': 0.6, 'n_estimators': 200}

Running Bagging with Normalized Data configuration...
Fitting 10 folds for each of 3 candidates, totalling

# Voting Classifier

In [11]:
# Written by Ovi, 2025-07-07, Voting Classifier with RF, AdaBoost and XGBoost
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
# Store different configurations
configurations = []

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)

# Step 4: Train base models
print("\n=== Training Base Models ===")

# Random Forest parameters
rf_param_grid = {
    'n_estimators': [100],
    'max_depth': [120],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['entropy']
}

# AdaBoost parameters
ab_param_grid = {
    'n_estimators': [50],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

# XGBoost parameters
xgb_param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.01],
    'max_depth': [5],
    'subsample': [1.0],
    'min_child_weight': [5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nTraining models with {name} configuration...")
    
    # Train Random Forest
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=10, n_jobs=-1, verbose=2)
    rf_grid.fit(X_train_cfg, y_train_cfg)
    best_rf = rf_grid.best_estimator_
    
    # Train AdaBoost
    ab_grid = GridSearchCV(AdaBoostClassifier(), ab_param_grid, cv=10, n_jobs=-1, verbose=2)
    ab_grid.fit(X_train_cfg, y_train_cfg)
    best_ab = ab_grid.best_estimator_
    
    # Train XGBoost
    xgb_grid = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        xgb_param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb_grid.fit(X_train_cfg, y_train_cfg)
    best_xgb = xgb_grid.best_estimator_
    
    # Create voting classifiers
    voting_configs = [
        ('hard', VotingClassifier(estimators=[('rf', best_rf), ('ab', best_ab), ('xgb', best_xgb)], voting='hard')),
        ('soft', VotingClassifier(estimators=[('rf', best_rf), ('ab', best_ab), ('xgb', best_xgb)], voting='soft')),
        ('weighted_hard', VotingClassifier(estimators=[('rf', best_rf), ('ab', best_ab), ('xgb', best_xgb)], voting='hard', weights=[0.3, 0.3, 0.4])),
        ('weighted_soft', VotingClassifier(estimators=[('rf', best_rf), ('ab', best_ab), ('xgb', best_xgb)], voting='soft', weights=[0.4, 0.3, 0.3]))
    ]
    
    for voting_type, voting_clf in voting_configs:
        print(f"\n=== Voting Classifier ({voting_type}) with {name} ===")
        
        # Fit voting classifier
        voting_clf.fit(X_train_cfg, y_train_cfg)
        
        # Predictions
        y_train_pred = voting_clf.predict(X_train_cfg)
        y_test_pred = voting_clf.predict(X_test_cfg)
        
        # Check if voting is hard or soft for probabilities
        if 'hard' in voting_type:
            # For hard voting, we cannot get probabilities, so set AUC-ROC to 0
            auc_train = 0
            auc_test = 0
        else:
            # For soft voting, we can get probabilities
            y_train_proba = voting_clf.predict_proba(X_train_cfg)
            y_test_proba = voting_clf.predict_proba(X_test_cfg)
            auc_train = metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_proba, multi_class='ovr', average='macro')
            auc_test = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_proba, multi_class='ovr', average='macro')
        
        # Calculate metrics
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [
                metrics.accuracy_score(y_train_cfg, y_train_pred),
                metrics.accuracy_score(y_test, y_test_pred),
            ],
            "F1 Score": [
                metrics.f1_score(y_train_cfg, y_train_pred, average='macro'),
                metrics.f1_score(y_test, y_test_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train_cfg, y_train_pred, average='macro'),
                metrics.recall_score(y_test, y_test_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train_cfg, y_train_pred, average='macro'),
                metrics.precision_score(y_test, y_test_pred, average='macro'),
            ],
            "AUC-ROC": [auc_train, auc_test]
        }
        
        df_metrics = pd.DataFrame(metrics_dict)
        print(f"\nVoting Classifier ({voting_type}) Performance Metrics")
        print(df_metrics.to_string(index=False))
        
        storeResults(
            f'Voting Classifier ({voting_type})',
            name,
            metrics.accuracy_score(y_test, y_test_pred),
            metrics.f1_score(y_test, y_test_pred, average='macro'),
            metrics.recall_score(y_test, y_test_pred, average='macro'),
            metrics.precision_score(y_test, y_test_pred, average='macro'),
            auc_test
        )


=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 3

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 3

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 3

=== Training Base Models ===

Training models with SelectKBest configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Voting Classifier (hard) with SelectKBest ===

Voting Classifier (hard) Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.953939  0.954074 0.953928   0.954430        0
    Test  0.930909  0.930989 0.930858   0.931263        0

=== Voting Classifier (soft) with SelectKBest ===

Voting Classifier (soft) Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.956364  0.9

# Stacking Classifier

In [12]:
# Written by Ovi, 2025-07-07, Stacking Classifier with RF, AdaBoost and XGBoost
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# Store different configurations
configurations = []

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)

# Step 4: Train base models
print("\n=== Training Base Models with Stacking ===")

# Random Forest parameters
rf_param_grid = {
    'n_estimators': [100],
    'max_depth': [120],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['entropy']
}

# AdaBoost parameters
ab_param_grid = {
    'n_estimators': [50],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

# XGBoost parameters
xgb_param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.01],
    'max_depth': [5],
    'subsample': [1.0],
    'min_child_weight': [5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nTraining models with {name} configuration...")
    
    # Train Random Forest
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=10, n_jobs=-1, verbose=2)
    rf_grid.fit(X_train_cfg, y_train_cfg)
    best_rf = rf_grid.best_estimator_
    
    # Train AdaBoost
    ab_grid = GridSearchCV(AdaBoostClassifier(), ab_param_grid, cv=10, n_jobs=-1, verbose=2)
    ab_grid.fit(X_train_cfg, y_train_cfg)
    best_ab = ab_grid.best_estimator_
    
    # Train XGBoost
    xgb_grid = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        xgb_param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb_grid.fit(X_train_cfg, y_train_cfg)
    best_xgb = xgb_grid.best_estimator_
    
    # Create stacking classifier
    print(f"\n=== Stacking Classifier with {name} ===")
    
    # Base estimators
    base_estimators = [
        ('rf', best_rf),
        ('ab', best_ab),
        ('xgb', best_xgb)
    ]
    
    # Meta-learner
    meta_learner = LogisticRegression(max_iter=1000, random_state=42)
    
    # Stacking classifier
    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=meta_learner,
        cv=5,  # Use 5-fold cross-validation to train meta-learner
        stack_method='predict_proba',  # Use probabilities for meta-features
        n_jobs=-1
    )
    
    # Fit stacking classifier
    stacking_clf.fit(X_train_cfg, y_train_cfg)
    
    # Predictions
    y_train_pred = stacking_clf.predict(X_train_cfg)
    y_test_pred = stacking_clf.predict(X_test_cfg)
    y_train_proba = stacking_clf.predict_proba(X_train_cfg)
    y_test_proba = stacking_clf.predict_proba(X_test_cfg)
    
    # Calculate metrics
    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_pred),
            metrics.accuracy_score(y_test, y_test_pred),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_pred, average='macro'),
            metrics.f1_score(y_test, y_test_pred, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_pred, average='macro'),
            metrics.recall_score(y_test, y_test_pred, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_pred, average='macro'),
            metrics.precision_score(y_test, y_test_pred, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_proba, multi_class='ovr', average='macro'),
        ]
    }
    
    df_metrics = pd.DataFrame(metrics_dict)
    print(f"\nStacking Classifier Performance Metrics")
    print(df_metrics.to_string(index=False))
    
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_proba, multi_class='ovr', average='macro')
    storeResults(
        'Stacking Classifier',
        name,
        metrics.accuracy_score(y_test, y_test_pred),
        metrics.f1_score(y_test, y_test_pred, average='macro'),
        metrics.recall_score(y_test, y_test_pred, average='macro'),
        metrics.precision_score(y_test, y_test_pred, average='macro'),
        auc_score
    )


=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 3

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 3

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 3

=== Training Base Models with Stacking ===

Training models with SelectKBest configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Stacking Classifier with SelectKBest ===

Stacking Classifier Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.953939  0.954054 0.953858   0.954369 0.993800
    Test  0.912727  0.912521 0.912197   0.913173 0.984694


---

# Results

In [13]:
# Creating the dataframe
result = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
    'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
    'Recall': [f"{rec * 100:.3f}%" for rec in recall],
    'Precision': [f"{prec * 100:.3f}%" for prec in precision],
    'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
})

# Remove duplicates based on model and configuration
result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
result.to_csv('final_results/model_results.csv', index=False)
print("\nResults saved to model_results.csv")

# Sort by Accuracy and F1 Score
sorted_result = result.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
sorted_result.to_csv('final_results/sorted_model_results.csv', index=False)
print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

top_per_model.to_csv('final_results/top_configurations.csv', index=False)
print("\nTop configuration per model saved to top_configurations.csv")



MODEL PERFORMANCE RESULTS
                         ML Model   Configuration Accuracy F1 Score  Recall Precision ROC_AUC
        Support Vector Machine 99   Original Data  32.000%  16.162% 33.333%   10.667% 53.290%
        Support Vector Machine 99 Normalized Data  89.091%  89.108% 89.180%   89.250% 95.783%
        Support Vector Machine 99     SelectKBest  89.455%  89.477% 89.559%   89.663% 95.161%
        Support Vector Machine 99           RFECV  89.455%  89.477% 89.559%   89.663% 95.250%
        Support Vector Machine 99             PCA  90.546%  90.557% 90.551%   90.567% 97.047%
                 Random Forest 99   Original Data  88.727%  88.740% 88.729%   88.799% 98.666%
                 Random Forest 99 Normalized Data  89.091%  89.098% 89.080%   89.159% 98.648%
                 Random Forest 99     SelectKBest  92.364%  92.365% 92.356%   92.378% 97.960%
                 Random Forest 99           RFECV  92.364%  92.365% 92.356%   92.378% 97.960%
                 Random Forest 99

In [14]:
import pandas as pd

# Read input CSV
df = pd.read_csv('final_results/top_configurations.csv')

# Sort by 'Accuracy' column in descending order
df_sorted = df.sort_values(by='Accuracy', ascending=False)

# Save the sorted DataFrame to a new CSV
df_sorted.to_csv('final_results/sorted_top_configurations.csv', index=False)

---

# END