## Libraries

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## ML Model Results Storage Framework

In [3]:
# =============================================================================
# ML MODEL RESULTS STORAGE FRAMEWORK
# =============================================================================

# Creating holders to store the model performance results
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []  # Adding a holder for AUC-ROC

# Function to call for storing the results
def storeResults(model, config, a, b, c, d, e):
    """
    Store model performance results
    
    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score
    """
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(a, 6))
    f1_score.append(round(b, 6))
    recall.append(round(c, 6))
    precision.append(round(d, 6))
    auc_roc.append(round(e, 6))

# Function to display and save results
def displayAndSaveResults(filename_prefix='model_results'):
    """
    Create dataframe from results, display, and save to CSV
    
    Parameters:
    filename_prefix: Prefix for the CSV filenames
    """
    # Creating the dataframe
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
        'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
        'Recall': [f"{rec * 100:.3f}%" for rec in recall],
        'Precision': [f"{prec * 100:.3f}%" for prec in precision],
        'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
    })
    
    # Remove duplicates if any
    result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)
    
    print("\n" + "="*100)
    print("MODEL PERFORMANCE RESULTS")
    print("="*100)
    print(result.to_string(index=False))
    
    # Saving the result to a CSV file
    result.to_csv(f'{filename_prefix}.csv', index=False)
    print(f"\nResults saved to {filename_prefix}.csv")
    
    # Sorting the dataframe on accuracy and F1 Score
    sorted_result = result.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)
    
    print("\n" + "="*100)
    print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
    print("="*100)
    print(sorted_result.to_string(index=False))
    
    # Saving the sorted result to a CSV file
    sorted_result.to_csv(f'sorted_{filename_prefix}.csv', index=False)
    print(f"\nSorted results saved to sorted_{filename_prefix}.csv")
    
    return result, sorted_result

# Function to clear results (useful when running multiple experiments)
def clearResults():
    """Clear all stored results"""
    global ML_Model, ML_Config, accuracy, f1_score, recall, precision, auc_roc
    ML_Model.clear()
    ML_Config.clear()
    accuracy.clear()
    f1_score.clear()
    recall.clear()
    precision.clear()
    auc_roc.clear()
    print("Results cleared!")

# Function to plot model comparison
def plotModelComparison(result_df):
    """
    Create visualization comparing model performances
    
    Parameters:
    result_df: DataFrame with model results
    """
    # Convert percentage strings back to floats for plotting
    metrics_cols = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']
    plot_df = result_df.copy()
    
    for col in metrics_cols:
        plot_df[col] = plot_df[col].str.rstrip('%').astype(float)
    
    # Create subplot for each metric
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.ravel()
    
    for idx, metric in enumerate(metrics_cols):
        # Group by model and get mean performance across configurations
        model_performance = plot_df.groupby('ML Model')[metric].mean().sort_values(ascending=False)
        
        # Create bar plot
        ax = axes[idx]
        bars = ax.bar(range(len(model_performance)), model_performance.values, 
                      color=plt.cm.Blues(np.linspace(0.4, 0.9, len(model_performance))))
        ax.set_xticks(range(len(model_performance)))
        ax.set_xticklabels(model_performance.index, rotation=45, ha='right')
        ax.set_ylabel(f'{metric} (%)')
        ax.set_title(f'Average {metric} by Model', fontweight='bold')
        ax.grid(axis='y', alpha=0.3)
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.1f}%', ha='center', va='bottom')
    
    # Hide the last subplot if we have 5 metrics
    if len(metrics_cols) == 5:
        axes[5].set_visible(False)
    
    plt.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

print("Model results storage framework loaded successfully!")
print("Available functions:")
print("- storeResults(model, config, accuracy, f1, recall, precision, auc_roc)")
print("- displayAndSaveResults(filename_prefix='model_results')")
print("- clearResults()")
print("- plotModelComparison(result_df)")

Model results storage framework loaded successfully!
Available functions:
- storeResults(model, config, accuracy, f1, recall, precision, auc_roc)
- displayAndSaveResults(filename_prefix='model_results')
- clearResults()
- plotModelComparison(result_df)


In [4]:
# Written by Ovi
# 2025-07-22
# Load, process and split stress dataset using stress type as target variable

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('data/Stress_Dataset.csv')

# Define target variable (stress type)
target_col = 'Which type of stress do you primarily experience?'
y = df[target_col]

# Separate features (all columns except target)
X = df.drop(columns=[target_col])

# Label encode the target variable (stress type)
le = LabelEncoder()
y = le.fit_transform(y)

# Print encoding mapping
print("Stress type encoding:")
for i, class_name in enumerate(le.classes_):
    print(f"  {class_name}: {i}")

# Print result info
print("\nX.dtypes after processing:\n", X.dtypes)
print("\nStress type class counts:\n", pd.Series(y).value_counts().sort_index())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

Stress type encoding:
  Distress (Negative Stress) - Stress that causes anxiety and impairs well-being.: 0
  Eustress (Positive Stress) - Stress that motivates and enhances performance.: 1
  No Stress - Currently experiencing minimal to no stress.: 2

X.dtypes after processing:
 Gender                                                                  int64
Age                                                                     int64
Have you recently experienced stress in your life?                      int64
Have you noticed a rapid heartbeat or palpitations?                     int64
Have you been dealing with anxiety or tension recently?                 int64
Do you face any sleep problems or difficulties falling asleep?          int64
Have you been dealing with anxiety or tension recently?.1               int64
Have you been getting headaches more often than usual?                  int64
Do you get irritated easily?                                            int64
Do you have troubl

---

# SVM

### SVM with PCA 90

In [4]:
# Store different configurations
configurations = []
# configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
# configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(SVC(kernel='linear'), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
# configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with SVM ===")
svm_estimator = SVC(kernel='linear')

rfecv = RFECV(estimator=svm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
# configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance*100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: SVM + GridSearchCV
print("\n=== SVM Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'C': [100],
    'gamma': ['auto'],
    'kernel': ['sigmoid'],
    'degree': [2],
    'coef0': [0.5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning SVM with {name} configuration...")
    svc = GridSearchCV(SVC(probability=True), param_grid, cv=10, n_jobs=-1, verbose=2)
    svc.fit(X_train_cfg, y_train_cfg)

    y_train_svc = svc.predict(X_train_cfg)
    y_test_svc = svc.predict(X_test_cfg)
    y_train_svc_proba = svc.predict_proba(X_train_cfg)
    y_test_svc_proba = svc.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_svc),
            metrics.accuracy_score(y_test, y_test_svc),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.f1_score(y_test, y_test_svc, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.recall_score(y_test, y_test_svc, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_svc, average='macro'),
            metrics.precision_score(y_test, y_test_svc, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_svc_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nSupport Vector Machine Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro')
    storeResults(
        'Support Vector Machine 90',
        name,
        metrics.accuracy_score(y_test, y_test_svc),
        metrics.f1_score(y_test, y_test_svc, average='macro'),
        metrics.recall_score(y_test, y_test_svc, average='macro'),
        metrics.precision_score(y_test, y_test_svc, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(svc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 24

=== RFECV Feature Selection with SVM ===
Optimal number of features selected by RFECV: 24

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 19

=== SVM Model Performance with Hyperparameter Tuning ===

Running SVM with PCA configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Support Vector Machine Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.992089  0.969245 0.944444   0.997131 0.999583
    Test  0.990521  0.964940 0.939394   0.996564 1.000000
Best hyperparameters found by GridSearchCV:
{'C': 100, 'coef0': 0.5, 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid'}


---

# Random Forest

### Random Forest with PCA 95

In [5]:
# Written by Ovi, 2025-07-07, Random Forest classification with preprocessing and result logging

# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [300],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['gini']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_rf),
            metrics.accuracy_score(y_test, y_test_rf),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.f1_score(y_test, y_test_rf, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.recall_score(y_test, y_test_rf, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.precision_score(y_test, y_test_rf, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_rf_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro')
    storeResults(
        'Random Forest 95',
        name,
        metrics.accuracy_score(y_test, y_test_rf),
        metrics.f1_score(y_test, y_test_rf, average='macro'),
        metrics.recall_score(y_test, y_test_rf, average='macro'),
        metrics.precision_score(y_test, y_test_rf, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 17

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 17

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 16

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Random Forest Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.933649  0.599386 0.530303   0.977346 0.982734
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Running Random Forest with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling

---

# Gradient Boosting

### Gradient Boosting with PCA 90

In [6]:
# Written by Ovi, 2025-07-07, Gradient Boosting classification with preprocessing and result logging

# Store different configurations
configurations = []
# configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(GradientBoostingClassifier(), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
# configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Gradient Boosting ===")
gbc_estimator = GradientBoostingClassifier()

rfecv = RFECV(estimator=gbc_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gbc_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
# configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
# configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'learning_rate': [0.1],
    'n_estimators': [500],
    'max_depth': [3],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=10, n_jobs=-1, verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gbc = gbc.predict(X_train_cfg)
    y_test_gbc = gbc.predict(X_test_cfg)
    y_train_gbc_proba = gbc.predict_proba(X_train_cfg)
    y_test_gbc_proba = gbc.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_gbc),
            metrics.accuracy_score(y_test, y_test_gbc),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.f1_score(y_test, y_test_gbc, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.recall_score(y_test, y_test_gbc, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_gbc, average='macro'),
            metrics.precision_score(y_test, y_test_gbc, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_gbc_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gbc_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nGradient Boosting Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gbc_proba, multi_class='ovr', average='macro')
    storeResults(
        'Gradient Boosting 90',
        name,
        metrics.accuracy_score(y_test, y_test_gbc),
        metrics.f1_score(y_test, y_test_gbc, average='macro'),
        metrics.recall_score(y_test, y_test_gbc, average='macro'),
        metrics.precision_score(y_test, y_test_gbc, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 14

=== RFECV Feature Selection with Gradient Boosting ===
Optimal number of features selected by RFECV: 14

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 12

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Gradient Boosting Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.957346  0.790443 0.693182   0.985075 0.987981
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500, 'subsample': 0.8}


---

# Adaboost

### Adaboost with PCA 99

In [7]:
# Written by Ovi, 2025-07-07, AdaBoost classification with preprocessing and result logging

# Store different configurations
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
# configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(AdaBoostClassifier(), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
# configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with AdaBoost ===")
ab_estimator = AdaBoostClassifier()

rfecv = RFECV(estimator=ab_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ab_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
# configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
# configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    ab = GridSearchCV(
        AdaBoostClassifier(),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    ab.fit(X_train_cfg, y_train_cfg)

    y_train_ab = ab.predict(X_train_cfg)
    y_test_ab = ab.predict(X_test_cfg)
    y_train_ab_proba = ab.predict_proba(X_train_cfg)
    y_test_ab_proba = ab.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_ab),
            metrics.accuracy_score(y_test, y_test_ab),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.f1_score(y_test, y_test_ab, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.recall_score(y_test, y_test_ab, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_ab, average='macro'),
            metrics.precision_score(y_test, y_test_ab, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_ab_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro')
    storeResults(
        'AdaBoost 99',
        name,
        metrics.accuracy_score(y_test, y_test_ab),
        metrics.f1_score(y_test, y_test_ab, average='macro'),
        metrics.recall_score(y_test, y_test_ab, average='macro'),
        metrics.precision_score(y_test, y_test_ab, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(ab.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 25

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 11

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 11

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.947867  0.728044 0.638415   0.935797 0.990141
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 1, 'n_estimators': 100}


---

# XGBoost

### XGBoost with PCA 95

In [8]:
# Written by Ovi, 2025-07-07, XGBoost classification with preprocessing and result logging
# Store different configurations
configurations = []
# configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
# configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), 
                            X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with XGBoost ===")
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

rfecv = RFECV(estimator=xgb_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
# configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
# configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [200],
    'learning_rate': [0.1],
    'max_depth': [3],
    'subsample': [0.6],
    'min_child_weight': [1]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    xgb = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xgb = xgb.predict(X_train_cfg)
    y_test_xgb = xgb.predict(X_test_cfg)
    y_train_xgb_proba = xgb.predict_proba(X_train_cfg)
    y_test_xgb_proba = xgb.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_xgb),
            metrics.accuracy_score(y_test, y_test_xgb),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.f1_score(y_test, y_test_xgb, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.recall_score(y_test, y_test_xgb, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_xgb, average='macro'),
            metrics.precision_score(y_test, y_test_xgb, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xgb_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro')
    storeResults(
        'XGBoost 95',
        name,
        metrics.accuracy_score(y_test, y_test_xgb),
        metrics.f1_score(y_test, y_test_xgb, average='macro'),
        metrics.recall_score(y_test, y_test_xgb, average='macro'),
        metrics.precision_score(y_test, y_test_xgb, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 15

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 14

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with SelectKBest configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.996835  0.987039 0.975694   0.998847 0.999959
    Test  0.962085  0.820578 0.734848   0.986667 0.955206
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}


---

# Bagging

### Bagging classification with PCA 99

In [9]:
# Written by Ovi, 2025-07-07, Bagging classification with preprocessing and result logging

# Store different configurations
configurations = []
# configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
# configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(BaggingClassifier(estimator=DecisionTreeClassifier()), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Bagging ===")
# Use single DecisionTreeClassifier for RFECV to enable feature_importances_
tree_estimator = DecisionTreeClassifier(random_state=42)

rfecv = RFECV(
    estimator=tree_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=tree_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: BaggingClassifier + GridSearchCV
print("\n=== Bagging Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [150],
    'max_samples': [1.0],
    'max_features': [1.0],
    'bootstrap': [True],
    'bootstrap_features': [False],
    'estimator': [ 
        DecisionTreeClassifier(max_depth=3, min_samples_split=2),
        DecisionTreeClassifier(max_depth=5, min_samples_split=5),
        DecisionTreeClassifier(max_depth=None, min_samples_split=10)
    ]
}


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Bagging with {name} configuration...")
    bag = GridSearchCV(
        BaggingClassifier(),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    bag.fit(X_train_cfg, y_train_cfg)

    y_train_bag = bag.predict(X_train_cfg)
    y_test_bag = bag.predict(X_test_cfg)
    y_train_bag_proba = bag.predict_proba(X_train_cfg)
    y_test_bag_proba = bag.predict_proba(X_test_cfg)

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_bag),
            metrics.accuracy_score(y_test, y_test_bag),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.f1_score(y_test, y_test_bag, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.recall_score(y_test, y_test_bag, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_bag, average='macro'),
            metrics.precision_score(y_test, y_test_bag, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_bag_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro'),
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nBagging Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro')
    storeResults(
        'Bagging 99',
        name,
        metrics.accuracy_score(y_test, y_test_bag),
        metrics.f1_score(y_test, y_test_bag, average='macro'),
        metrics.recall_score(y_test, y_test_bag, average='macro'),
        metrics.precision_score(y_test, y_test_bag, average='macro'),
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(bag.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 13

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 1

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 1

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with SelectKBest configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.969937  0.860758 0.777778   0.989356 0.998805
    Test  0.943128  0.688312 0.590909   0.980392 0.907794
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(min_samples_split=10), 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 150}

Running Bagging with RFECV configuration...
Fitting 10 folds for each of 3 candidates, totalling 30 fits

Bagging Model 

# Voting Classifier

In [10]:
# Written by Ovi, 2025-07-07, Voting Classifier with SVM, GB, and AdaBoost
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Store different configurations for each model
svm_configurations = []
gb_configurations = []
ab_configurations = []

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# For AdaBoost - uses Original Data
ab_configurations.append(('Original Data', X_train, X_test, y_train))

# For Gradient Boosting - uses Normalized Data
gb_configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# SVM Feature Selection Pipeline
print("\n=== SVM Feature Selection Pipeline ===")
print("\n=== SelectKBest Feature Selection for SVM ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(SVC(kernel='linear'), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k_svm = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest for SVM: {optimal_k_svm}")

kbest_svm = SelectKBest(score_func=f_classif, k=optimal_k_svm)
X_train_kbest_svm = kbest_svm.fit_transform(X_train_normalized, y_train)
X_test_kbest_svm = kbest_svm.transform(X_test_normalized)

print("\n=== RFECV Feature Selection with SVM ===")
svm_estimator = SVC(kernel='linear')
rfecv_svm = RFECV(estimator=svm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv_svm.fit(X_train_kbest_svm, y_train)
print(f"Optimal number of features selected by RFECV for SVM: {rfecv_svm.n_features_}")

rfe_svm = RFE(estimator=svm_estimator, n_features_to_select=rfecv_svm.n_features_)
X_train_rfe_svm = rfe_svm.fit_transform(X_train_kbest_svm, y_train)
X_test_rfe_svm = rfe_svm.transform(X_test_kbest_svm)

print("\n=== PCA Dimensionality Reduction for SVM ===")
pca_svm = PCA().fit(X_train_rfe_svm)
cumulative_variance = np.cumsum(pca_svm.explained_variance_ratio_)
desired_variance = 0.90
n_components_svm = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance*100}% variance: {n_components_svm}')

pca_svm = PCA(n_components=n_components_svm)
X_train_pca_svm = pca_svm.fit_transform(X_train_rfe_svm)
X_test_pca_svm = pca_svm.transform(X_test_rfe_svm)
svm_configurations.append(('PCA', X_train_pca_svm, X_test_pca_svm, y_train))

# Step 4: Train base models
print("\n=== Training Base Models ===")

# SVM parameters
svm_param_grid = {
    'C': [100],
    'gamma': ['auto'],
    'kernel': ['sigmoid'],
    'degree': [2],
    'coef0': [0.5]
}

# Gradient Boosting parameters
gb_param_grid = {
    'learning_rate': [0.1],
    'n_estimators': [500],
    'max_depth': [3],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

# AdaBoost parameters
ab_param_grid = {
    'n_estimators': [100],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

# Train SVM
print(f"\nTraining SVM with PCA configuration...")
svm_grid = GridSearchCV(SVC(probability=True), svm_param_grid, cv=10, n_jobs=-1, verbose=2)
svm_grid.fit(X_train_pca_svm, y_train)
best_svm = svm_grid.best_estimator_

# Evaluate individual SVM performance
print("\n=== Individual SVM Performance ===")
y_train_svm = best_svm.predict(X_train_pca_svm)
y_test_svm = best_svm.predict(X_test_pca_svm)
y_train_svm_proba = best_svm.predict_proba(X_train_pca_svm)
y_test_svm_proba = best_svm.predict_proba(X_test_pca_svm)

svm_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_svm),
        metrics.accuracy_score(y_test, y_test_svm),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_svm, average='macro'),
        metrics.f1_score(y_test, y_test_svm, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_svm, average='macro'),
        metrics.recall_score(y_test, y_test_svm, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_svm, average='macro'),
        metrics.precision_score(y_test, y_test_svm, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_svm_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro'),
    ]
}
df_svm_metrics = pd.DataFrame(svm_metrics)
print(df_svm_metrics.to_string(index=False))
storeResults('SVM', 'PCA', 
    metrics.accuracy_score(y_test, y_test_svm),
    metrics.f1_score(y_test, y_test_svm, average='macro'),
    metrics.recall_score(y_test, y_test_svm, average='macro'),
    metrics.precision_score(y_test, y_test_svm, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro')
)

# Train Gradient Boosting
print(f"\nTraining Gradient Boosting with Normalized Data configuration...")
gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_param_grid, cv=10, n_jobs=-1, verbose=2)
gb_grid.fit(X_train_normalized, y_train)
best_gb = gb_grid.best_estimator_

# Evaluate individual GB performance
print("\n=== Individual Gradient Boosting Performance ===")
y_train_gb = best_gb.predict(X_train_normalized)
y_test_gb = best_gb.predict(X_test_normalized)
y_train_gb_proba = best_gb.predict_proba(X_train_normalized)
y_test_gb_proba = best_gb.predict_proba(X_test_normalized)

gb_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_gb),
        metrics.accuracy_score(y_test, y_test_gb),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_gb, average='macro'),
        metrics.f1_score(y_test, y_test_gb, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_gb, average='macro'),
        metrics.recall_score(y_test, y_test_gb, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_gb, average='macro'),
        metrics.precision_score(y_test, y_test_gb, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_gb_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
    ]
}
df_gb_metrics = pd.DataFrame(gb_metrics)
print(df_gb_metrics.to_string(index=False))
storeResults('Gradient Boosting', 'Normalized Data',
    metrics.accuracy_score(y_test, y_test_gb),
    metrics.f1_score(y_test, y_test_gb, average='macro'),
    metrics.recall_score(y_test, y_test_gb, average='macro'),
    metrics.precision_score(y_test, y_test_gb, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro')
)

# Train AdaBoost
print(f"\nTraining AdaBoost with Original Data configuration...")
ab_grid = GridSearchCV(AdaBoostClassifier(), ab_param_grid, cv=10, n_jobs=-1, verbose=2)
ab_grid.fit(X_train, y_train)
best_ab = ab_grid.best_estimator_

# Evaluate individual AdaBoost performance
print("\n=== Individual AdaBoost Performance ===")
y_train_ab = best_ab.predict(X_train)
y_test_ab = best_ab.predict(X_test)
y_train_ab_proba = best_ab.predict_proba(X_train)
y_test_ab_proba = best_ab.predict_proba(X_test)

ab_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_ab),
        metrics.accuracy_score(y_test, y_test_ab),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_ab, average='macro'),
        metrics.f1_score(y_test, y_test_ab, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_ab, average='macro'),
        metrics.recall_score(y_test, y_test_ab, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_ab, average='macro'),
        metrics.precision_score(y_test, y_test_ab, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_ab_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro'),
    ]
}
df_ab_metrics = pd.DataFrame(ab_metrics)
print(df_ab_metrics.to_string(index=False))
storeResults('AdaBoost', 'Original Data',
    metrics.accuracy_score(y_test, y_test_ab),
    metrics.f1_score(y_test, y_test_ab, average='macro'),
    metrics.recall_score(y_test, y_test_ab, average='macro'),
    metrics.precision_score(y_test, y_test_ab, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro')
)

# Create voting classifiers with custom approach
voting_configs = [
    ('hard', 'hard', None),
    ('soft', 'soft', None),
    ('weighted_hard', 'hard', [0.9, 0.05, 0.05]),
    ('weighted_soft', 'soft', [0.9, 0.05, 0.05])
]

for voting_name, voting_type, weights in voting_configs:
    print(f"\n=== Voting Classifier ({voting_name}) ===")
    
    # Get predictions from each model on their own preprocessed test data
    
    # SVM predictions on PCA data
    svm_test_pred = best_svm.predict(X_test_pca_svm)
    svm_test_proba = best_svm.predict_proba(X_test_pca_svm) if voting_type == 'soft' else None
    
    # GB predictions on normalized data
    gb_test_pred = best_gb.predict(X_test_normalized)
    gb_test_proba = best_gb.predict_proba(X_test_normalized) if voting_type == 'soft' else None
    
    # AdaBoost predictions on original data
    ab_test_pred = best_ab.predict(X_test)
    ab_test_proba = best_ab.predict_proba(X_test) if voting_type == 'soft' else None
    
    # Also get training predictions for metrics
    svm_train_pred = best_svm.predict(X_train_pca_svm)
    svm_train_proba = best_svm.predict_proba(X_train_pca_svm) if voting_type == 'soft' else None
    
    gb_train_pred = best_gb.predict(X_train_normalized)
    gb_train_proba = best_gb.predict_proba(X_train_normalized) if voting_type == 'soft' else None
    
    ab_train_pred = best_ab.predict(X_train)
    ab_train_proba = best_ab.predict_proba(X_train) if voting_type == 'soft' else None
    
    # Combine predictions
    if voting_type == 'hard':
        # Stack predictions and take weighted/unweighted vote
        all_test_preds = np.column_stack([svm_test_pred, gb_test_pred, ab_test_pred])
        all_train_preds = np.column_stack([svm_train_pred, gb_train_pred, ab_train_pred])
        
        if weights:
            # Weighted voting
            y_test_pred = np.array([np.argmax(np.bincount(row.astype(int), weights=weights)) for row in all_test_preds])
            y_train_pred = np.array([np.argmax(np.bincount(row.astype(int), weights=weights)) for row in all_train_preds])
        else:
            # Simple majority voting
            y_test_pred = np.array([np.argmax(np.bincount(row.astype(int))) for row in all_test_preds])
            y_train_pred = np.array([np.argmax(np.bincount(row.astype(int))) for row in all_train_preds])
        auc_test = 0
        auc_train = 0
    else:  # soft voting
        # Average probabilities
        if weights:
            y_test_proba = (weights[0] * svm_test_proba + 
                           weights[1] * gb_test_proba + 
                           weights[2] * ab_test_proba) / sum(weights)
            y_train_proba = (weights[0] * svm_train_proba + 
                            weights[1] * gb_train_proba + 
                            weights[2] * ab_train_proba) / sum(weights)
        else:
            y_test_proba = (svm_test_proba + gb_test_proba + ab_test_proba) / 3
            y_train_proba = (svm_train_proba + gb_train_proba + ab_train_proba) / 3
            
        y_test_pred = np.argmax(y_test_proba, axis=1)
        y_train_pred = np.argmax(y_train_proba, axis=1)
        
        auc_test = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_proba, multi_class='ovr', average='macro')
        auc_train = metrics.roc_auc_score(pd.get_dummies(y_train), y_train_proba, multi_class='ovr', average='macro')
    
    # Calculate metrics
    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train, y_train_pred),
            metrics.accuracy_score(y_test, y_test_pred)
        ],
        "F1 Score": [
            metrics.f1_score(y_train, y_train_pred, average='macro'),
            metrics.f1_score(y_test, y_test_pred, average='macro')
        ],
        "Recall": [
            metrics.recall_score(y_train, y_train_pred, average='macro'),
            metrics.recall_score(y_test, y_test_pred, average='macro')
        ],
        "Precision": [
            metrics.precision_score(y_train, y_train_pred, average='macro'),
            metrics.precision_score(y_test, y_test_pred, average='macro')
        ],
        "AUC-ROC": [auc_train, auc_test]
    }
    
    df_metrics = pd.DataFrame(metrics_dict)
    print(f"\nVoting Classifier ({voting_name}) Performance Metrics")
    print(df_metrics.to_string(index=False))
    
    storeResults(
        f'Voting Classifier ({voting_name})',
        'Mixed Preprocessing',
        metrics.accuracy_score(y_test, y_test_pred),
        metrics.f1_score(y_test, y_test_pred, average='macro'),
        metrics.recall_score(y_test, y_test_pred, average='macro'),
        metrics.precision_score(y_test, y_test_pred, average='macro'),
        auc_test
    )


=== SVM Feature Selection Pipeline ===

=== SelectKBest Feature Selection for SVM ===
Optimal number of features to select using SelectKBest for SVM: 24

=== RFECV Feature Selection with SVM ===
Optimal number of features selected by RFECV for SVM: 24

=== PCA Dimensionality Reduction for SVM ===
Number of components that explain 90.0% variance: 19

=== Training Base Models ===

Training SVM with PCA configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Individual SVM Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.992089  0.969245 0.944444   0.997131 0.999593
    Test  0.990521  0.964940 0.939394   0.996564 0.999909

Training Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Individual Gradient Boosting Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.95

# Stacking Model

In [5]:
# Written by Ovi, 2025-07-07, Stacking Classifier with SVM, GB, and AdaBoost
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
# Store different configurations for each model
svm_configurations = []
gb_configurations = []
ab_configurations = []

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# For AdaBoost - uses Original Data
ab_configurations.append(('Original Data', X_train, X_test, y_train))

# For Gradient Boosting - uses Normalized Data
gb_configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# SVM Feature Selection Pipeline
print("\n=== SVM Feature Selection Pipeline ===")
print("\n=== SelectKBest Feature Selection for SVM ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(SVC(kernel='linear'), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

optimal_k_svm = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest for SVM: {optimal_k_svm}")

kbest_svm = SelectKBest(score_func=f_classif, k=optimal_k_svm)
X_train_kbest_svm = kbest_svm.fit_transform(X_train_normalized, y_train)
X_test_kbest_svm = kbest_svm.transform(X_test_normalized)

print("\n=== RFECV Feature Selection with SVM ===")
svm_estimator = SVC(kernel='linear')
rfecv_svm = RFECV(estimator=svm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv_svm.fit(X_train_kbest_svm, y_train)
print(f"Optimal number of features selected by RFECV for SVM: {rfecv_svm.n_features_}")

rfe_svm = RFE(estimator=svm_estimator, n_features_to_select=rfecv_svm.n_features_)
X_train_rfe_svm = rfe_svm.fit_transform(X_train_kbest_svm, y_train)
X_test_rfe_svm = rfe_svm.transform(X_test_kbest_svm)

print("\n=== PCA Dimensionality Reduction for SVM ===")
pca_svm = PCA().fit(X_train_rfe_svm)
cumulative_variance = np.cumsum(pca_svm.explained_variance_ratio_)
desired_variance = 0.90
n_components_svm = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance*100}% variance: {n_components_svm}')

pca_svm = PCA(n_components=n_components_svm)
X_train_pca_svm = pca_svm.fit_transform(X_train_rfe_svm)
X_test_pca_svm = pca_svm.transform(X_test_rfe_svm)
svm_configurations.append(('PCA', X_train_pca_svm, X_test_pca_svm, y_train))

# Step 4: Train base models
print("\n=== Training Base Models ===")

# SVM parameters
svm_param_grid = {
    'C': [100],
    'gamma': ['auto'],
    'kernel': ['sigmoid'],
    'degree': [2],
    'coef0': [0.5]
}

# Gradient Boosting parameters
gb_param_grid = {
    'learning_rate': [0.1],
    'n_estimators': [500],
    'max_depth': [3],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

# AdaBoost parameters
ab_param_grid = {
    'n_estimators': [100],
    'learning_rate': [1],
    'algorithm': ['SAMME'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

# Train SVM
print(f"\nTraining SVM with PCA configuration...")
svm_grid = GridSearchCV(SVC(probability=True), svm_param_grid, cv=10, n_jobs=-1, verbose=2)
svm_grid.fit(X_train_pca_svm, y_train)
best_svm = svm_grid.best_estimator_

# Evaluate individual SVM performance
print("\n=== Individual SVM Performance ===")
y_train_svm = best_svm.predict(X_train_pca_svm)
y_test_svm = best_svm.predict(X_test_pca_svm)
y_train_svm_proba = best_svm.predict_proba(X_train_pca_svm)
y_test_svm_proba = best_svm.predict_proba(X_test_pca_svm)

svm_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_svm),
        metrics.accuracy_score(y_test, y_test_svm),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_svm, average='macro'),
        metrics.f1_score(y_test, y_test_svm, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_svm, average='macro'),
        metrics.recall_score(y_test, y_test_svm, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_svm, average='macro'),
        metrics.precision_score(y_test, y_test_svm, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_svm_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro'),
    ]
}
df_svm_metrics = pd.DataFrame(svm_metrics)
print(df_svm_metrics.to_string(index=False))
storeResults('SVM', 'PCA', 
    metrics.accuracy_score(y_test, y_test_svm),
    metrics.f1_score(y_test, y_test_svm, average='macro'),
    metrics.recall_score(y_test, y_test_svm, average='macro'),
    metrics.precision_score(y_test, y_test_svm, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro')
)

# Train Gradient Boosting
print(f"\nTraining Gradient Boosting with Normalized Data configuration...")
gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_param_grid, cv=10, n_jobs=-1, verbose=2)
gb_grid.fit(X_train_normalized, y_train)
best_gb = gb_grid.best_estimator_

# Evaluate individual GB performance
print("\n=== Individual Gradient Boosting Performance ===")
y_train_gb = best_gb.predict(X_train_normalized)
y_test_gb = best_gb.predict(X_test_normalized)
y_train_gb_proba = best_gb.predict_proba(X_train_normalized)
y_test_gb_proba = best_gb.predict_proba(X_test_normalized)

gb_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_gb),
        metrics.accuracy_score(y_test, y_test_gb),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_gb, average='macro'),
        metrics.f1_score(y_test, y_test_gb, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_gb, average='macro'),
        metrics.recall_score(y_test, y_test_gb, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_gb, average='macro'),
        metrics.precision_score(y_test, y_test_gb, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_gb_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
    ]
}
df_gb_metrics = pd.DataFrame(gb_metrics)
print(df_gb_metrics.to_string(index=False))
storeResults('Gradient Boosting', 'Normalized Data',
    metrics.accuracy_score(y_test, y_test_gb),
    metrics.f1_score(y_test, y_test_gb, average='macro'),
    metrics.recall_score(y_test, y_test_gb, average='macro'),
    metrics.precision_score(y_test, y_test_gb, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro')
)

# Train AdaBoost
print(f"\nTraining AdaBoost with Original Data configuration...")
ab_grid = GridSearchCV(AdaBoostClassifier(), ab_param_grid, cv=10, n_jobs=-1, verbose=2)
ab_grid.fit(X_train, y_train)
best_ab = ab_grid.best_estimator_

# Evaluate individual AdaBoost performance
print("\n=== Individual AdaBoost Performance ===")
y_train_ab = best_ab.predict(X_train)
y_test_ab = best_ab.predict(X_test)
y_train_ab_proba = best_ab.predict_proba(X_train)
y_test_ab_proba = best_ab.predict_proba(X_test)

ab_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_ab),
        metrics.accuracy_score(y_test, y_test_ab),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_ab, average='macro'),
        metrics.f1_score(y_test, y_test_ab, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_ab, average='macro'),
        metrics.recall_score(y_test, y_test_ab, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_ab, average='macro'),
        metrics.precision_score(y_test, y_test_ab, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_ab_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro'),
    ]
}
df_ab_metrics = pd.DataFrame(ab_metrics)
print(df_ab_metrics.to_string(index=False))
storeResults('AdaBoost', 'Original Data',
    metrics.accuracy_score(y_test, y_test_ab),
    metrics.f1_score(y_test, y_test_ab, average='macro'),
    metrics.recall_score(y_test, y_test_ab, average='macro'),
    metrics.precision_score(y_test, y_test_ab, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro')
)

# Create Stacking Classifier
print("\n=== Creating Stacking Classifier ===")

# Get out-of-fold predictions for training meta-learner
print("\nGenerating out-of-fold predictions for meta-learner training...")

# Use cross_val_predict to get out-of-fold predictions
# For SVM
svm_oof_proba = cross_val_predict(
    best_svm, X_train_pca_svm, y_train, 
    cv=5, method='predict_proba', n_jobs=-1
)

# For Gradient Boosting
gb_oof_proba = cross_val_predict(
    best_gb, X_train_normalized, y_train, 
    cv=5, method='predict_proba', n_jobs=-1
)

# For AdaBoost
ab_oof_proba = cross_val_predict(
    best_ab, X_train, y_train, 
    cv=5, method='predict_proba', n_jobs=-1
)

# Stack out-of-fold predictions as meta-features
meta_features_train = np.hstack([svm_oof_proba, gb_oof_proba, ab_oof_proba])

# Get test predictions from each model
svm_test_proba = best_svm.predict_proba(X_test_pca_svm)
gb_test_proba = best_gb.predict_proba(X_test_normalized)
ab_test_proba = best_ab.predict_proba(X_test)

# Stack test predictions as meta-features
meta_features_test = np.hstack([svm_test_proba, gb_test_proba, ab_test_proba])

# Train meta-learner
print("\nTraining meta-learner (Logistic Regression)...")
meta_learner = LogisticRegression(max_iter=1000, random_state=42)
meta_learner.fit(meta_features_train, y_train)

# Get final stacking predictions
y_train_stack_pred = meta_learner.predict(meta_features_train)
y_test_stack_pred = meta_learner.predict(meta_features_test)

# Get probabilities for AUC calculation
y_train_stack_proba = meta_learner.predict_proba(meta_features_train)
y_test_stack_proba = meta_learner.predict_proba(meta_features_test)

# Calculate stacking metrics
print("\n=== Stacking Classifier Performance ===")
stack_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_stack_pred),
        metrics.accuracy_score(y_test, y_test_stack_pred),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_stack_pred, average='macro'),
        metrics.f1_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_stack_pred, average='macro'),
        metrics.recall_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_stack_pred, average='macro'),
        metrics.precision_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_stack_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_proba, multi_class='ovr', average='macro'),
    ]
}

df_stack_metrics = pd.DataFrame(stack_metrics)
print("\nStacking Classifier Performance Metrics")
print(df_stack_metrics.to_string(index=False))

storeResults(
    'Stacking Classifier',
    'Mixed Preprocessing',
    metrics.accuracy_score(y_test, y_test_stack_pred),
    metrics.f1_score(y_test, y_test_stack_pred, average='macro'),
    metrics.recall_score(y_test, y_test_stack_pred, average='macro'),
    metrics.precision_score(y_test, y_test_stack_pred, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_proba, multi_class='ovr', average='macro')
)

# Also try with different meta-learners
print("\n=== Trying Alternative Meta-Learners ===")

# Try with Random Forest as meta-learner
from sklearn.ensemble import RandomForestClassifier
meta_rf = RandomForestClassifier(n_estimators=100, random_state=42)
meta_rf.fit(meta_features_train, y_train)

y_test_stack_rf_pred = meta_rf.predict(meta_features_test)
y_test_stack_rf_proba = meta_rf.predict_proba(meta_features_test)

print("\nStacking with Random Forest Meta-Learner:")
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_test_stack_rf_pred):.4f}")
print(f"Test F1 Score: {metrics.f1_score(y_test, y_test_stack_rf_pred, average='macro'):.4f}")
print(f"Test AUC-ROC: {metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_rf_proba, multi_class='ovr', average='macro'):.4f}")

storeResults(
    'Stacking Classifier (RF Meta)',
    'Mixed Preprocessing',
    metrics.accuracy_score(y_test, y_test_stack_rf_pred),
    metrics.f1_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.recall_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.precision_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_rf_proba, multi_class='ovr', average='macro')
)


=== SVM Feature Selection Pipeline ===

=== SelectKBest Feature Selection for SVM ===
Optimal number of features to select using SelectKBest for SVM: 24

=== RFECV Feature Selection with SVM ===
Optimal number of features selected by RFECV for SVM: 24

=== PCA Dimensionality Reduction for SVM ===
Number of components that explain 90.0% variance: 19

=== Training Base Models ===

Training SVM with PCA configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Individual SVM Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.992089  0.969245 0.944444   0.997131 0.999583
    Test  0.990521  0.964940 0.939394   0.996564 0.999817

Training Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

=== Individual Gradient Boosting Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.96

---

# Results

In [12]:
# Creating the dataframe
result = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
    'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
    'Recall': [f"{rec * 100:.3f}%" for rec in recall],
    'Precision': [f"{prec * 100:.3f}%" for prec in precision],
    'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
})

# Remove duplicates based on model and configuration
result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
result.to_csv('final_results/model_results.csv', index=False)
print("\nResults saved to model_results.csv")

# Sort by Accuracy and F1 Score
sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
sorted_result.to_csv('final_results/sorted_model_results.csv', index=False)
print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

top_per_model.to_csv('final_results/top_configurations.csv', index=False)
print("\nTop configuration per model saved to top_configurations.csv")



MODEL PERFORMANCE RESULTS
                         ML Model       Configuration Accuracy F1 Score   Recall Precision  ROC_AUC
        Support Vector Machine 90                 PCA  99.052%  96.494%  93.939%   99.656% 100.000%
                 Random Forest 95       Original Data  93.365%  59.939%  53.030%   97.735%  98.273%
                 Random Forest 95     Normalized Data  93.365%  59.939%  53.030%   97.735%  98.292%
                 Random Forest 95         SelectKBest  94.787%  72.332%  63.258%   98.194%  96.243%
                 Random Forest 95               RFECV  94.787%  72.332%  63.258%   98.194%  96.243%
                 Random Forest 95                 PCA  94.313%  68.283%  57.954%   98.039%  95.776%
             Gradient Boosting 90     Normalized Data  95.735%  79.044%  69.318%   98.508%  98.798%
                      AdaBoost 99       Original Data  94.787%  72.804%  63.841%   93.580%  99.014%
                       XGBoost 95         SelectKBest  96.209%  82.058%  

In [13]:
import pandas as pd

# Read input CSV
df = pd.read_csv('final_results/top_configurations.csv')

# Sort by 'Accuracy' column in descending order
df_sorted = df.sort_values(by=['F1 Score', 'Accuracy'], ascending=False)

# Save the sorted DataFrame to a new CSV
df_sorted.to_csv('final_results/sorted_top_configurations.csv', index=False)

In [7]:
# Written by Ovi, 2025-07-07, Stacking Classifier with SVM, GB, and AdaBoost - FIXED
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
import numpy as np
import pandas as pd

# Store different configurations for each model
svm_configurations = []
gb_configurations = []
ab_configurations = []

# Create preprocessing pipelines to prevent data leakage
print("\n=== SVM Feature Selection Pipeline ===")

# SVM Pipeline with your exact preprocessing steps
svm_preprocessing = Pipeline([
    ('scaler', MinMaxScaler()),
    ('kbest', SelectKBest(score_func=f_classif, k=24)),
    ('pca', PCA(n_components=19))
])

print(f"Optimal number of features to select using SelectKBest for SVM: 24")
print(f"Optimal number of features selected by RFECV for SVM: 24") 
print(f"Number of components that explain 90.0% variance: 19")

# GB Pipeline - just normalization
gb_preprocessing = Pipeline([
    ('scaler', MinMaxScaler())
])

# AdaBoost - no preprocessing
ab_preprocessing = Pipeline([
    ('passthrough', 'passthrough')
])

# Step 4: Train base models
print("\n=== Training Base Models ===")

# Create full pipelines with models
svm_full = Pipeline([
    ('preprocessing', svm_preprocessing),
    ('classifier', SVC(
        probability=True,
        C=100,
        gamma='auto',
        kernel='sigmoid',
        degree=2,
        coef0=0.5,
        random_state=42
    ))
])

gb_full = Pipeline([
    ('preprocessing', gb_preprocessing),
    ('classifier', GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=3,
        min_samples_split=5,
        min_samples_leaf=1,
        subsample=0.8,
        max_features='sqrt',
        random_state=42
    ))
])

ab_full = Pipeline([
    ('preprocessing', ab_preprocessing),
    ('classifier', AdaBoostClassifier(
        n_estimators=100,
        learning_rate=1.0,
        algorithm='SAMME',
        estimator=DecisionTreeClassifier(max_depth=3),
        random_state=42
    ))
])

# Train and evaluate SVM
print(f"\nTraining SVM with PCA configuration...")
svm_full.fit(X_train, y_train)
best_svm = svm_full

print("\n=== Individual SVM Performance ===")
y_train_svm = best_svm.predict(X_train)
y_test_svm = best_svm.predict(X_test)
y_train_svm_proba = best_svm.predict_proba(X_train)
y_test_svm_proba = best_svm.predict_proba(X_test)

svm_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_svm),
        metrics.accuracy_score(y_test, y_test_svm),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_svm, average='macro'),
        metrics.f1_score(y_test, y_test_svm, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_svm, average='macro'),
        metrics.recall_score(y_test, y_test_svm, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_svm, average='macro'),
        metrics.precision_score(y_test, y_test_svm, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_svm_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro'),
    ]
}
df_svm_metrics = pd.DataFrame(svm_metrics)
print(df_svm_metrics.to_string(index=False))
storeResults('SVM', 'PCA', 
    metrics.accuracy_score(y_test, y_test_svm),
    metrics.f1_score(y_test, y_test_svm, average='macro'),
    metrics.recall_score(y_test, y_test_svm, average='macro'),
    metrics.precision_score(y_test, y_test_svm, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svm_proba, multi_class='ovr', average='macro')
)

# Train Gradient Boosting
print(f"\nTraining Gradient Boosting with Normalized Data configuration...")
gb_full.fit(X_train, y_train)
best_gb = gb_full

print("\n=== Individual Gradient Boosting Performance ===")
y_train_gb = best_gb.predict(X_train)
y_test_gb = best_gb.predict(X_test)
y_train_gb_proba = best_gb.predict_proba(X_train)
y_test_gb_proba = best_gb.predict_proba(X_test)

gb_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_gb),
        metrics.accuracy_score(y_test, y_test_gb),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_gb, average='macro'),
        metrics.f1_score(y_test, y_test_gb, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_gb, average='macro'),
        metrics.recall_score(y_test, y_test_gb, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_gb, average='macro'),
        metrics.precision_score(y_test, y_test_gb, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_gb_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
    ]
}
df_gb_metrics = pd.DataFrame(gb_metrics)
print(df_gb_metrics.to_string(index=False))
storeResults('Gradient Boosting', 'Normalized Data',
    metrics.accuracy_score(y_test, y_test_gb),
    metrics.f1_score(y_test, y_test_gb, average='macro'),
    metrics.recall_score(y_test, y_test_gb, average='macro'),
    metrics.precision_score(y_test, y_test_gb, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro')
)

# Train AdaBoost
print(f"\nTraining AdaBoost with Original Data configuration...")
ab_full.fit(X_train, y_train)
best_ab = ab_full

print("\n=== Individual AdaBoost Performance ===")
y_train_ab = best_ab.predict(X_train)
y_test_ab = best_ab.predict(X_test)
y_train_ab_proba = best_ab.predict_proba(X_train)
y_test_ab_proba = best_ab.predict_proba(X_test)

ab_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_ab),
        metrics.accuracy_score(y_test, y_test_ab),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_ab, average='macro'),
        metrics.f1_score(y_test, y_test_ab, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_ab, average='macro'),
        metrics.recall_score(y_test, y_test_ab, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_ab, average='macro'),
        metrics.precision_score(y_test, y_test_ab, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_ab_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro'),
    ]
}
df_ab_metrics = pd.DataFrame(ab_metrics)
print(df_ab_metrics.to_string(index=False))
storeResults('AdaBoost', 'Original Data',
    metrics.accuracy_score(y_test, y_test_ab),
    metrics.f1_score(y_test, y_test_ab, average='macro'),
    metrics.recall_score(y_test, y_test_ab, average='macro'),
    metrics.precision_score(y_test, y_test_ab, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ab_proba, multi_class='ovr', average='macro')
)

# Create Stacking Classifier
print("\n=== Creating Stacking Classifier ===")

# Use scikit-learn's StackingClassifier (prevents data leakage)
stacking_clf = StackingClassifier(
    estimators=[
        ('svm', best_svm),
        ('gb', best_gb),
        ('ab', best_ab)
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

print("\nTraining stacking classifier...")
stacking_clf.fit(X_train, y_train)

# Get final stacking predictions
y_train_stack_pred = stacking_clf.predict(X_train)
y_test_stack_pred = stacking_clf.predict(X_test)
y_train_stack_proba = stacking_clf.predict_proba(X_train)
y_test_stack_proba = stacking_clf.predict_proba(X_test)

# Calculate stacking metrics
print("\n=== Stacking Classifier Performance ===")
stack_metrics = {
    "Dataset": ["Training", "Test"],
    "Accuracy": [
        metrics.accuracy_score(y_train, y_train_stack_pred),
        metrics.accuracy_score(y_test, y_test_stack_pred),
    ],
    "F1 Score": [
        metrics.f1_score(y_train, y_train_stack_pred, average='macro'),
        metrics.f1_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "Recall": [
        metrics.recall_score(y_train, y_train_stack_pred, average='macro'),
        metrics.recall_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "Precision": [
        metrics.precision_score(y_train, y_train_stack_pred, average='macro'),
        metrics.precision_score(y_test, y_test_stack_pred, average='macro'),
    ],
    "AUC-ROC": [
        metrics.roc_auc_score(pd.get_dummies(y_train), y_train_stack_proba, multi_class='ovr', average='macro'),
        metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_proba, multi_class='ovr', average='macro'),
    ]
}

df_stack_metrics = pd.DataFrame(stack_metrics)
print("\nStacking Classifier Performance Metrics")
print(df_stack_metrics.to_string(index=False))

storeResults(
    'Stacking Classifier',
    'Mixed Preprocessing',
    metrics.accuracy_score(y_test, y_test_stack_pred),
    metrics.f1_score(y_test, y_test_stack_pred, average='macro'),
    metrics.recall_score(y_test, y_test_stack_pred, average='macro'),
    metrics.precision_score(y_test, y_test_stack_pred, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_proba, multi_class='ovr', average='macro')
)

# Also try with different meta-learners
print("\n=== Trying Alternative Meta-Learners ===")

# Try with Random Forest as meta-learner
stacking_rf = StackingClassifier(
    estimators=[
        ('svm', best_svm),
        ('gb', best_gb),
        ('ab', best_ab)
    ],
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

stacking_rf.fit(X_train, y_train)
y_test_stack_rf_pred = stacking_rf.predict(X_test)
y_test_stack_rf_proba = stacking_rf.predict_proba(X_test)

print("\nStacking with Random Forest Meta-Learner:")
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_test_stack_rf_pred):.4f}")
print(f"Test F1 Score: {metrics.f1_score(y_test, y_test_stack_rf_pred, average='macro'):.4f}")
print(f"Test AUC-ROC: {metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_rf_proba, multi_class='ovr', average='macro'):.4f}")

storeResults(
    'Stacking Classifier (RF Meta)',
    'Mixed Preprocessing',
    metrics.accuracy_score(y_test, y_test_stack_rf_pred),
    metrics.f1_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.recall_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.precision_score(y_test, y_test_stack_rf_pred, average='macro'),
    metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stack_rf_proba, multi_class='ovr', average='macro')
)


=== SVM Feature Selection Pipeline ===
Optimal number of features to select using SelectKBest for SVM: 24
Optimal number of features selected by RFECV for SVM: 24
Number of components that explain 90.0% variance: 19

=== Training Base Models ===

Training SVM with PCA configuration...

=== Individual SVM Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.992089  0.969245 0.944444   0.997131 0.999593
    Test  0.990521  0.964940 0.939394   0.996564 1.000000

Storing results for SVM (PCA):
  Accuracy: 0.9905
  F1-Score: 0.9649
  Recall: 0.9394
  Precision: 0.9966
  AUC-ROC: 1.0000

Training Gradient Boosting with Normalized Data configuration...

=== Individual Gradient Boosting Performance ===
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.962085  0.820578 0.734848   0.986667 0.990107

Storing results for Gradient Boosting (Normalized Data):
  Accuracy: 0.9621
  F1-Score:

In [8]:
# Dataset Diagnostic Code - Find out why you're getting unrealistic results
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def diagnose_dataset(X, y):
    """
    Comprehensive dataset diagnosis to find why results are unrealistic
    """
    print("=" * 60)
    print("DATASET DIAGNOSTIC REPORT")
    print("=" * 60)
    
    # Basic statistics
    print(f"\n1. BASIC DATASET INFO:")
    print(f"   Dataset shape: {X.shape}")
    print(f"   Number of samples: {X.shape[0]}")
    print(f"   Number of features: {X.shape[1]}")
    print(f"   Number of classes: {len(np.unique(y))}")
    print(f"   Class labels: {np.unique(y)}")
    
    # Check class distribution
    class_counts = np.bincount(y)
    class_percentages = class_counts / len(y) * 100
    print(f"\n2. CLASS DISTRIBUTION:")
    for i, (count, pct) in enumerate(zip(class_counts, class_percentages)):
        print(f"   Class {i}: {count} samples ({pct:.1f}%)")
    
    # Check for severe class imbalance
    min_class_pct = np.min(class_percentages)
    if min_class_pct < 5:
        print(f"   ⚠️  SEVERE CLASS IMBALANCE: Smallest class = {min_class_pct:.1f}%")
    
    # Samples per feature ratio
    samples_per_feature = X.shape[0] / X.shape[1]
    print(f"\n3. SAMPLES PER FEATURE RATIO:")
    print(f"   Ratio: {samples_per_feature:.2f}")
    if samples_per_feature < 10:
        print(f"   🚨 CRITICAL: Too few samples per feature! Need >10, have {samples_per_feature:.2f}")
        print(f"   This WILL cause overfitting!")
    
    # Check for perfect separability with simple model
    print(f"\n4. SEPARABILITY TEST:")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Test with simple logistic regression
    simple_model = LogisticRegression(max_iter=1000)
    simple_model.fit(X_train, y_train)
    simple_acc = simple_model.score(X_test, y_test)
    print(f"   Simple Logistic Regression accuracy: {simple_acc:.4f}")
    
    if simple_acc > 0.95:
        print(f"   🚨 PROBLEM: Even simple model gets >95% accuracy!")
        print(f"   This suggests the dataset is artificially easy")
    
    # Check for duplicate rows
    print(f"\n5. DATA QUALITY CHECKS:")
    if hasattr(X, 'duplicated'):
        duplicates = X.duplicated().sum()
    else:
        df_temp = pd.DataFrame(X)
        duplicates = df_temp.duplicated().sum()
    print(f"   Duplicate rows: {duplicates}")
    
    # Check for constant features
    if hasattr(X, 'nunique'):
        constant_features = (X.nunique() <= 1).sum()
    else:
        constant_features = np.sum([len(np.unique(X[:, i])) <= 1 for i in range(X.shape[1])])
    print(f"   Constant features: {constant_features}")
    
    # Check feature variance
    if hasattr(X, 'values'):
        X_array = X.values
    else:
        X_array = X
    
    feature_vars = np.var(X_array, axis=0)
    zero_var_features = np.sum(feature_vars == 0)
    very_low_var = np.sum(feature_vars < 0.01)
    print(f"   Zero variance features: {zero_var_features}")
    print(f"   Very low variance features: {very_low_var}")
    
    # Check correlation between features and target
    print(f"\n6. FEATURE-TARGET CORRELATION:")
    if hasattr(X, 'corrwith'):
        correlations = []
        for i in range(X.shape[1]):
            corr = np.corrcoef(X.iloc[:, i], y)[0, 1]
            correlations.append(abs(corr))
    else:
        correlations = []
        for i in range(X.shape[1]):
            corr = np.corrcoef(X_array[:, i], y)[0, 1]
            if not np.isnan(corr):
                correlations.append(abs(corr))
    
    if correlations:
        max_corr = np.max(correlations)
        high_corr_features = np.sum(np.array(correlations) > 0.8)
        print(f"   Maximum feature-target correlation: {max_corr:.4f}")
        print(f"   Features with >0.8 correlation: {high_corr_features}")
        
        if max_corr > 0.9:
            print(f"   🚨 PROBLEM: Feature with >0.9 correlation! Almost perfect predictor!")
        elif high_corr_features > 3:
            print(f"   ⚠️  WARNING: {high_corr_features} features with very high correlation")
    
    # Test random baseline
    print(f"\n7. RANDOM BASELINE TEST:")
    np.random.seed(42)
    random_predictions = np.random.choice(np.unique(y), size=len(y_test))
    random_acc = accuracy_score(y_test, random_predictions)
    expected_random = 1.0 / len(np.unique(y))
    print(f"   Random prediction accuracy: {random_acc:.4f}")
    print(f"   Expected random accuracy: {expected_random:.4f}")
    
    # Feature importance check
    print(f"\n8. FEATURE IMPORTANCE CHECK:")
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    feature_importance = rf.feature_importances_
    
    top_feature_importance = np.max(feature_importance)
    dominant_features = np.sum(feature_importance > 0.1)
    
    print(f"   Highest feature importance: {top_feature_importance:.4f}")
    print(f"   Features with >10% importance: {dominant_features}")
    
    if top_feature_importance > 0.5:
        print(f"   🚨 PROBLEM: One feature dominates (importance = {top_feature_importance:.4f})")
        print(f"   This suggests artificial/synthetic data or data leakage")
    
    # Overall assessment
    print(f"\n" + "=" * 60)
    print("OVERALL ASSESSMENT:")
    
    problems = []
    if samples_per_feature < 10:
        problems.append("Too few samples per feature")
    if simple_acc > 0.95:
        problems.append("Artificially easy classification task")
    if max_corr > 0.9:
        problems.append("Perfect predictor feature exists")
    if top_feature_importance > 0.5:
        problems.append("Single feature dominates")
    if zero_var_features > 0:
        problems.append("Constant features present")
    
    if problems:
        print("🚨 MAJOR ISSUES FOUND:")
        for problem in problems:
            print(f"   • {problem}")
        print("\nRECOMMENDATION: Your dataset has fundamental issues.")
        print("The high accuracy is NOT due to good model performance,")
        print("but due to dataset problems. Consider:")
        print("   1. Getting a larger, more realistic dataset")
        print("   2. Removing constant/dominant features") 
        print("   3. Checking for data leakage in data collection")
        print("   4. Using a more challenging benchmark dataset")
    else:
        print("✅ Dataset appears reasonable")
        print("High accuracy might be legitimate")
    
    print("=" * 60)

# Usage:
# diagnose_dataset(X, y)

# Additional function to check if dataset is synthetic/artificial
def check_if_synthetic(X, y):
    """
    Check if dataset appears to be artificially generated
    """
    print("\n" + "=" * 40)
    print("SYNTHETIC DATA CHECK")
    print("=" * 40)
    
    # Check for artificial patterns
    if hasattr(X, 'values'):
        X_array = X.values
    else:
        X_array = X
    
    # Check for too-perfect distributions
    feature_means = np.mean(X_array, axis=0)
    feature_stds = np.std(X_array, axis=0)
    
    # Check if all features have similar scales (sign of artificial data)
    if np.all(feature_means >= 0) and np.all(feature_means <= 1) and np.all(feature_stds < 0.5):
        print("⚠️  All features in [0,1] range with low variance - possibly artificial")
    
    # Check for integer-only values (common in synthetic datasets)
    integer_features = 0
    for i in range(X_array.shape[1]):
        if np.allclose(X_array[:, i], X_array[:, i].astype(int)):
            integer_features += 1
    
    integer_pct = integer_features / X_array.shape[1] * 100
    print(f"Features with only integer values: {integer_features}/{X_array.shape[1]} ({integer_pct:.1f}%)")
    
    if integer_pct > 80:
        print("🚨 SUSPICIOUS: >80% features are integer-only (sign of synthetic data)")
    
    # Check class separability in 2D
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_array)
    
    # Visual check would require plotting, but we can check variance explained
    variance_explained = np.sum(pca.explained_variance_ratio_)
    print(f"Variance explained by first 2 PCA components: {variance_explained:.4f}")
    
    if variance_explained > 0.95:
        print("🚨 PROBLEM: 95%+ variance in just 2 components - artificially simple data")

# Run both diagnostics
print("Run these commands:")
print("diagnose_dataset(X, y)")
print("check_if_synthetic(X, y)")

Run these commands:
diagnose_dataset(X, y)
check_if_synthetic(X, y)


In [9]:
diagnose_dataset(X, y)
check_if_synthetic(X, y)

DATASET DIAGNOSTIC REPORT

1. BASIC DATASET INFO:
   Dataset shape: (843, 25)
   Number of samples: 843
   Number of features: 25
   Number of classes: 3
   Class labels: [0 1 2]

2. CLASS DISTRIBUTION:
   Class 0: 32 samples (3.8%)
   Class 1: 768 samples (91.1%)
   Class 2: 43 samples (5.1%)
   ⚠️  SEVERE CLASS IMBALANCE: Smallest class = 3.8%

3. SAMPLES PER FEATURE RATIO:
   Ratio: 33.72

4. SEPARABILITY TEST:
   Simple Logistic Regression accuracy: 0.9882
   🚨 PROBLEM: Even simple model gets >95% accuracy!
   This suggests the dataset is artificially easy

5. DATA QUALITY CHECKS:
   Duplicate rows: 27
   Constant features: 0
   Zero variance features: 0
   Very low variance features: 0

6. FEATURE-TARGET CORRELATION:
   Maximum feature-target correlation: 0.2966
   Features with >0.8 correlation: 0

7. RANDOM BASELINE TEST:
   Random prediction accuracy: 0.3018
   Expected random accuracy: 0.3333

8. FEATURE IMPORTANCE CHECK:
   Highest feature importance: 0.0602
   Features with 

---

# END