In [1]:
import pickle
import pandas as pd
import numpy as np
from smote_sample import X, y
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

Disease
Atherosclerosis                         152809
Hypertension                            115644
Cardiovascular Disease (CVD)             99122
Chronic Fatigue Syndrome (CFS)           53545
Respiratory Disease (COPD or Asthma)     28039
Stress-related Disorders                   352
Arrhythmias                                284
Healthy                                     67
Autonomic Dysfunction                       65
Diabetes                                    48
Anaemia                                     25
Name: count, dtype: int64


<h5 style="color: SkyBlue;">Load Dataset</h5>

In [2]:
# Load the raw (unclassified) measurements from the CSV export.
df = pd.read_csv('user_data_for_disease_prediction - unclassified data set.csv')
# Preview the first rows. A bare last expression is rendered by the notebook as
# one table, whereas print() wraps a wide frame into several plain-text
# column chunks that are hard to read.
df.head()

   Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
0                 0       -0.9        0.4        49.7      67.9   
1     

In [3]:
# Restore the (features, labels) pair stored in balanced_data.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it was produced by a trusted run.
with open("balanced_data.pkl", "rb") as pkl_file:
    balanced_pair = pickle.load(pkl_file)
X_balanced, y_balanced = balanced_pair

# Sanity-check what was loaded: shapes first, then the per-class row counts.
print("Loaded Feature Shape:", X_balanced.shape)
print("Loaded Target Shape:", y_balanced.shape)
print("Loaded Distribution after SMOTE:\n", y_balanced.value_counts())

Loaded Feature Shape: (1680899, 17)
Loaded Target Shape: (1680899,)
Loaded Distribution after SMOTE:
 Disease
Chronic Fatigue Syndrome (CFS)          152809
Atherosclerosis                         152809
Hypertension                            152809
Cardiovascular Disease (CVD)            152809
Respiratory Disease (COPD or Asthma)    152809
Autonomic Dysfunction                   152809
Arrhythmias                             152809
Anaemia                                 152809
Stress-related Disorders                152809
Diabetes                                152809
Healthy                                 152809
Name: count, dtype: int64


<h5 style="color: SkyBlue;">Stratify Sampling</h5>

In [4]:
# Total number of rows used per experiment; each total is divided into a
# training part and a testing part below.
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]

# One fifth (integer division) of each total is held out for testing; the
# training part is never allowed to drop below one row.
training_sample_sizes = []
testing_sample_sizes = []
for total in sample_sizes:
    n_train = max(total - total // 5, 1)
    training_sample_sizes.append(n_train)
    testing_sample_sizes.append(total - n_train)

print("Training sample sizes:", training_sample_sizes)
print("Testing sample sizes:", testing_sample_sizes)

Training sample sizes: [20, 40, 60, 80, 200, 400, 600, 800, 2000, 4000, 6000, 8000, 16000, 24000, 32000, 40000]
Testing sample sizes: [5, 10, 15, 20, 50, 100, 150, 200, 500, 1000, 1500, 2000, 4000, 6000, 8000, 10000]


<h5 style="color: SkyBlue;">Logistic Regression</h5>

In [5]:
# Encode the disease names as integer class ids.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# The encoder was fitted on y_balanced itself, so every class id occurs in
# y_balanced_encoded and classes_ already lists the names in id order.
# Indexing it with np.unique(y_balanced_encoded) only re-sorted the ~1.7M
# encoded labels to produce the same array.
class_labels = label_encoder.classes_

# Stratified 80/20 split: both parts keep the class proportions of the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions (evaluate_model, expand_test_set, flatten_classification_report)

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier on a test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True encoded labels aligned with ``X_test``.
        class_labels: Display name of every class; position ``i`` names class id ``i``.

    Returns:
        Tuple ``(accuracy, report, predictions)``: the accuracy score, the
        classification report as a nested dict, and the predicted labels.
    """
    y_pred = model.predict(X_test)

    # Report on every class id 0..len(class_labels)-1, including ids absent
    # from this test set; zero_division=0 reports their undefined metrics as 0.
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    return accuracy_score(y_test, y_pred), report_dict, y_pred

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate every test row ``repeat_factor`` times, keeping the copies adjacent.

    Args:
        X_test: Feature rows; repeated along axis 0.
        y_test: Labels aligned with ``X_test``.
        repeat_factor: Number of copies to make of each row.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)`` as produced by ``np.repeat``.
    """
    repeated_features = np.repeat(X_test, repeat_factor, axis=0)
    repeated_labels = np.repeat(y_test, repeat_factor)
    return repeated_features, repeated_labels

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into one flat record.

    Args:
        report: Mapping of label -> metrics. Only entries whose value is a dict
            are expanded; any other entry (e.g. a bare float) is skipped.
        sample_size: Total sample size of the experiment.
        train_size: Number of training rows used.
        test_size: Number of test rows used.
        accuracy: Accuracy stored under the ``"accuracy"`` key.

    Returns:
        Dict holding the four run fields followed by one
        ``"<label>_<metric>"`` entry per metric of every expanded label.
    """
    flat_row = {
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    nested_entries = (
        (label, metrics) for label, metrics in report.items() if isinstance(metrics, dict)
    )
    for label, metrics in nested_entries:
        flat_row.update(
            {f"{label}_{metric_name}": value for metric_name, value in metrics.items()}
        )
    return flat_row

# Main training loop: fit one logistic regression per sample size, record its
# metrics, and remember the most accurate model.
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Number of copies made of every test row before scoring (loop-invariant).
# NOTE(review): duplicating rows leaves accuracy/precision/recall unchanged and
# only multiplies the reported support by this factor.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Subset training and testing data by taking the leading rows of each split.
    # NOTE(review): a head slice is not re-stratified, so small subsets can miss
    # classes entirely; those classes then appear with zero support below.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Logistic Regression Model
    log_reg = LogisticRegression(
        max_iter=200, class_weight='balanced', solver='saga', random_state=42
    )
    log_reg.fit(X_train_subset, y_train_subset)

    # Evaluate the model (dict-form report is kept for the results table)
    accuracy, report, predictions = evaluate_model(
        log_reg, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report always lists every class.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
         y_test_expanded,
         predictions,
         labels=np.arange(len(class_labels)),  # Include all class indices
         target_names=class_labels,  # Use the full set of class labels
         zero_division=0
    ))

    # Flatten and store results
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Check for best model; strict '>' means ties keep the earlier sample size.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = log_reg
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Logistic Regression with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       0.0
                         Arrhythmias       0.00      0.00      0.00       0.0
                     Atherosclerosis       0.00      0.00      0.00       0.0
               Autonomic Dysfunction       0.00      0.00      0.00      11.0
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       0.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                            Diabetes       0.00      0.00      0.00       0.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0
            Stress-related Disorders       0.00      0.00      0.00       0.0

                           




Sample size 750 - Accuracy: 0.5133
                                      precision    recall  f1-score   support

                             Anaemia       0.50      0.53      0.51       187
                         Arrhythmias       0.46      0.67      0.55        99
                     Atherosclerosis       0.67      0.55      0.60       121
               Autonomic Dysfunction       0.55      0.32      0.40       209
        Cardiovascular Disease (CVD)       0.64      0.58      0.61       132
      Chronic Fatigue Syndrome (CFS)       0.43      0.50      0.46       132
                            Diabetes       0.46      0.60      0.52       110
                             Healthy       0.24      0.50      0.32       110
                        Hypertension       0.71      0.56      0.62       198
Respiratory Disease (COPD or Asthma)       0.55      0.33      0.41       198
            Stress-related Disorders       0.67      0.71      0.69       154

                          




Sample size 2500 - Accuracy: 0.6000
                                      precision    recall  f1-score   support

                             Anaemia       0.54      0.72      0.62       517
                         Arrhythmias       0.63      0.62      0.63       462
                     Atherosclerosis       0.71      0.58      0.64       605
               Autonomic Dysfunction       0.37      0.24      0.29       495
        Cardiovascular Disease (CVD)       0.69      0.66      0.67       517
      Chronic Fatigue Syndrome (CFS)       0.58      0.57      0.58       539
                            Diabetes       0.49      0.47      0.48       396
                             Healthy       0.57      0.82      0.67       484
                        Hypertension       0.78      0.72      0.75       583
Respiratory Disease (COPD or Asthma)       0.53      0.43      0.47       462
            Stress-related Disorders       0.62      0.72      0.67       440

                         




Sample size 5000 - Accuracy: 0.6440
                                      precision    recall  f1-score   support

                             Anaemia       0.64      0.76      0.69       968
                         Arrhythmias       0.66      0.72      0.69       891
                     Atherosclerosis       0.63      0.60      0.61      1089
               Autonomic Dysfunction       0.45      0.35      0.40      1034
        Cardiovascular Disease (CVD)       0.76      0.67      0.71      1122
      Chronic Fatigue Syndrome (CFS)       0.61      0.64      0.62      1089
                            Diabetes       0.51      0.56      0.53       792
                             Healthy       0.66      0.78      0.72      1012
                        Hypertension       0.79      0.74      0.76      1133
Respiratory Disease (COPD or Asthma)       0.60      0.57      0.58       869
            Stress-related Disorders       0.71      0.69      0.70      1001

                         




Sample size 7500 - Accuracy: 0.6607
                                      precision    recall  f1-score   support

                             Anaemia       0.67      0.76      0.71      1463
                         Arrhythmias       0.70      0.77      0.73      1375
                     Atherosclerosis       0.62      0.61      0.62      1507
               Autonomic Dysfunction       0.49      0.37      0.42      1507
        Cardiovascular Disease (CVD)       0.78      0.70      0.74      1573
      Chronic Fatigue Syndrome (CFS)       0.63      0.63      0.63      1650
                            Diabetes       0.56      0.55      0.56      1254
                             Healthy       0.65      0.83      0.73      1628
                        Hypertension       0.79      0.75      0.77      1639
Respiratory Disease (COPD or Asthma)       0.62      0.55      0.58      1408
            Stress-related Disorders       0.72      0.71      0.71      1496

                         




Sample size 10000 - Accuracy: 0.6645
                                      precision    recall  f1-score   support

                             Anaemia       0.64      0.76      0.69      1914
                         Arrhythmias       0.74      0.79      0.77      1980
                     Atherosclerosis       0.63      0.63      0.63      1936
               Autonomic Dysfunction       0.52      0.39      0.44      2057
        Cardiovascular Disease (CVD)       0.78      0.71      0.75      2068
      Chronic Fatigue Syndrome (CFS)       0.63      0.65      0.64      2211
                            Diabetes       0.55      0.51      0.53      1606
                             Healthy       0.65      0.83      0.73      2101
                        Hypertension       0.76      0.75      0.75      2145
Respiratory Disease (COPD or Asthma)       0.62      0.55      0.58      1892
            Stress-related Disorders       0.73      0.70      0.72      2090

                        




Sample size 20000 - Accuracy: 0.6860
                                      precision    recall  f1-score   support

                             Anaemia       0.66      0.75      0.70      3762
                         Arrhythmias       0.75      0.82      0.78      4059
                     Atherosclerosis       0.64      0.62      0.63      3883
               Autonomic Dysfunction       0.51      0.41      0.46      3806
        Cardiovascular Disease (CVD)       0.78      0.75      0.77      4290
      Chronic Fatigue Syndrome (CFS)       0.66      0.71      0.68      4169
                            Diabetes       0.66      0.55      0.60      3641
                             Healthy       0.69      0.84      0.76      4345
                        Hypertension       0.77      0.76      0.76      4268
Respiratory Disease (COPD or Asthma)       0.63      0.58      0.60      3773
            Stress-related Disorders       0.72      0.70      0.71      4004

                        




Sample size 30000 - Accuracy: 0.6998
                                      precision    recall  f1-score   support

                             Anaemia       0.68      0.77      0.72      5731
                         Arrhythmias       0.76      0.81      0.78      6226
                     Atherosclerosis       0.65      0.65      0.65      5753
               Autonomic Dysfunction       0.54      0.43      0.48      5962
        Cardiovascular Disease (CVD)       0.79      0.78      0.78      6204
      Chronic Fatigue Syndrome (CFS)       0.68      0.75      0.71      6039
                            Diabetes       0.68      0.57      0.62      5698
                             Healthy       0.70      0.87      0.78      6358
                        Hypertension       0.76      0.75      0.76      6171
Respiratory Disease (COPD or Asthma)       0.67      0.59      0.63      5797
            Stress-related Disorders       0.73      0.70      0.72      6061

                        




Sample size 40000 - Accuracy: 0.7096
                                      precision    recall  f1-score   support

                             Anaemia       0.69      0.79      0.74      8019
                         Arrhythmias       0.75      0.80      0.78      8217
                     Atherosclerosis       0.67      0.65      0.66      7986
               Autonomic Dysfunction       0.58      0.47      0.52      8052
        Cardiovascular Disease (CVD)       0.78      0.78      0.78      8283
      Chronic Fatigue Syndrome (CFS)       0.70      0.75      0.73      8052
                            Diabetes       0.69      0.58      0.63      7711
                             Healthy       0.71      0.88      0.79      8162
                        Hypertension       0.76      0.75      0.76      8063
Respiratory Disease (COPD or Asthma)       0.67      0.59      0.63      7568
            Stress-related Disorders       0.74      0.72      0.73      7887

                        



<h5 style="color: SkyBlue;">Decision Tree</h5>

In [6]:
# Encode the disease names as integer class ids.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# The encoder was fitted on y_balanced itself, so every class id occurs in
# y_balanced_encoded and classes_ already lists the names in id order.
# Indexing it with np.unique(y_balanced_encoded) only re-sorted the ~1.7M
# encoded labels to produce the same array.
class_labels = label_encoder.classes_

# Stratified 80/20 split: both parts keep the class proportions of the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions remain the same (evaluate_model, expand_test_set, flatten_classification_report)

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier on a test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True encoded labels aligned with ``X_test``.
        class_labels: Display name of every class; position ``i`` names class id ``i``.

    Returns:
        Tuple ``(accuracy, report, predictions)``: the accuracy score, the
        classification report as a nested dict, and the predicted labels.
    """
    y_pred = model.predict(X_test)

    # Report on every class id 0..len(class_labels)-1, including ids absent
    # from this test set; zero_division=0 reports their undefined metrics as 0.
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    return accuracy_score(y_test, y_pred), report_dict, y_pred

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate every test row ``repeat_factor`` times, keeping the copies adjacent.

    Args:
        X_test: Feature rows; repeated along axis 0.
        y_test: Labels aligned with ``X_test``.
        repeat_factor: Number of copies to make of each row.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)`` as produced by ``np.repeat``.
    """
    repeated_features = np.repeat(X_test, repeat_factor, axis=0)
    repeated_labels = np.repeat(y_test, repeat_factor)
    return repeated_features, repeated_labels

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into one flat record.

    Args:
        report: Mapping of label -> metrics. Only entries whose value is a dict
            are expanded; any other entry (e.g. a bare float) is skipped.
        sample_size: Total sample size of the experiment.
        train_size: Number of training rows used.
        test_size: Number of test rows used.
        accuracy: Accuracy stored under the ``"accuracy"`` key.

    Returns:
        Dict holding the four run fields followed by one
        ``"<label>_<metric>"`` entry per metric of every expanded label.
    """
    flat_row = {
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    nested_entries = (
        (label, metrics) for label, metrics in report.items() if isinstance(metrics, dict)
    )
    for label, metrics in nested_entries:
        flat_row.update(
            {f"{label}_{metric_name}": value for metric_name, value in metrics.items()}
        )
    return flat_row

# Main training loop: fit one decision tree per sample size, record its
# metrics, and remember the most accurate model.
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Number of copies made of every test row before scoring (loop-invariant).
# NOTE(review): duplicating rows leaves accuracy/precision/recall unchanged and
# only multiplies the reported support by this factor.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Subset training and testing data by taking the leading rows of each split.
    # NOTE(review): a head slice is not re-stratified, so small subsets can miss
    # classes entirely; those classes then appear with zero support below.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Decision Tree Model
    decision_tree = DecisionTreeClassifier(
        class_weight='balanced', random_state=42
    )
    decision_tree.fit(X_train_subset, y_train_subset)

    # Evaluate the model (dict-form report is kept for the results table)
    accuracy, report, predictions = evaluate_model(
        decision_tree, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report always lists every class.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Check for best model; strict '>' means ties keep the earlier sample size.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = decision_tree
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Decision Tree with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.2000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00         0
                         Arrhythmias       0.00      0.00      0.00         0
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       1.00      1.00      1.00        11
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       0.00      0.00      0.00         0
                             Healthy       0.00      0.00      0.00        11
                        Hypertension       0.00      0.00      0.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        11
            Stress-related Disorders       0.00      0.00      0.00         0

                           




Sample size 7500 - Accuracy: 0.9913
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1463
                         Arrhythmias       1.00      0.98      0.99      1375
                     Atherosclerosis       1.00      1.00      1.00      1507
               Autonomic Dysfunction       1.00      1.00      1.00      1507
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      1573
      Chronic Fatigue Syndrome (CFS)       0.99      0.99      0.99      1650
                            Diabetes       1.00      1.00      1.00      1254
                             Healthy       1.00      1.00      1.00      1628
                        Hypertension       0.97      0.97      0.97      1639
Respiratory Disease (COPD or Asthma)       0.96      0.96      0.96      1408
            Stress-related Disorders       0.99      1.00      1.00      1496

                         




Sample size 20000 - Accuracy: 0.9978
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      3762
                         Arrhythmias       1.00      1.00      1.00      4059
                     Atherosclerosis       1.00      1.00      1.00      3883
               Autonomic Dysfunction       1.00      1.00      1.00      3806
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      4290
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      4169
                            Diabetes       1.00      1.00      1.00      3641
                             Healthy       1.00      0.99      1.00      4345
                        Hypertension       0.99      0.99      0.99      4268
Respiratory Disease (COPD or Asthma)       0.99      1.00      1.00      3773
            Stress-related Disorders       1.00      1.00      1.00      4004

                        




Sample size 30000 - Accuracy: 0.9972
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      5731
                         Arrhythmias       1.00      1.00      1.00      6226
                     Atherosclerosis       1.00      1.00      1.00      5753
               Autonomic Dysfunction       1.00      1.00      1.00      5962
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      6204
      Chronic Fatigue Syndrome (CFS)       0.99      0.99      0.99      6039
                            Diabetes       1.00      1.00      1.00      5698
                             Healthy       1.00      1.00      1.00      6358
                        Hypertension       0.99      0.99      0.99      6171
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99      5797
            Stress-related Disorders       1.00      1.00      1.00      6061

                        




Sample size 40000 - Accuracy: 0.9978
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      8019
                         Arrhythmias       1.00      1.00      1.00      8217
                     Atherosclerosis       1.00      1.00      1.00      7986
               Autonomic Dysfunction       1.00      1.00      1.00      8052
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      8283
      Chronic Fatigue Syndrome (CFS)       0.99      0.99      0.99      8052
                            Diabetes       1.00      1.00      1.00      7711
                             Healthy       1.00      1.00      1.00      8162
                        Hypertension       0.99      0.99      0.99      8063
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99      7568
            Stress-related Disorders       1.00      1.00      1.00      7887

                        



<h5 style="color: SkyBlue;">Random Forest</h5>

In [7]:
# Encode the disease names as integer class ids.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# The encoder was fitted on y_balanced itself, so every class id occurs in
# y_balanced_encoded and classes_ already lists the names in id order.
# Indexing it with np.unique(y_balanced_encoded) only re-sorted the ~1.7M
# encoded labels to produce the same array.
class_labels = label_encoder.classes_

# Stratified 80/20 split: both parts keep the class proportions of the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions remain the same (evaluate_model, expand_test_set, flatten_classification_report)

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier on a test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True encoded labels aligned with ``X_test``.
        class_labels: Display name of every class; position ``i`` names class id ``i``.

    Returns:
        Tuple ``(accuracy, report, predictions)``: the accuracy score, the
        classification report as a nested dict, and the predicted labels.
    """
    y_pred = model.predict(X_test)

    # Report on every class id 0..len(class_labels)-1, including ids absent
    # from this test set; zero_division=0 reports their undefined metrics as 0.
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    return accuracy_score(y_test, y_pred), report_dict, y_pred

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate every test row ``repeat_factor`` times, keeping the copies adjacent.

    Args:
        X_test: Feature rows; repeated along axis 0.
        y_test: Labels aligned with ``X_test``.
        repeat_factor: Number of copies to make of each row.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)`` as produced by ``np.repeat``.
    """
    repeated_features = np.repeat(X_test, repeat_factor, axis=0)
    repeated_labels = np.repeat(y_test, repeat_factor)
    return repeated_features, repeated_labels

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into one flat record.

    Args:
        report: Mapping of label -> metrics dict. Entries whose value is not
            a dict are skipped.
        sample_size: Stored under ``"sample_size"``.
        train_size: Stored under ``"train_size"``.
        test_size: Stored under ``"test_size"``.
        accuracy: Stored under ``"accuracy"``.

    Returns:
        Dict holding the four run-level fields followed by one
        ``"<label>_<metric>"`` key per metric of every dict-valued entry.
    """
    flat_row = dict(
        sample_size=sample_size,
        train_size=train_size,
        test_size=test_size,
        accuracy=accuracy,
    )
    flat_row.update(
        (f"{label}_{metric_name}", value)
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric_name, value in metrics.items()
    )
    return flat_row

# Main training loop: fit one Random Forest per (sample_size, train_size,
# test_size) triple and remember the most accurate one.
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Every test row is duplicated this many times before scoring. The factor never
# changes, so it is set once rather than on each iteration.
# NOTE(review): repeating each row the same number of times multiplies the
# "support" column only; accuracy, precision and recall are ratios and are
# unaffected. Confirm the inflated support is intended.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Use the leading rows of the fixed train/test split for this iteration
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # A fresh estimator per iteration, so a previously stored best_model is
    # never refitted.
    random_forest = RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1
    )
    random_forest.fit(X_train_subset, y_train_subset)

    # Evaluate the model
    accuracy, report, predictions = evaluate_model(
        random_forest, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results. evaluate_model already produced this report as
    # a dict; this call only renders the text table over the full label set.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Strict ">" means that on a tie the earlier (smaller) sample size is kept
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = random_forest
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Random Forest with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       0.0
                         Arrhythmias       0.00      0.00      0.00       0.0
                     Atherosclerosis       0.00      0.00      0.00       0.0
               Autonomic Dysfunction       0.00      0.00      0.00      11.0
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       0.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                            Diabetes       0.00      0.00      0.00       0.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0
            Stress-related Disorders       0.00      0.00      0.00       0.0

                           




Sample size 75 - Accuracy: 0.5333
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        11
                         Arrhythmias       1.00      1.00      1.00        11
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.20      0.50      0.29        22
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       1.00      1.00      1.00        11
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       1.00      0.67      0.80        33
            Stress-related Disorders       0.50      1.00      0.67        22

                           




Sample size 250 - Accuracy: 0.9400
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        33
                         Arrhythmias       0.75      1.00      0.86        33
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       1.00      1.00      1.00        88
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        44
      Chronic Fatigue Syndrome (CFS)       1.00      0.83      0.91        66
                            Diabetes       1.00      1.00      1.00        55
                             Healthy       1.00      1.00      1.00        55
                        Hypertension       1.00      0.83      0.91        66
Respiratory Disease (COPD or Asthma)       1.00      0.75      0.86        44
            Stress-related Disorders       0.75      1.00      0.86        66

                          




Sample size 750 - Accuracy: 0.9533
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       187
                         Arrhythmias       1.00      1.00      1.00        99
                     Atherosclerosis       0.90      0.82      0.86       121
               Autonomic Dysfunction       0.95      1.00      0.97       209
        Cardiovascular Disease (CVD)       1.00      0.92      0.96       132
      Chronic Fatigue Syndrome (CFS)       1.00      0.92      0.96       132
                            Diabetes       1.00      1.00      1.00       110
                             Healthy       1.00      1.00      1.00       110
                        Hypertension       0.94      0.89      0.91       198
Respiratory Disease (COPD or Asthma)       0.81      0.94      0.87       198
            Stress-related Disorders       1.00      1.00      1.00       154

                          




Sample size 2500 - Accuracy: 0.9900
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       517
                         Arrhythmias       1.00      1.00      1.00       462
                     Atherosclerosis       0.98      0.98      0.98       605
               Autonomic Dysfunction       1.00      1.00      1.00       495
        Cardiovascular Disease (CVD)       1.00      0.98      0.99       517
      Chronic Fatigue Syndrome (CFS)       1.00      0.94      0.97       539
                            Diabetes       1.00      1.00      1.00       396
                             Healthy       1.00      1.00      1.00       484
                        Hypertension       0.98      1.00      0.99       583
Respiratory Disease (COPD or Asthma)       0.93      1.00      0.97       462
            Stress-related Disorders       1.00      1.00      1.00       440

                         




Sample size 5000 - Accuracy: 0.9960
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       968
                         Arrhythmias       0.99      1.00      0.99       891
                     Atherosclerosis       0.99      0.99      0.99      1089
               Autonomic Dysfunction       1.00      1.00      1.00      1034
        Cardiovascular Disease (CVD)       1.00      0.99      1.00      1122
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      1089
                            Diabetes       1.00      1.00      1.00       792
                             Healthy       1.00      1.00      1.00      1012
                        Hypertension       0.99      1.00      1.00      1133
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99       869
            Stress-related Disorders       1.00      1.00      1.00      1001

                         




Sample size 7500 - Accuracy: 0.9973
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1463
                         Arrhythmias       1.00      1.00      1.00      1375
                     Atherosclerosis       0.99      0.99      0.99      1507
               Autonomic Dysfunction       1.00      1.00      1.00      1507
        Cardiovascular Disease (CVD)       1.00      0.99      1.00      1573
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      1650
                            Diabetes       1.00      1.00      1.00      1254
                             Healthy       1.00      1.00      1.00      1628
                        Hypertension       0.99      0.99      0.99      1639
Respiratory Disease (COPD or Asthma)       0.99      1.00      1.00      1408
            Stress-related Disorders       0.99      1.00      1.00      1496

                         




Sample size 10000 - Accuracy: 0.9975
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1914
                         Arrhythmias       1.00      1.00      1.00      1980
                     Atherosclerosis       0.99      1.00      0.99      1936
               Autonomic Dysfunction       1.00      1.00      1.00      2057
        Cardiovascular Disease (CVD)       1.00      0.99      0.99      2068
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      2211
                            Diabetes       1.00      1.00      1.00      1606
                             Healthy       1.00      1.00      1.00      2101
                        Hypertension       0.99      0.99      0.99      2145
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99      1892
            Stress-related Disorders       0.99      1.00      1.00      2090

                        




Sample size 20000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      3762
                         Arrhythmias       1.00      1.00      1.00      4059
                     Atherosclerosis       1.00      1.00      1.00      3883
               Autonomic Dysfunction       1.00      1.00      1.00      3806
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      4290
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      4169
                            Diabetes       1.00      1.00      1.00      3641
                             Healthy       1.00      1.00      1.00      4345
                        Hypertension       1.00      1.00      1.00      4268
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      3773
            Stress-related Disorders       1.00      1.00      1.00      4004

                        




Sample size 30000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      5731
                         Arrhythmias       1.00      1.00      1.00      6226
                     Atherosclerosis       1.00      1.00      1.00      5753
               Autonomic Dysfunction       1.00      1.00      1.00      5962
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      6204
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      6039
                            Diabetes       1.00      1.00      1.00      5698
                             Healthy       1.00      1.00      1.00      6358
                        Hypertension       1.00      1.00      1.00      6171
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      5797
            Stress-related Disorders       1.00      1.00      1.00      6061

                        




Sample size 40000 - Accuracy: 0.9986
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      8019
                         Arrhythmias       1.00      1.00      1.00      8217
                     Atherosclerosis       1.00      1.00      1.00      7986
               Autonomic Dysfunction       1.00      1.00      1.00      8052
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      8283
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      8052
                            Diabetes       1.00      1.00      1.00      7711
                             Healthy       1.00      1.00      1.00      8162
                        Hypertension       1.00      1.00      1.00      8063
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      7568
            Stress-related Disorders       1.00      1.00      1.00      7887

                        




Sample size 50000 - Accuracy: 0.9989
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      9856
                         Arrhythmias       1.00      1.00      1.00     10296
                     Atherosclerosis       1.00      1.00      1.00     10109
               Autonomic Dysfunction       1.00      1.00      1.00     10329
        Cardiovascular Disease (CVD)       1.00      1.00      1.00     10153
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      9966
                            Diabetes       1.00      1.00      1.00      9625
                             Healthy       1.00      1.00      1.00     10197
                        Hypertension       1.00      1.00      1.00      9889
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      9537
            Stress-related Disorders       1.00      1.00      1.00     10043

                        

<h5 style="color: SkyBlue;">Gradient Boosting (GBM)</h5>

In [8]:
# Encode the disease names in y_balanced as consecutive integer ids.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# classes_[i] is the original label for encoded id i. The encoder was fitted on
# this very array, so every id 0..n_classes-1 occurs in y_balanced_encoded and
# indexing classes_ by np.unique(y_balanced_encoded) would only reproduce
# classes_ after an extra pass over all labels.
class_labels = label_encoder.classes_

# Stratified 80/20 split with a fixed seed, so re-running the cell yields the
# same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions remain the same (evaluate_model, expand_test_set, flatten_classification_report)

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier against a labelled test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True encoded class ids for ``X_test``.
        class_labels: Display names for the classes; entry ``i`` names
            encoded class id ``i``.

    Returns:
        Tuple ``(accuracy, report, predictions)`` where ``report`` is the
        dictionary form of ``classification_report``.
    """
    y_pred = model.predict(X_test)
    overall_accuracy = accuracy_score(y_test, y_pred)

    # Ask for a row for every class id 0..len(class_labels)-1, including ids
    # that do not occur in this test subset. zero_division=0 makes metrics
    # that would divide by zero come out as 0.
    per_class_report = classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    return overall_accuracy, per_class_report, y_pred

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate every test sample ``repeat_factor`` times.

    Each row of ``X_test`` and the matching entry of ``y_test`` are repeated
    consecutively, so features and labels stay aligned.

    Args:
        X_test: Feature rows; repeated along axis 0.
        y_test: Labels, one per row of ``X_test``.
        repeat_factor: Number of copies of each sample in the result.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    return (
        np.repeat(X_test, repeat_factor, axis=0),
        np.repeat(y_test, repeat_factor),
    )

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into one flat record.

    Args:
        report: Mapping of label -> metrics dict. Entries whose value is not
            a dict are skipped.
        sample_size: Stored under ``"sample_size"``.
        train_size: Stored under ``"train_size"``.
        test_size: Stored under ``"test_size"``.
        accuracy: Stored under ``"accuracy"``.

    Returns:
        Dict holding the four run-level fields followed by one
        ``"<label>_<metric>"`` key per metric of every dict-valued entry.
    """
    flat_row = dict(
        sample_size=sample_size,
        train_size=train_size,
        test_size=test_size,
        accuracy=accuracy,
    )
    flat_row.update(
        (f"{label}_{metric_name}", value)
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric_name, value in metrics.items()
    )
    return flat_row

# Main training loop: fit one Gradient Boosting model per (sample_size,
# train_size, test_size) triple and remember the most accurate one.
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Every test row is duplicated this many times before scoring. The factor never
# changes, so it is set once rather than on each iteration.
# NOTE(review): repeating each row the same number of times multiplies the
# "support" column only; accuracy, precision and recall are ratios and are
# unaffected. Confirm the inflated support is intended.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Use the leading rows of the fixed train/test split for this iteration
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # A fresh estimator per iteration, so a previously stored best_model is
    # never refitted.
    gbm = GradientBoostingClassifier(
        n_estimators=100, random_state=42, max_depth=3
    )
    gbm.fit(X_train_subset, y_train_subset)

    # Evaluate the model
    accuracy, report, predictions = evaluate_model(
        gbm, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results. evaluate_model already produced this report as
    # a dict; this call only renders the text table over the full label set.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Strict ">" means that on a tie the earlier (smaller) sample size is kept
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = gbm
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Gradient Boosting with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       0.0
                         Arrhythmias       0.00      0.00      0.00       0.0
                     Atherosclerosis       0.00      0.00      0.00       0.0
               Autonomic Dysfunction       0.00      0.00      0.00      11.0
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       0.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                            Diabetes       0.00      0.00      0.00       0.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0
            Stress-related Disorders       0.00      0.00      0.00       0.0

                           




Sample size 50 - Accuracy: 0.5000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00         0
                         Arrhythmias       1.00      1.00      1.00        11
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.50      1.00      0.67        11
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       1.00      1.00      1.00        11
                             Healthy       1.00      0.50      0.67        22
                        Hypertension       0.00      0.00      0.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        22
            Stress-related Disorders       0.33      1.00      0.50        11

                           




Sample size 75 - Accuracy: 0.4667
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        11
                         Arrhythmias       1.00      1.00      1.00        11
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.33      0.50      0.40        22
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       1.00      1.00      1.00        11
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.50      0.33      0.40        33
            Stress-related Disorders       0.40      1.00      0.57        22

                           




Sample size 100 - Accuracy: 0.4500
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        33
                         Arrhythmias       1.00      1.00      1.00        11
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.50      0.50      0.50        22
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.33      0.50      0.40        22
                            Diabetes       1.00      1.00      1.00        22
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.50      0.25      0.33        44
            Stress-related Disorders       0.25      1.00      0.40        22

                          




Sample size 250 - Accuracy: 0.8800
                                      precision    recall  f1-score   support

                             Anaemia       0.50      1.00      0.67        33
                         Arrhythmias       1.00      1.00      1.00        33
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.83      0.62      0.71        88
        Cardiovascular Disease (CVD)       1.00      0.75      0.86        44
      Chronic Fatigue Syndrome (CFS)       1.00      0.83      0.91        66
                            Diabetes       1.00      1.00      1.00        55
                             Healthy       1.00      1.00      1.00        55
                        Hypertension       1.00      0.83      0.91        66
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00        44
            Stress-related Disorders       0.75      1.00      0.86        66

                          




Sample size 500 - Accuracy: 0.9500
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       110
                         Arrhythmias       1.00      1.00      1.00        66
                     Atherosclerosis       1.00      0.33      0.50        33
               Autonomic Dysfunction       0.93      1.00      0.96       143
        Cardiovascular Disease (CVD)       1.00      0.86      0.92        77
      Chronic Fatigue Syndrome (CFS)       1.00      0.91      0.95       121
                            Diabetes       1.00      1.00      1.00        88
                             Healthy       1.00      1.00      1.00        66
                        Hypertension       0.93      1.00      0.97       154
Respiratory Disease (COPD or Asthma)       0.83      0.91      0.87       121
            Stress-related Disorders       0.92      1.00      0.96       121

                          




Sample size 750 - Accuracy: 0.9533
                                      precision    recall  f1-score   support

                             Anaemia       0.89      1.00      0.94       187
                         Arrhythmias       1.00      1.00      1.00        99
                     Atherosclerosis       1.00      0.82      0.90       121
               Autonomic Dysfunction       0.89      0.89      0.89       209
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       132
      Chronic Fatigue Syndrome (CFS)       1.00      0.92      0.96       132
                            Diabetes       1.00      1.00      1.00       110
                             Healthy       1.00      1.00      1.00       110
                        Hypertension       1.00      0.94      0.97       198
Respiratory Disease (COPD or Asthma)       0.89      0.94      0.92       198
            Stress-related Disorders       0.93      1.00      0.97       154

                          




Sample size 1000 - Accuracy: 0.9650
                                      precision    recall  f1-score   support

                             Anaemia       0.91      1.00      0.95       231
                         Arrhythmias       1.00      1.00      1.00       154
                     Atherosclerosis       1.00      1.00      1.00       165
               Autonomic Dysfunction       0.92      0.92      0.92       264
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       187
      Chronic Fatigue Syndrome (CFS)       1.00      0.90      0.95       220
                            Diabetes       1.00      1.00      1.00       132
                             Healthy       1.00      1.00      1.00       154
                        Hypertension       1.00      0.93      0.96       297
Respiratory Disease (COPD or Asthma)       0.94      0.94      0.94       198
            Stress-related Disorders       0.90      1.00      0.95       198

                         




Sample size 2500 - Accuracy: 0.9940
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       517
                         Arrhythmias       0.98      1.00      0.99       462
                     Atherosclerosis       1.00      1.00      1.00       605
               Autonomic Dysfunction       1.00      1.00      1.00       495
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       517
      Chronic Fatigue Syndrome (CFS)       1.00      0.94      0.97       539
                            Diabetes       1.00      1.00      1.00       396
                             Healthy       1.00      1.00      1.00       484
                        Hypertension       1.00      1.00      1.00       583
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00       462
            Stress-related Disorders       0.95      1.00      0.98       440

                         




Sample size 5000 - Accuracy: 0.9960
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       968
                         Arrhythmias       0.98      1.00      0.99       891
                     Atherosclerosis       1.00      0.99      0.99      1089
               Autonomic Dysfunction       1.00      1.00      1.00      1034
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      1122
      Chronic Fatigue Syndrome (CFS)       1.00      0.98      0.99      1089
                            Diabetes       0.99      1.00      0.99       792
                             Healthy       1.00      1.00      1.00      1012
                        Hypertension       1.00      1.00      1.00      1133
Respiratory Disease (COPD or Asthma)       1.00      0.99      0.99       869
            Stress-related Disorders       0.99      1.00      0.99      1001

                         




Sample size 7500 - Accuracy: 0.9980
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1463
                         Arrhythmias       0.98      1.00      0.99      1375
                     Atherosclerosis       1.00      1.00      1.00      1507
               Autonomic Dysfunction       1.00      1.00      1.00      1507
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      1573
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      1650
                            Diabetes       1.00      1.00      1.00      1254
                             Healthy       1.00      1.00      1.00      1628
                        Hypertension       1.00      1.00      1.00      1639
Respiratory Disease (COPD or Asthma)       1.00      0.99      1.00      1408
            Stress-related Disorders       0.99      1.00      1.00      1496

                         




Sample size 10000 - Accuracy: 0.9980
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1914
                         Arrhythmias       0.99      1.00      0.99      1980
                     Atherosclerosis       1.00      1.00      1.00      1936
               Autonomic Dysfunction       1.00      1.00      1.00      2057
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      2068
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      2211
                            Diabetes       0.99      1.00      1.00      1606
                             Healthy       1.00      1.00      1.00      2101
                        Hypertension       1.00      1.00      1.00      2145
Respiratory Disease (COPD or Asthma)       1.00      0.99      0.99      1892
            Stress-related Disorders       0.99      1.00      1.00      2090

                        




Sample size 20000 - Accuracy: 0.9988
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      3762
                         Arrhythmias       1.00      1.00      1.00      4059
                     Atherosclerosis       1.00      1.00      1.00      3883
               Autonomic Dysfunction       1.00      1.00      1.00      3806
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      4290
      Chronic Fatigue Syndrome (CFS)       0.99      0.99      0.99      4169
                            Diabetes       1.00      1.00      1.00      3641
                             Healthy       1.00      1.00      1.00      4345
                        Hypertension       1.00      1.00      1.00      4268
Respiratory Disease (COPD or Asthma)       1.00      0.99      1.00      3773
            Stress-related Disorders       1.00      1.00      1.00      4004

                        




Sample size 30000 - Accuracy: 0.9993
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      5731
                         Arrhythmias       1.00      1.00      1.00      6226
                     Atherosclerosis       1.00      1.00      1.00      5753
               Autonomic Dysfunction       1.00      1.00      1.00      5962
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      6204
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      6039
                            Diabetes       1.00      1.00      1.00      5698
                             Healthy       1.00      1.00      1.00      6358
                        Hypertension       1.00      1.00      1.00      6171
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      5797
            Stress-related Disorders       1.00      1.00      1.00      6061

                        




Sample size 40000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      8019
                         Arrhythmias       1.00      1.00      1.00      8217
                     Atherosclerosis       1.00      1.00      1.00      7986
               Autonomic Dysfunction       1.00      1.00      1.00      8052
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      8283
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      8052
                            Diabetes       1.00      1.00      1.00      7711
                             Healthy       1.00      1.00      1.00      8162
                        Hypertension       1.00      1.00      1.00      8063
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      7568
            Stress-related Disorders       1.00      1.00      1.00      7887

                        




Sample size 50000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      9856
                         Arrhythmias       1.00      1.00      1.00     10296
                     Atherosclerosis       1.00      1.00      1.00     10109
               Autonomic Dysfunction       1.00      1.00      1.00     10329
        Cardiovascular Disease (CVD)       1.00      1.00      1.00     10153
      Chronic Fatigue Syndrome (CFS)       0.99      1.00      1.00      9966
                            Diabetes       1.00      1.00      1.00      9625
                             Healthy       1.00      1.00      1.00     10197
                        Hypertension       1.00      1.00      1.00      9889
Respiratory Disease (COPD or Asthma)       1.00      0.99      1.00      9537
            Stress-related Disorders       1.00      1.00      1.00     10043

                        

<h5 style="color: SkyBlue;">K-Nearest Neighbors (KNN)</h5>

In [9]:
# Encode the disease names in y_balanced as consecutive integer ids.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# classes_[i] is the original label for encoded id i. The encoder was fitted on
# this very array, so every id 0..n_classes-1 occurs in y_balanced_encoded and
# indexing classes_ by np.unique(y_balanced_encoded) would only reproduce
# classes_ after an extra pass over all labels.
class_labels = label_encoder.classes_

# Stratified 80/20 split with a fixed seed, so re-running the cell yields the
# same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions remain the same (evaluate_model, expand_test_set, flatten_classification_report)

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier against a labelled test set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature rows handed to ``model.predict``.
    y_test : array-like
        True labels. The report is requested for label values
        ``0 .. len(class_labels) - 1``, so these are expected to be
        integer-encoded class indices.
    class_labels : sequence
        Display name for each class index, in index order.

    Returns
    -------
    tuple
        ``(accuracy, report, predictions)`` where ``report`` is the dict
        produced by ``classification_report(..., output_dict=True)``.
    """
    y_pred = model.predict(X_test)

    # Ask for every class index explicitly so a class that is absent from this
    # particular test set still gets a row; zero_division=0 makes its
    # undefined metrics come out as 0 instead of triggering a warning.
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )

    return accuracy_score(y_test, y_pred), report_dict, y_pred

def expand_test_set(X_test, y_test, repeat_factor):
    """Repeat every test sample ``repeat_factor`` times.

    Rows of ``X_test`` are repeated along axis 0 and the entries of ``y_test``
    are repeated element-wise, so features and labels stay aligned (each
    sample's copies are adjacent).

    Returns
    -------
    tuple
        ``(X_test_expanded, y_test_expanded)``.
    """
    return (
        np.repeat(X_test, repeats=repeat_factor, axis=0),
        np.repeat(y_test, repeats=repeat_factor),
    )

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into a single flat dict.

    The result starts with the run metadata (``sample_size``, ``train_size``,
    ``test_size``, ``accuracy``) and then gets one ``"<label>_<metric>"`` entry
    for every metric of every report entry whose value is itself a dict.
    Report entries that are not dicts (plain scalars) are skipped.
    """
    flat = dict(
        sample_size=sample_size,
        train_size=train_size,
        test_size=test_size,
        accuracy=accuracy,
    )
    flat.update(
        (f"{label}_{metric_name}", value)
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric_name, value in metrics.items()
    )
    return flat

# Main training loop: fit one KNN model per (sample, train, test) size triple
# and remember the most accurate one.
results = []          # one flattened report dict per sample size
best_accuracy = 0
best_model = None
best_sample_size = 0

# Every test row is repeated this many times before scoring. Repeating all
# rows by the same factor scales the per-class support counts but leaves
# accuracy, precision and recall unchanged.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Take the leading rows of the train/test split as this iteration's subsets.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # K-Nearest Neighbors Model (KNN)
    knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
    knn.fit(X_train_subset, y_train_subset)

    # Evaluate the model (dict-form report is kept for the results table)
    accuracy, report, predictions = evaluate_model(
        knn, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report always lists every class, even
    # those that do not occur in this test subset.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    results.append(
        flatten_classification_report(report, sample_size, train_size, test_size, accuracy)
    )

    # Check for best model (strict '>' keeps the earliest model on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = knn
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: K-Nearest Neighbors with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       0.0
                         Arrhythmias       0.00      0.00      0.00       0.0
                     Atherosclerosis       0.00      0.00      0.00       0.0
               Autonomic Dysfunction       0.00      0.00      0.00      11.0
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       0.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                            Diabetes       0.00      0.00      0.00       0.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0
            Stress-related Disorders       0.00      0.00      0.00       0.0

                           




Sample size 750 - Accuracy: 0.4467
                                      precision    recall  f1-score   support

                             Anaemia       0.55      0.71      0.62       187
                         Arrhythmias       0.33      0.89      0.48        99
                     Atherosclerosis       0.00      0.00      0.00       121
               Autonomic Dysfunction       0.61      0.74      0.67       209
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       132
      Chronic Fatigue Syndrome (CFS)       0.33      0.58      0.42       132
                            Diabetes       0.38      0.50      0.43       110
                             Healthy       0.60      0.90      0.72       110
                        Hypertension       0.20      0.06      0.09       198
Respiratory Disease (COPD or Asthma)       0.50      0.06      0.10       198
            Stress-related Disorders       0.48      0.71      0.57       154

                          




Sample size 7500 - Accuracy: 0.7167
                                      precision    recall  f1-score   support

                             Anaemia       0.86      0.95      0.90      1463
                         Arrhythmias       0.59      0.95      0.73      1375
                     Atherosclerosis       0.53      0.48      0.51      1507
               Autonomic Dysfunction       0.81      0.97      0.88      1507
        Cardiovascular Disease (CVD)       0.62      0.48      0.54      1573
      Chronic Fatigue Syndrome (CFS)       0.80      0.71      0.75      1650
                            Diabetes       0.68      0.89      0.77      1254
                             Healthy       0.84      0.99      0.91      1628
                        Hypertension       0.74      0.36      0.49      1639
Respiratory Disease (COPD or Asthma)       0.60      0.29      0.39      1408
            Stress-related Disorders       0.73      0.86      0.79      1496

                         




Sample size 10000 - Accuracy: 0.7450
                                      precision    recall  f1-score   support

                             Anaemia       0.88      0.97      0.92      1914
                         Arrhythmias       0.68      0.97      0.80      1980
                     Atherosclerosis       0.54      0.44      0.48      1936
               Autonomic Dysfunction       0.84      0.98      0.91      2057
        Cardiovascular Disease (CVD)       0.60      0.48      0.53      2068
      Chronic Fatigue Syndrome (CFS)       0.80      0.73      0.76      2211
                            Diabetes       0.77      0.94      0.84      1606
                             Healthy       0.84      0.99      0.91      2101
                        Hypertension       0.77      0.45      0.57      2145
Respiratory Disease (COPD or Asthma)       0.54      0.35      0.42      1892
            Stress-related Disorders       0.76      0.92      0.83      2090

                        




Sample size 20000 - Accuracy: 0.7920
                                      precision    recall  f1-score   support

                             Anaemia       0.91      0.99      0.95      3762
                         Arrhythmias       0.76      0.98      0.86      4059
                     Atherosclerosis       0.59      0.52      0.55      3883
               Autonomic Dysfunction       0.90      0.99      0.94      3806
        Cardiovascular Disease (CVD)       0.67      0.56      0.61      4290
      Chronic Fatigue Syndrome (CFS)       0.84      0.80      0.82      4169
                            Diabetes       0.82      0.98      0.89      3641
                             Healthy       0.89      1.00      0.94      4345
                        Hypertension       0.81      0.53      0.64      4268
Respiratory Disease (COPD or Asthma)       0.60      0.41      0.49      3773
            Stress-related Disorders       0.79      0.97      0.87      4004

                        




Sample size 30000 - Accuracy: 0.8178
                                      precision    recall  f1-score   support

                             Anaemia       0.93      1.00      0.96      5731
                         Arrhythmias       0.80      1.00      0.89      6226
                     Atherosclerosis       0.57      0.51      0.54      5753
               Autonomic Dysfunction       0.92      1.00      0.96      5962
        Cardiovascular Disease (CVD)       0.65      0.59      0.62      6204
      Chronic Fatigue Syndrome (CFS)       0.85      0.83      0.84      6039
                            Diabetes       0.86      0.98      0.92      5698
                             Healthy       0.94      1.00      0.97      6358
                        Hypertension       0.87      0.60      0.71      6171
Respiratory Disease (COPD or Asthma)       0.68      0.50      0.58      5797
            Stress-related Disorders       0.82      0.99      0.90      6061

                        




Sample size 40000 - Accuracy: 0.8291
                                      precision    recall  f1-score   support

                             Anaemia       0.95      1.00      0.97      8019
                         Arrhythmias       0.83      1.00      0.91      8217
                     Atherosclerosis       0.58      0.49      0.54      7986
               Autonomic Dysfunction       0.93      1.00      0.97      8052
        Cardiovascular Disease (CVD)       0.64      0.62      0.63      8283
      Chronic Fatigue Syndrome (CFS)       0.86      0.84      0.85      8052
                            Diabetes       0.89      0.99      0.94      7711
                             Healthy       0.94      1.00      0.97      8162
                        Hypertension       0.88      0.63      0.74      8063
Respiratory Disease (COPD or Asthma)       0.68      0.54      0.61      7568
            Stress-related Disorders       0.84      0.99      0.91      7887

                        




Sample size 50000 - Accuracy: 0.8344
                                      precision    recall  f1-score   support

                             Anaemia       0.95      1.00      0.97      9856
                         Arrhythmias       0.84      1.00      0.91     10296
                     Atherosclerosis       0.59      0.50      0.54     10109
               Autonomic Dysfunction       0.94      1.00      0.97     10329
        Cardiovascular Disease (CVD)       0.63      0.62      0.63     10153
      Chronic Fatigue Syndrome (CFS)       0.86      0.84      0.85      9966
                            Diabetes       0.90      0.99      0.95      9625
                             Healthy       0.94      1.00      0.97     10197
                        Hypertension       0.88      0.64      0.74      9889
Respiratory Disease (COPD or Asthma)       0.69      0.57      0.62      9537
            Stress-related Disorders       0.85      1.00      0.92     10043

                        

<h5 style="color: SkyBlue;">XGBoost</h5>

In [22]:
# Integer-encode the disease labels; classes_[i] is the name of encoded class i.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
class_labels = label_encoder.classes_

# Reproducible, stratified 80/20 train/test split of the balanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced_encoded,
    stratify=y_balanced_encoded,
    test_size=0.2,
    random_state=42,
)

def evaluate_xgb_model(model, X_test_subset, y_test_subset, class_labels):
    """Score a trained XGBoost booster on a test set.

    The test features are wrapped in an ``xgb.DMatrix`` and the model output is
    reduced with an argmax along axis 1, i.e. it is treated as one row of
    per-class scores per sample. The classification report is restricted to
    the classes that actually occur in ``y_test_subset``.

    Parameters
    ----------
    model : object
        Trained model whose ``predict`` accepts an ``xgb.DMatrix``.
    X_test_subset : array-like
        Test features.
    y_test_subset : array-like
        True labels, used as indices into ``class_labels``.
    class_labels : sequence
        Display name for each class index.

    Returns
    -------
    tuple
        ``(accuracy, report, predictions, dynamic_labels)`` where ``report`` is
        the ``output_dict=True`` report and ``dynamic_labels`` are the names of
        the classes present in ``y_test_subset``.
    """
    class_scores = model.predict(xgb.DMatrix(X_test_subset))
    predicted = np.argmax(class_scores, axis=1)

    # Only the classes present in the ground truth are reported; their display
    # names are looked up in the same order so labels and names stay aligned.
    present_classes = list(np.unique(y_test_subset))
    dynamic_labels = [class_labels[class_index] for class_index in present_classes]

    accuracy = accuracy_score(y_test_subset, predicted)
    report = classification_report(
        y_test_subset,
        predicted,
        labels=present_classes,
        target_names=dynamic_labels,
        zero_division=0,
        output_dict=True,
    )
    return accuracy, report, predicted, dynamic_labels

def expand_test_set(X_test, y_test, repeat_factor):
    """Return the test set with each sample duplicated ``repeat_factor`` times.

    Feature rows are repeated along axis 0; labels are repeated element-wise,
    so the i-th group of copies in both outputs belongs to the same sample.
    """
    repeated_features = np.repeat(X_test, repeats=repeat_factor, axis=0)
    repeated_labels = np.repeat(y_test, repeats=repeat_factor)
    return repeated_features, repeated_labels

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Turn a nested classification report into one flat record.

    The record holds the run metadata first, followed by a
    ``"<label>_<metric>"`` key for each metric of each dict-valued report
    entry. Scalar report entries are ignored.
    """
    record = {
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    for label in report:
        metrics = report[label]
        if not isinstance(metrics, dict):
            # Scalar entry: nothing to expand.
            continue
        for metric_name in metrics:
            record[f"{label}_{metric_name}"] = metrics[metric_name]
    return record

# Main training loop: train one XGBoost booster per (sample, train, test) size
# triple and remember the most accurate one.

# Reset the tracking state for this model. Without this the cell silently
# reuses whatever `results` / `best_*` values another model's cell left in the
# kernel, mixing result rows and comparing XGBoost against another model's
# best accuracy (or raising NameError on a fresh kernel).
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Every test row is repeated this many times before scoring.
repeat_factor = 11

# Booster configuration; identical for every sample size, so built once.
params = {
    "objective": "multi:softprob",  # Outputs per-class probabilities; the argmax is taken in evaluate_xgb_model
    "num_class": len(class_labels),  # One output column per encoded class
    "eval_metric": "mlogloss",
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Take the leading rows of the train/test split as this iteration's subsets.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Train on the raw array behind the training subset (relies on X_train
    # exposing `.values`, e.g. a pandas DataFrame).
    dtrain = xgb.DMatrix(X_train_subset.values, label=y_train_subset)
    xgb_model = xgb.train(params, dtrain, num_boost_round=100)

    # Evaluate the model on the expanded test set
    accuracy, report, predictions, dynamic_labels = evaluate_xgb_model(
        xgb_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report always lists every class, even
    # those that do not occur in this test subset.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0
    ))

    # Flatten and store results
    results.append(
        flatten_classification_report(report, sample_size, train_size, test_size, accuracy)
    )

    # Check for best model (strict '>' keeps the earliest model on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = xgb_model
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: XGBoost with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")


Sample size 25 - Accuracy: 0.4000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00         0
                         Arrhythmias       0.00      0.00      0.00         0
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.33      1.00      0.50        11
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       0.00      0.00      0.00         0
                             Healthy       0.00      0.00      0.00        11
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        11
            Stress-related Disorders       0.00      0.00      0.00         0

                           

<h5 style="color: SkyBlue;">Extra Trees Classifier</h5>

In [None]:
# Encode the disease labels as consecutive integers so they can be used as
# class indices by the model and by classification_report.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)

# classes_ is already ordered by encoded value (classes_[i] is the name of
# class i), so it can be used directly as the list of report target names.
class_labels = label_encoder.classes_

# Hold out 20% of the data for testing; stratify so every class keeps the same
# share in both partitions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions (unchanged from the other model cells): evaluate_model, expand_test_set, flatten_classification_report

def evaluate_model(model, X_test, y_test, class_labels):
    """Predict on ``X_test`` and summarise the quality of the predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature rows handed to ``model.predict``.
    y_test : array-like
        True labels. The report covers label values
        ``0 .. len(class_labels) - 1``, so these are expected to be
        integer-encoded class indices.
    class_labels : sequence
        Display name for each class index, in index order.

    Returns
    -------
    tuple
        ``(accuracy, report, predictions)`` with ``report`` being the dict
        form (``output_dict=True``) of the classification report.
    """
    predicted = model.predict(X_test)
    score = accuracy_score(y_test, predicted)

    # Every class index is listed explicitly so that classes missing from this
    # test set still appear; zero_division=0 reports their undefined metrics
    # as 0 instead of warning.
    every_class_index = np.arange(len(class_labels))
    summary = classification_report(
        y_test,
        predicted,
        labels=every_class_index,
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    return score, summary, predicted

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate each test sample ``repeat_factor`` times, keeping X and y aligned.

    ``X_test`` is repeated row-wise (axis 0) and ``y_test`` element-wise; the
    copies of a sample are contiguous in both returned arrays.
    """
    expanded = (
        np.repeat(X_test, axis=0, repeats=repeat_factor),
        np.repeat(y_test, repeats=repeat_factor),
    )
    return expanded

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Flatten a nested classification report into a single-level dict.

    Run metadata (``sample_size``, ``train_size``, ``test_size``, ``accuracy``)
    comes first; each dict-valued report entry then contributes one
    ``"<label>_<metric>"`` key per metric. Non-dict entries are left out.
    """
    metadata = {
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    per_label_metrics = {
        f"{label}_{metric_name}": value
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric_name, value in metrics.items()
    }
    # Metadata keys keep their leading position; a colliding metric key would
    # overwrite the value in place, exactly like plain item assignment.
    return {**metadata, **per_label_metrics}

# Main training loop: fit one Extra Trees model per (sample, train, test) size
# triple and remember the most accurate one.
results = []          # one flattened report dict per sample size
best_accuracy = 0
best_model = None
best_sample_size = 0

# Every test row is repeated this many times before scoring. Repeating all
# rows by the same factor scales the per-class support counts but leaves
# accuracy, precision and recall unchanged.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Take the leading rows of the train/test split as this iteration's subsets.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Extra Trees Model
    extra_trees = ExtraTreesClassifier(
        n_estimators=100, random_state=42, class_weight='balanced'
    )
    extra_trees.fit(X_train_subset, y_train_subset)

    # Evaluate the model (dict-form report is kept for the results table)
    accuracy, report, predictions = evaluate_model(
        extra_trees, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report always lists every class, even
    # those that do not occur in this test subset.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.arange(len(class_labels)),  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    results.append(
        flatten_classification_report(report, sample_size, train_size, test_size, accuracy)
    )

    # Check for best model (strict '>' keeps the earliest model on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = extra_trees
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Extra Trees with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")