In [1]:
import pickle
import pandas as pd
import numpy as np
from smote_sample import X, y
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

Disease
Atherosclerosis                         152809
Hypertension                            115644
Cardiovascular Disease (CVD)             99122
Chronic Fatigue Syndrome (CFS)           53545
Respiratory Disease (COPD or Asthma)     28039
Stress-related Disorders                   352
Arrhythmias                                284
Healthy                                     67
Autonomic Dysfunction                       65
Diabetes                                    48
Anaemia                                     25
Name: count, dtype: int64


In [2]:
# Load the (features, target) pair stored in balanced_data.pkl.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("balanced_data.pkl", "rb") as f:
    X_balanced, y_balanced = pickle.load(f)

# Fail fast on a stale or mismatched pickle: every later cell indexes the
# features and the target in lockstep, so their row counts must agree.
if len(X_balanced) != len(y_balanced):
    raise ValueError(
        f"Feature/target length mismatch: {len(X_balanced)} rows vs {len(y_balanced)} labels"
    )

print("Loaded Feature Shape:", X_balanced.shape)
print("Loaded Target Shape:", y_balanced.shape)
print("Loaded Distribution after SMOTE:\n", y_balanced.value_counts())

Loaded Feature Shape: (1680899, 17)
Loaded Target Shape: (1680899,)
Loaded Distribution after SMOTE:
 Disease
Chronic Fatigue Syndrome (CFS)          152809
Atherosclerosis                         152809
Hypertension                            152809
Cardiovascular Disease (CVD)            152809
Respiratory Disease (COPD or Asthma)    152809
Autonomic Dysfunction                   152809
Arrhythmias                             152809
Anaemia                                 152809
Stress-related Disorders                152809
Diabetes                                152809
Healthy                                 152809
Name: count, dtype: int64


In [3]:
# Total number of rows (train + test) used by each experiment.
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1000,
    2500, 5000, 7500, 10000,
    20000, 30000, 40000, 50000,
]

# Hold out one fifth (integer division) of each total for testing; the training
# share is whatever remains, floored at a single row.
training_sample_sizes = list(
    map(lambda total: max(total - total // 5, 1), sample_sizes)
)
# The test share is the remainder of the total after the training share.
testing_sample_sizes = [
    total - kept for total, kept in zip(sample_sizes, training_sample_sizes)
]

print("Training sample sizes:", training_sample_sizes)
print("Testing sample sizes:", testing_sample_sizes)

Training sample sizes: [20, 40, 60, 80, 200, 400, 600, 800, 2000, 4000, 6000, 8000, 16000, 24000, 32000, 40000]
Testing sample sizes: [5, 10, 15, 20, 50, 100, 150, 200, 500, 1000, 1500, 2000, 4000, 6000, 8000, 10000]


In [4]:
# Encode the disease labels as integer codes.
label_encoder = LabelEncoder()
y_balanced_encoded = label_encoder.fit_transform(y_balanced)
# classes_ holds one entry per fitted label, ordered by integer code, so it is
# already the complete label list; no extra np.unique pass over the full
# encoded target is needed.
class_labels = label_encoder.classes_

# Stratified 80/20 split with a fixed seed.
# NOTE(review): the split is performed on data that the load cell describes as
# "after SMOTE". If the oversampling was applied before this split, synthetic
# rows in the test set are interpolated from rows that may sit in the training
# set, which inflates the reported scores. Confirm, and consider splitting the
# raw data first and oversampling only the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Helper functions: evaluate_model, expand_test_set, flatten_classification_report

def evaluate_model(model, X_test, y_test, class_labels):
    """Score a fitted classifier against a labelled test set.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature rows to predict on.
        y_test: True integer-encoded labels for ``X_test``.
        class_labels: Display names, one per class code ``0..len(class_labels)-1``.

    Returns:
        Tuple ``(accuracy, report, predictions)`` where ``report`` is the
        dictionary form of the classification report covering every class code,
        including codes that do not occur in ``y_test``.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Report on the full range of class codes so that classes missing from this
    # particular test slice still get a row; zero_division=0 sets their
    # undefined metrics to 0.
    report_options = dict(
        labels=np.arange(len(class_labels)),
        target_names=class_labels,
        zero_division=0,
        output_dict=True,
    )
    report = classification_report(y_test, predictions, **report_options)

    return accuracy, report, predictions

def expand_test_set(X_test, y_test, repeat_factor):
    """Duplicate every test row and its label ``repeat_factor`` times.

    Each row is repeated consecutively (row 0 ``repeat_factor`` times, then
    row 1, ...), and labels are repeated in the same order so the pairs stay
    aligned.

    Args:
        X_test: Feature rows; repeated along axis 0.
        y_test: Labels aligned with ``X_test``.
        repeat_factor: Number of copies of each row/label.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    features_repeated = np.repeat(X_test, repeat_factor, axis=0)
    labels_repeated = np.repeat(y_test, repeat_factor)
    return features_repeated, labels_repeated

def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Collapse a nested classification report into one flat record.

    Args:
        report: Mapping of label -> metrics. Only entries whose value is a
            dict are flattened; any other entry is ignored.
        sample_size: Total rows used for the run.
        train_size: Rows used for training.
        test_size: Rows used for testing.
        accuracy: Accuracy value stored under the ``"accuracy"`` key.

    Returns:
        Dict with the four run-level fields followed by one
        ``"<label>_<metric>"`` key per nested metric.
    """
    record = dict(
        sample_size=sample_size,
        train_size=train_size,
        test_size=test_size,
        accuracy=accuracy,
    )
    # Non-dict entries are filtered out so that only per-label metric tables
    # contribute flattened keys.
    record.update(
        (f"{label}_{metric_name}", value)
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric_name, value in metrics.items()
    )
    return record

# Main training loop: fit one Random Forest per sample size, score it on a
# matching slice of the held-out split, and remember the highest-scoring model.
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Loop-invariant settings, hoisted out of the loop.
# NOTE(review): duplicating every test row the same number of times multiplies
# each class's support by repeat_factor but leaves accuracy, precision and
# recall unchanged; it only inflates the "support" column of the reports.
repeat_factor = 11
all_class_indices = np.arange(len(class_labels))

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sample_sizes, testing_sample_sizes
):
    # Take the first train_size / test_size rows of the split.
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand test set
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Random Forest Model
    random_forest = RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1
    )
    random_forest.fit(X_train_subset, y_train_subset)

    # Evaluate the model (dictionary report is kept for the results table)
    accuracy, report, predictions = evaluate_model(
        random_forest, X_test_expanded, y_test_expanded, class_labels
    )

    # Print iteration results; the text report covers every class index so the
    # table layout is identical across sample sizes.
    print(f"\nSample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=all_class_indices,  # Include all class indices
        target_names=class_labels,  # Use the full set of class labels
        zero_division=0
    ))

    # Flatten and store results
    results.append(
        flatten_classification_report(report, sample_size, train_size, test_size, accuracy)
    )

    # Check for best model.
    # NOTE(review): each iteration is scored on a different-sized test slice,
    # so these accuracies are not measured on a common benchmark; ties keep the
    # earlier (smaller) model because the comparison is strict.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = random_forest
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")

# Print best model summary
print(f"\nBest Model: Random Forest with sample size {best_sample_size}")
print(f"Best Accuracy: {best_accuracy:.4f}")




Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       0.0
                         Arrhythmias       0.00      0.00      0.00       0.0
                     Atherosclerosis       0.00      0.00      0.00       0.0
               Autonomic Dysfunction       0.00      0.00      0.00      11.0
        Cardiovascular Disease (CVD)       0.00      0.00      0.00       0.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                            Diabetes       0.00      0.00      0.00       0.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0
            Stress-related Disorders       0.00      0.00      0.00       0.0

                           




Sample size 75 - Accuracy: 0.5333
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        11
                         Arrhythmias       1.00      1.00      1.00        11
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       0.20      0.50      0.29        22
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       1.00      1.00      1.00        11
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       1.00      0.67      0.80        33
            Stress-related Disorders       0.50      1.00      0.67        22

                           




Sample size 250 - Accuracy: 0.9400
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        33
                         Arrhythmias       0.75      1.00      0.86        33
                     Atherosclerosis       0.00      0.00      0.00         0
               Autonomic Dysfunction       1.00      1.00      1.00        88
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        44
      Chronic Fatigue Syndrome (CFS)       1.00      0.83      0.91        66
                            Diabetes       1.00      1.00      1.00        55
                             Healthy       1.00      1.00      1.00        55
                        Hypertension       1.00      0.83      0.91        66
Respiratory Disease (COPD or Asthma)       1.00      0.75      0.86        44
            Stress-related Disorders       0.75      1.00      0.86        66

                          




Sample size 750 - Accuracy: 0.9533
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       187
                         Arrhythmias       1.00      1.00      1.00        99
                     Atherosclerosis       0.90      0.82      0.86       121
               Autonomic Dysfunction       0.95      1.00      0.97       209
        Cardiovascular Disease (CVD)       1.00      0.92      0.96       132
      Chronic Fatigue Syndrome (CFS)       1.00      0.92      0.96       132
                            Diabetes       1.00      1.00      1.00       110
                             Healthy       1.00      1.00      1.00       110
                        Hypertension       0.94      0.89      0.91       198
Respiratory Disease (COPD or Asthma)       0.81      0.94      0.87       198
            Stress-related Disorders       1.00      1.00      1.00       154

                          




Sample size 2500 - Accuracy: 0.9900
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       517
                         Arrhythmias       1.00      1.00      1.00       462
                     Atherosclerosis       0.98      0.98      0.98       605
               Autonomic Dysfunction       1.00      1.00      1.00       495
        Cardiovascular Disease (CVD)       1.00      0.98      0.99       517
      Chronic Fatigue Syndrome (CFS)       1.00      0.94      0.97       539
                            Diabetes       1.00      1.00      1.00       396
                             Healthy       1.00      1.00      1.00       484
                        Hypertension       0.98      1.00      0.99       583
Respiratory Disease (COPD or Asthma)       0.93      1.00      0.97       462
            Stress-related Disorders       1.00      1.00      1.00       440

                         




Sample size 5000 - Accuracy: 0.9960
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00       968
                         Arrhythmias       0.99      1.00      0.99       891
                     Atherosclerosis       0.99      0.99      0.99      1089
               Autonomic Dysfunction       1.00      1.00      1.00      1034
        Cardiovascular Disease (CVD)       1.00      0.99      1.00      1122
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      1089
                            Diabetes       1.00      1.00      1.00       792
                             Healthy       1.00      1.00      1.00      1012
                        Hypertension       0.99      1.00      1.00      1133
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99       869
            Stress-related Disorders       1.00      1.00      1.00      1001

                         




Sample size 7500 - Accuracy: 0.9973
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1463
                         Arrhythmias       1.00      1.00      1.00      1375
                     Atherosclerosis       0.99      0.99      0.99      1507
               Autonomic Dysfunction       1.00      1.00      1.00      1507
        Cardiovascular Disease (CVD)       1.00      0.99      1.00      1573
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      1650
                            Diabetes       1.00      1.00      1.00      1254
                             Healthy       1.00      1.00      1.00      1628
                        Hypertension       0.99      0.99      0.99      1639
Respiratory Disease (COPD or Asthma)       0.99      1.00      1.00      1408
            Stress-related Disorders       0.99      1.00      1.00      1496

                         




Sample size 10000 - Accuracy: 0.9975
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      1914
                         Arrhythmias       1.00      1.00      1.00      1980
                     Atherosclerosis       0.99      1.00      0.99      1936
               Autonomic Dysfunction       1.00      1.00      1.00      2057
        Cardiovascular Disease (CVD)       1.00      0.99      0.99      2068
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      2211
                            Diabetes       1.00      1.00      1.00      1606
                             Healthy       1.00      1.00      1.00      2101
                        Hypertension       0.99      0.99      0.99      2145
Respiratory Disease (COPD or Asthma)       0.99      0.99      0.99      1892
            Stress-related Disorders       0.99      1.00      1.00      2090

                        




Sample size 20000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      3762
                         Arrhythmias       1.00      1.00      1.00      4059
                     Atherosclerosis       1.00      1.00      1.00      3883
               Autonomic Dysfunction       1.00      1.00      1.00      3806
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      4290
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      4169
                            Diabetes       1.00      1.00      1.00      3641
                             Healthy       1.00      1.00      1.00      4345
                        Hypertension       1.00      1.00      1.00      4268
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      3773
            Stress-related Disorders       1.00      1.00      1.00      4004

                        




Sample size 30000 - Accuracy: 0.9990
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      5731
                         Arrhythmias       1.00      1.00      1.00      6226
                     Atherosclerosis       1.00      1.00      1.00      5753
               Autonomic Dysfunction       1.00      1.00      1.00      5962
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      6204
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      6039
                            Diabetes       1.00      1.00      1.00      5698
                             Healthy       1.00      1.00      1.00      6358
                        Hypertension       1.00      1.00      1.00      6171
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      5797
            Stress-related Disorders       1.00      1.00      1.00      6061

                        




Sample size 40000 - Accuracy: 0.9986
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      8019
                         Arrhythmias       1.00      1.00      1.00      8217
                     Atherosclerosis       1.00      1.00      1.00      7986
               Autonomic Dysfunction       1.00      1.00      1.00      8052
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      8283
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00      8052
                            Diabetes       1.00      1.00      1.00      7711
                             Healthy       1.00      1.00      1.00      8162
                        Hypertension       1.00      1.00      1.00      8063
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      7568
            Stress-related Disorders       1.00      1.00      1.00      7887

                        




Sample size 50000 - Accuracy: 0.9989
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00      9856
                         Arrhythmias       1.00      1.00      1.00     10296
                     Atherosclerosis       1.00      1.00      1.00     10109
               Autonomic Dysfunction       1.00      1.00      1.00     10329
        Cardiovascular Disease (CVD)       1.00      1.00      1.00     10153
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      0.99      9966
                            Diabetes       1.00      1.00      1.00      9625
                             Healthy       1.00      1.00      1.00     10197
                        Hypertension       1.00      1.00      1.00      9889
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      9537
            Stress-related Disorders       1.00      1.00      1.00     10043

                        

In [5]:
# save model as pickle
model_file_name = "random_forest_model.pkl"

# best_model starts as None and is only assigned when a run scores above 0
# accuracy; refuse to write a pickle that would contain None instead of a model.
if best_model is None:
    raise RuntimeError(
        "No trained model to save: best_model is None (no run scored above 0 accuracy)."
    )

# NOTE: the model was fitted on integer-encoded labels, so whoever loads this
# file also needs the label encoder's class list to map predictions back to
# disease names.
with open(model_file_name, "wb") as f:
    pickle.dump(best_model, f)

print(f"Best model saved to {model_file_name}")

Best model saved to random_forest_model.pkl
