In [1]:
import pandas as pd
import numpy as np
from classify import classify_disease
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

<h5 style="color: SkyBlue;">Load Dataset</h5>

In [2]:
df = pd.read_csv('user_data_for_disease_prediction - unclassified data set.csv')
print(df.head())

   Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
0                 0       -0.9        0.4        49.7      67.9   
1     

<h5 style="color: SkyBlue;">Ranking Disease</h5>

In [3]:
df["Disease"] = df.apply(classify_disease, axis=1)

disease_counts = df["Disease"].value_counts()

disease_counts_sorted = disease_counts.sort_values(ascending=False)
print(disease_counts_sorted)

Disease
Atherosclerosis                         152809
Hypertension                            115644
Cardiovascular Disease (CVD)             99122
Chronic Fatigue Syndrome (CFS)           53545
Respiratory Disease (COPD or Asthma)     28039
Stress-related Disorders                   352
Arrhythmias                                284
Healthy                                     67
Autonomic Dysfunction                       65
Diabetes                                    48
Anaemia                                     25
Name: count, dtype: int64


In [4]:
sampled_df_health = df.sample(n=200000, random_state=42)

sampled_disease_counts = sampled_df_health["Disease"].value_counts()

sampled_disease_counts_sorted = sampled_disease_counts.sort_values(ascending=False)
print(sampled_disease_counts_sorted)

Disease
Atherosclerosis                         67905
Hypertension                            51521
Cardiovascular Disease (CVD)            43988
Chronic Fatigue Syndrome (CFS)          23828
Respiratory Disease (COPD or Asthma)    12376
Stress-related Disorders                  155
Arrhythmias                               136
Healthy                                    31
Autonomic Dysfunction                      29
Diabetes                                   24
Anaemia                                     7
Name: count, dtype: int64


<h5 style="color: SkyBlue;">Synthetic Sampling</h5>

In [5]:
def apply_smoteenn(sampled_df_health):
    """
    SMOTEENN sampling to balance the dataset.
    
    Parameters:
        sampled_df_health (DataFrame): Data containing features and the target column 'Disease'.
        
    Returns:
        X_balanced (DataFrame): Resampled features.
        y_balanced (Series): Resampled target labels.
        counts (dict): Summary of the balanced dataset.
    """
    # Feature (X) and target (y) separation
    X = sampled_df_health.drop("Disease", axis=1)
    y = sampled_df_health["Disease"]

    smote = SMOTE(k_neighbors=2, random_state=42)
    smote_enn = SMOTEENN(random_state=42, smote=smote)

    # resampling
    X_balanced, y_balanced = smote_enn.fit_resample(X, y)

    counts = {
        "Balanced Feature Shape": X_balanced.shape,
        "Balanced Target Shape": y_balanced.shape,
        "Balanced Distribution": y_balanced.value_counts().to_dict(),
    }
    
    print("Balanced Distribution:")
    for disease, count in counts["Balanced Distribution"].items():
        print(f"{disease}: {count}")

    return X_balanced, y_balanced, counts

X_resampled, y_resampled, counts = apply_smoteenn(sampled_df_health)

Balanced Distribution:
Anaemia: 67905
Arrhythmias: 67905
Autonomic Dysfunction: 67905
Stress-related Disorders: 67905
Diabetes: 67905
Healthy: 67905
Chronic Fatigue Syndrome (CFS): 66792
Respiratory Disease (COPD or Asthma): 66781
Hypertension: 48966
Cardiovascular Disease (CVD): 38457
Atherosclerosis: 23506


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_resampled_encoded = le.fit_transform(y_resampled)

<h5 style="color: SkyBlue;">Helper function batch training</h5>

In [7]:
def train_and_evaluate(model, X_data, y_data, batch_sizes):
    """
    Trains and evaluates a model over different batch sizes.
    
    Parameters:
        model: sklearn model instance
        X_data: Features for training
        y_data: Target labels
        batch_sizes: List of batch sizes for training
        
    Returns:
        None
    """
    for batch_size in batch_sizes:
        X_batch = X_data.sample(n=min(batch_size, len(X_data)), random_state=42)
        y_batch = y_data.loc[X_batch.index]
        
        model.fit(X_batch, y_batch)
        
        y_pred = model.predict(X_batch)
        print(f"\nBatch Size: {batch_size}")
        print(classification_report(y_batch, y_pred))
        print(f"Accuracy: {accuracy_score(y_batch, y_pred)}\n")

<h5 style="color: SkyBlue;">Logistic Regression</h5>

In [8]:
batch_sizes = [200, 400, 800, 1000, 2000, 15000]
logistic_model = LogisticRegression(max_iter=200, random_state=42)
train_and_evaluate(logistic_model, X_resampled, y_resampled, batch_sizes)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       0.92      1.00      0.96        23
                         Arrhythmias       0.89      0.89      0.89        18
                     Atherosclerosis       0.80      1.00      0.89         4
               Autonomic Dysfunction       0.67      0.71      0.69        14
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        14
      Chronic Fatigue Syndrome (CFS)       0.68      0.65      0.67        20
                            Diabetes       0.71      0.68      0.69        25
                             Healthy       0.77      0.81      0.79        21
                        Hypertension       0.83      0.83      0.83        18
Respiratory Disease (COPD or Asthma)       0.83      0.71      0.77        21
            Stress-related Disorders       0.59      0.59      0.59        22

                            accuracy         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Batch Size: 1000
                                      precision    recall  f1-score   support

                             Anaemia       0.80      0.84      0.82       110
                         Arrhythmias       0.78      0.79      0.78       101
                     Atherosclerosis       0.74      0.74      0.74        35
               Autonomic Dysfunction       0.50      0.42      0.46        95
        Cardiovascular Disease (CVD)       0.88      0.86      0.87        71
      Chronic Fatigue Syndrome (CFS)       0.58      0.64      0.61       101
                            Diabetes       0.58      0.61      0.59       106
                             Healthy       0.69      0.77      0.73       105
                        Hypertension       0.85      0.84      0.84        73
Respiratory Disease (COPD or Asthma)       0.73      0.63      0.68        98
            Stress-related Disorders       0.58      0.54      0.56       105

                            accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Batch Size: 15000
                                      precision    recall  f1-score   support

                             Anaemia       0.79      0.86      0.82      1547
                         Arrhythmias       0.71      0.79      0.75      1604
                     Atherosclerosis       0.74      0.77      0.76       561
               Autonomic Dysfunction       0.57      0.45      0.51      1582
        Cardiovascular Disease (CVD)       0.83      0.83      0.83       859
      Chronic Fatigue Syndrome (CFS)       0.60      0.66      0.63      1519
                            Diabetes       0.59      0.54      0.57      1551
                             Healthy       0.66      0.77      0.71      1546
                        Hypertension       0.78      0.75      0.77      1099
Respiratory Disease (COPD or Asthma)       0.74      0.62      0.68      1563
            Stress-related Disorders       0.64      0.64      0.64      1569

                            accuracy       

<h5 style="color: SkyBlue;">Decision Tree</h5>

In [9]:
decision_tree_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate(decision_tree_model, X_resampled, y_resampled, batch_sizes)


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        23
                         Arrhythmias       1.00      1.00      1.00        18
                     Atherosclerosis       1.00      1.00      1.00         4
               Autonomic Dysfunction       1.00      1.00      1.00        14
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        14
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00        20
                            Diabetes       1.00      1.00      1.00        25
                             Healthy       1.00      1.00      1.00        21
                        Hypertension       1.00      1.00      1.00        18
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00        21
            Stress-related Disorders       1.00      1.00      1.00        22

                            accuracy         

<h5 style="color: SkyBlue;">Random Forest</h5>

In [10]:
random_forest_model = RandomForestClassifier(random_state=42)
train_and_evaluate(random_forest_model, X_resampled, y_resampled, batch_sizes)


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        23
                         Arrhythmias       1.00      1.00      1.00        18
                     Atherosclerosis       1.00      1.00      1.00         4
               Autonomic Dysfunction       1.00      1.00      1.00        14
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        14
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00        20
                            Diabetes       1.00      1.00      1.00        25
                             Healthy       1.00      1.00      1.00        21
                        Hypertension       1.00      1.00      1.00        18
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00        21
            Stress-related Disorders       1.00      1.00      1.00        22

                            accuracy         

<h5 style="color: SkyBlue;">Gradient Boosting (GBM)</h5>

In [11]:
gradient_boosting_model = GradientBoostingClassifier(random_state=42)
train_and_evaluate(gradient_boosting_model, X_resampled, y_resampled, batch_sizes)


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        23
                         Arrhythmias       1.00      1.00      1.00        18
                     Atherosclerosis       1.00      1.00      1.00         4
               Autonomic Dysfunction       1.00      1.00      1.00        14
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        14
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00        20
                            Diabetes       1.00      1.00      1.00        25
                             Healthy       1.00      1.00      1.00        21
                        Hypertension       1.00      1.00      1.00        18
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00        21
            Stress-related Disorders       1.00      1.00      1.00        22

                            accuracy         

<h5 style="color: SkyBlue;">K-Nearest Neighbors (KNN)</h5>

In [12]:
knn_model = KNeighborsClassifier()
train_and_evaluate(knn_model, X_resampled, y_resampled, batch_sizes)


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       0.49      0.91      0.64        23
                         Arrhythmias       0.44      0.94      0.60        18
                     Atherosclerosis       1.00      0.50      0.67         4
               Autonomic Dysfunction       0.53      0.64      0.58        14
        Cardiovascular Disease (CVD)       0.00      0.00      0.00        14
      Chronic Fatigue Syndrome (CFS)       0.79      0.55      0.65        20
                            Diabetes       0.65      0.80      0.71        25
                             Healthy       0.78      0.86      0.82        21
                        Hypertension       1.00      0.39      0.56        18
Respiratory Disease (COPD or Asthma)       0.88      0.33      0.48        21
            Stress-related Disorders       0.73      0.50      0.59        22

                            accuracy         

<h5 style="color: SkyBlue;">XGBoost</h5>

In [15]:
# y_resampled_encoded to a Pandas Series
y_resampled_encoded = pd.Series(y_resampled_encoded, index=X_resampled.index)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
train_and_evaluate(xgb_model, X_resampled, y_resampled_encoded, batch_sizes)

Parameters: { "use_label_encoder" } are not used.




Batch Size: 200
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00        14
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00        20
           6       1.00      1.00      1.00        25
           7       1.00      1.00      1.00        21
           8       1.00      1.00      1.00        18
           9       1.00      1.00      1.00        21
          10       1.00      1.00      1.00        22

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Accuracy: 1.0



Parameters: { "use_label_encoder" } are not used.




Batch Size: 400
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        44
           1       1.00      1.00      1.00        35
           2       1.00      1.00      1.00        12
           3       1.00      1.00      1.00        41
           4       1.00      1.00      1.00        22
           5       1.00      1.00      1.00        38
           6       1.00      1.00      1.00        45
           7       1.00      1.00      1.00        45
           8       1.00      1.00      1.00        32
           9       1.00      1.00      1.00        39
          10       1.00      1.00      1.00        47

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Accuracy: 1.0



Parameters: { "use_label_encoder" } are not used.




Batch Size: 800
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        82
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        74
           4       1.00      1.00      1.00        53
           5       1.00      1.00      1.00        78
           6       1.00      1.00      1.00        83
           7       1.00      1.00      1.00        87
           8       1.00      1.00      1.00        58
           9       1.00      1.00      1.00        76
          10       1.00      1.00      1.00        89

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800

Accuracy: 1.0



Parameters: { "use_label_encoder" } are not used.




Batch Size: 1000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00       101
           2       1.00      1.00      1.00        35
           3       1.00      1.00      1.00        95
           4       1.00      1.00      1.00        71
           5       1.00      1.00      1.00       101
           6       1.00      1.00      1.00       106
           7       1.00      1.00      1.00       105
           8       1.00      1.00      1.00        73
           9       1.00      1.00      1.00        98
          10       1.00      1.00      1.00       105

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Accuracy: 1.0



Parameters: { "use_label_encoder" } are not used.




Batch Size: 2000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       213
           1       1.00      1.00      1.00       205
           2       1.00      1.00      1.00        67
           3       1.00      1.00      1.00       206
           4       1.00      1.00      1.00       132
           5       1.00      1.00      1.00       203
           6       1.00      1.00      1.00       190
           7       1.00      1.00      1.00       211
           8       1.00      1.00      1.00       153
           9       1.00      1.00      1.00       205
          10       1.00      1.00      1.00       215

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Accuracy: 1.0



Parameters: { "use_label_encoder" } are not used.




Batch Size: 15000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1547
           1       1.00      1.00      1.00      1604
           2       1.00      1.00      1.00       561
           3       1.00      1.00      1.00      1582
           4       1.00      1.00      1.00       859
           5       1.00      1.00      1.00      1519
           6       1.00      1.00      1.00      1551
           7       1.00      1.00      1.00      1546
           8       1.00      1.00      1.00      1099
           9       1.00      1.00      1.00      1563
          10       1.00      1.00      1.00      1569

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000

Accuracy: 1.0



<h5 style="color: SkyBlue;">Extra Trees Classifier</h5>

In [14]:
extra_trees_model = ExtraTreesClassifier(random_state=42)
train_and_evaluate(extra_trees_model, X_resampled, y_resampled, batch_sizes)


Batch Size: 200
                                      precision    recall  f1-score   support

                             Anaemia       1.00      1.00      1.00        23
                         Arrhythmias       1.00      1.00      1.00        18
                     Atherosclerosis       1.00      1.00      1.00         4
               Autonomic Dysfunction       1.00      1.00      1.00        14
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        14
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00        20
                            Diabetes       1.00      1.00      1.00        25
                             Healthy       1.00      1.00      1.00        21
                        Hypertension       1.00      1.00      1.00        18
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00        21
            Stress-related Disorders       1.00      1.00      1.00        22

                            accuracy         