In [None]:
import pickle
import pandas as pd
import numpy as np
from smote_sample import X, y
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

<h5 style="color: SkyBlue;">Load Dataset</h5>

In [5]:
df = pd.read_csv('user_data_for_disease_prediction - unclassified data set.csv')
print(df.head())

   Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
0                 0       -0.9        0.4        49.7      67.9   
1     

In [None]:
with open("balanced_data.pkl", "rb") as f:
    X_balanced, y_balanced = pickle.load(f)

print("Loaded Feature Shape:", X_balanced.shape)
print("Loaded Target Shape:", y_balanced.shape)
print("Loaded Distribution after SMOTE:\n", y_balanced.value_counts())

Disease
Atherosclerosis                         152809
Hypertension                            152809
Cardiovascular Disease (CVD)            152809
Chronic Fatigue Syndrome (CFS)          152809
Respiratory Disease (COPD or Asthma)    152809
Stress-related Disorders                152809
Arrhythmias                             152809
Healthy                                 152809
Autonomic Dysfunction                   152809
Diabetes                                152809
Anaemia                                 152809
Name: count, dtype: int64


<h5 style="color: SkyBlue;">Stratify Sampling</h5>

In [None]:
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]

training_sample_sizes = [max(size - size // 5, 1) for size in sample_sizes]  
testing_sample_sizes = [size - train_size for size, train_size in zip(sample_sizes, training_sample_sizes)]  

print("Training sample sizes:", training_sample_sizes)
print("Testing sample sizes:", testing_sample_sizes)

Training sample sizes: [20, 40, 60, 80, 200, 400, 600, 800, 2000, 4000, 6000, 8000, 16000, 24000, 32000, 40000]
Testing sample sizes: [5, 10, 15, 20, 50, 100, 150, 200, 500, 1000, 1500, 2000, 4000, 6000, 8000, 10000]


<h5 style="color: SkyBlue;">Logistic Regression</h5>

In [13]:
log_reg = LogisticRegression(max_iter=2000)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    log_reg.fit(X_train, y_train)
    
    y_pred = log_reg.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

log_reg_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(log_reg_results_df.T)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        1.000000  1.000000  1.000000  1.000000   
Atherosclerosis                       0.000000  0.000000  0.000000  0.000000   
Hypertension                          0.000000  0.000000  1.000000  0.000000   
Cardiovascular Disease (CVD)          0.000000  0.000000  0.000000  1.000000   
Respiratory Disease (COPD or Asthma)  0.000000  0.666667  0.666667  0.000000   
Autonomic Dysfunction                 0.500000  0.500000  0.000000  0.000000   
Arrhythmias                           0.666667  0.500000  0.000000  0.666667   
Anaemia                               0.000000  0.000000  0.000000  0.666667   
Stress-related Disorders              0.000000  0.000000  0.000000  1.000000   
Diabetes                              0.500000  0.000000  0.000000  0.000000   
Healthy                               0.000000  0.000000  0.000000  0.000000   
Accuracy (Overall)                    0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<h5 style="color: SkyBlue;">Decision Tree</h5>

In [15]:
decision_tree_model = DecisionTreeClassifier(random_state=42)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    decision_tree_model.fit(X_train, y_train)
    
    y_pred = decision_tree_model.predict(X_test)
    
    f2 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f2)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

decision_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(decision_results_df.T)

                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        1.000000  0.000000  1.000000  1.000000   
Atherosclerosis                       0.000000  1.000000  1.000000  1.000000   
Hypertension                          1.000000  0.000000  0.000000  0.000000   
Cardiovascular Disease (CVD)          0.000000  1.000000  1.000000  1.000000   
Respiratory Disease (COPD or Asthma)  0.000000  0.666667  0.666667  1.000000   
Autonomic Dysfunction                 0.666667  1.000000  0.666667  1.000000   
Arrhythmias                           0.000000  1.000000  1.000000  1.000000   
Anaemia                               0.000000  1.000000  0.000000  1.000000   
Stress-related Disorders              0.400000  1.000000  0.000000  0.666667   
Diabetes                              0.000000  1.000000  1.000000  1.000000   
Healthy                               0.000000  0.666667  0.666667  1.000000   
Accuracy (Overall)                    0.

<h5 style="color: SkyBlue;">Random Forest</h5>

In [16]:
random_forest_model = RandomForestClassifier(random_state=42)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    random_forest_model.fit(X_train, y_train)
    
    y_pred = random_forest_model.predict(X_test)
    
    f3 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f3)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

randomforest_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(randomforest_results_df.T)

                                         20        40        60     80     \
Chronic Fatigue Syndrome (CFS)        1.000000  0.666667  0.666667    1.0   
Atherosclerosis                       0.000000  0.000000  1.000000    1.0   
Hypertension                          0.500000  0.000000  0.000000    1.0   
Cardiovascular Disease (CVD)          0.000000  1.000000  1.000000    1.0   
Respiratory Disease (COPD or Asthma)  0.000000  0.000000  0.000000    1.0   
Autonomic Dysfunction                 0.000000  0.500000  0.666667    1.0   
Arrhythmias                           1.000000  0.000000  0.666667    1.0   
Anaemia                               0.000000  0.666667  0.000000    1.0   
Stress-related Disorders              0.000000  0.000000  0.000000    1.0   
Diabetes                              0.000000  1.000000  1.000000    1.0   
Healthy                               0.000000  0.000000  0.666667    1.0   
Accuracy (Overall)                    0.272727  0.454545  0.636364    1.0   

<h5 style="color: SkyBlue;">Gradient Boosting (GBM)</h5>

In [17]:
gradient_boosting_model = GradientBoostingClassifier(random_state=42)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    gradient_boosting_model.fit(X_train, y_train)
    
    y_pred = gradient_boosting_model.predict(X_test)
    
    f4 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f4)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

gbm_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(gbm_results_df.T)

                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        0.400000  1.000000  0.000000  0.000000   
Atherosclerosis                       1.000000  0.000000  1.000000  0.666667   
Hypertension                          0.000000  0.000000  0.000000  0.000000   
Cardiovascular Disease (CVD)          1.000000  0.000000  0.000000  0.000000   
Respiratory Disease (COPD or Asthma)  0.000000  1.000000  0.000000  1.000000   
Autonomic Dysfunction                 0.000000  0.000000  0.000000  1.000000   
Arrhythmias                           0.000000  1.000000  0.666667  0.000000   
Anaemia                               0.000000  1.000000  0.500000  0.000000   
Stress-related Disorders              0.000000  0.000000  0.000000  0.500000   
Diabetes                              1.000000  0.666667  0.000000  0.000000   
Healthy                               0.000000  0.000000  0.000000  0.666667   
Accuracy (Overall)                    0.

<h5 style="color: SkyBlue;">K-Nearest Neighbors (KNN)</h5>

In [18]:
knn_model = KNeighborsClassifier()

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    knn_model.fit(X_train, y_train)
    
    y_pred = knn_model.predict(X_test)
    
    f5 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f5)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

knn_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(knn_results_df.T)

                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        0.000000  0.000000  0.000000  0.500000   
Atherosclerosis                       0.000000  0.000000  0.000000  0.000000   
Hypertension                          0.000000  0.000000  1.000000  0.000000   
Cardiovascular Disease (CVD)          0.000000  0.000000  0.000000  0.000000   
Respiratory Disease (COPD or Asthma)  0.285714  0.000000  0.000000  0.000000   
Autonomic Dysfunction                 0.500000  0.500000  0.000000  0.000000   
Arrhythmias                           1.000000  0.000000  0.500000  0.666667   
Anaemia                               0.000000  0.500000  0.000000  0.500000   
Stress-related Disorders              0.000000  0.000000  0.000000  0.000000   
Diabetes                              0.000000  0.000000  0.000000  0.000000   
Healthy                               0.000000  0.000000  0.000000  0.000000   
Accuracy (Overall)                    0.

<h5 style="color: SkyBlue;">XGBoost</h5>

In [22]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, train_size=train_size, test_size=test_size, stratify=y_encoded)
    
    xgb_model.fit(X_train, y_train)
    
    y_pred = xgb_model.predict(X_test)
    
    f6 = f1_score(y_test, y_pred, average=None, labels=range(len(label_encoder.classes_)))
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f6)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

xgb_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(xgb_results_df.T)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        0.000000  0.000000  1.000000  0.666667   
Atherosclerosis                       0.500000  1.000000  0.666667  0.000000   
Hypertension                          1.000000  0.000000  0.000000  1.000000   
Cardiovascular Disease (CVD)          0.000000  0.000000  1.000000  0.000000   
Respiratory Disease (COPD or Asthma)  0.000000  0.666667  0.000000  0.000000   
Autonomic Dysfunction                 0.666667  0.666667  1.000000  0.666667   
Arrhythmias                           0.000000  1.000000  0.666667  1.000000   
Anaemia                               0.000000  0.000000  1.000000  1.000000   
Stress-related Disorders              0.000000  0.000000  0.000000  0.000000   
Diabetes                              0.000000  0.000000  1.000000  0.000000   
Healthy                               0.000000  0.000000  0.000000  0.666667   
Accuracy (Overall)                    0.

<h5 style="color: SkyBlue;">Extra Trees Classifier</h5>

In [23]:
extra_trees_model = ExtraTreesClassifier(random_state=42)

f1_scores = []
accuracy_scores = []

min_class_samples = 1 

for train_size in training_sample_sizes:
    test_size = min_class_samples * 11  
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, stratify=y)
    
    extra_trees_model.fit(X_train, y_train)
    
    y_pred = extra_trees_model.predict(X_test)
    
    f7 = f1_score(y_test, y_pred, average=None, labels=df["Disease"].unique()) 
    
    accuracy = accuracy_score(y_test, y_pred)
    
    f1_scores.append(f7)
    accuracy_scores.append(accuracy)

diseases = df["Disease"].unique()  
f1_scores_df = pd.DataFrame(f1_scores, columns=diseases, index=training_sample_sizes)

accuracy_df = pd.DataFrame(accuracy_scores, columns=["Accuracy (Overall)"], index=training_sample_sizes)

extratrees_results_df = pd.concat([f1_scores_df, accuracy_df], axis=1)

print(extratrees_results_df.T)

                                         20        40        60        80     \
Chronic Fatigue Syndrome (CFS)        1.000000  0.400000  1.000000  1.000000   
Atherosclerosis                       0.000000  0.000000  0.666667  0.000000   
Hypertension                          0.000000  0.000000  1.000000  1.000000   
Cardiovascular Disease (CVD)          0.000000  0.000000  0.000000  0.666667   
Respiratory Disease (COPD or Asthma)  0.000000  0.000000  1.000000  1.000000   
Autonomic Dysfunction                 0.000000  1.000000  0.666667  1.000000   
Arrhythmias                           0.000000  0.000000  1.000000  0.666667   
Anaemia                               0.500000  0.000000  0.000000  1.000000   
Stress-related Disorders              1.000000  0.000000  0.000000  1.000000   
Diabetes                              1.000000  0.000000  1.000000  0.000000   
Healthy                               0.000000  0.666667  0.000000  1.000000   
Accuracy (Overall)                    0.