In [1]:
import os
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# Make sure the output directories exist before anything is written to them:
# one for the pickled estimators, one for the confusion-matrix images.
for output_dir in ('models', 'confusion_matrices'):
    os.makedirs(output_dir, exist_ok=True)

# Read the raw table; the 'Diagnosis' column is the prediction target and every
# other column is used as a feature.
df = pd.read_csv('data.csv')
X = df.drop(columns='Diagnosis')

# Map the diagnosis values to integer class ids and persist the encoder so the
# ids can be translated back to the original values later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Diagnosis'])
joblib.dump(label_encoder, 'models/label_encoder.pkl')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both partitions. The scaler is saved alongside the models.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'models/scaler.pkl')

# Candidate classifiers, keyed by the name that is later used for the pickled
# model file and the confusion-matrix image.
#
# The train/test split is seeded, so the estimators that draw random numbers
# while fitting (tree-based models, and SVC when probability=True) are seeded
# with the same value; otherwise the saved models and reported scores change
# from run to run. LogisticRegression (default solver) and KNN are left as-is.
SEED = 42
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "SVC": SVC(probability=True, random_state=SEED),
    "KNN": KNeighborsClassifier(),
}

# Training
# For every candidate: fit on the scaled training data, pickle the fitted
# estimator, print a per-class report for the held-out split and save a
# confusion-matrix heatmap.

# LabelEncoder assigns ids 0..n-1 in the order of `classes_`. Passing the full
# id list explicitly keeps the report rows and the matrix axes aligned with the
# class names even when a rare class is missing from both y_test and y_pred
# (without it, the report gets a name/label count mismatch and the matrix
# becomes smaller than its tick labels).
class_names = label_encoder.classes_
class_ids = list(range(len(class_names)))

for name, model in tqdm(models.items(), desc="Training models"):
    print(f"\nTraining {name}")
    model.fit(X_train_scaled, y_train)
    joblib.dump(model, f'models/{name}.pkl')

    y_pred = model.predict(X_test_scaled)

    print(f"\n{name} Classification Report:\n")
    # zero_division=0 reports 0.0 (instead of warning) for classes that were
    # never predicted.
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'{name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'confusion_matrices/{name}_confusion_matrix.png')
    # Close this specific figure so figures do not accumulate across models.
    plt.close(fig)

Training models:   0%|          | 0/5 [00:00<?, ?it/s]


Training LogisticRegression

LogisticRegression Classification Report:

                                precision    recall  f1-score   support

                       Healthy       0.85      0.86      0.86        81
        Iron deficiency anemia       0.85      0.82      0.84        34
                      Leukemia       1.00      0.20      0.33         5
Leukemia with thrombocytopenia       1.00      0.33      0.50         3
             Macrocytic anemia       0.00      0.00      0.00         1
 Normocytic hypochromic anemia       0.74      0.71      0.73        56
Normocytic normochromic anemia       0.58      0.83      0.68        46
       Other microcytic anemia       0.40      0.14      0.21        14
              Thrombocytopenia       0.81      0.76      0.79        17

                      accuracy                           0.75       257
                     macro avg       0.69      0.52      0.55       257
                  weighted avg       0.75      0.75      0.74       257

Training models:  20%|██        | 1/5 [00:00<00:01,  2.90it/s]


Training DecisionTree

DecisionTree Classification Report:

                                precision    recall  f1-score   support

                       Healthy       1.00      1.00      1.00        81
        Iron deficiency anemia       1.00      1.00      1.00        34
                      Leukemia       1.00      1.00      1.00         5
Leukemia with thrombocytopenia       0.60      1.00      0.75         3
             Macrocytic anemia       1.00      1.00      1.00         1
 Normocytic hypochromic anemia       1.00      1.00      1.00        56
Normocytic normochromic anemia       1.00      0.98      0.99        46
       Other microcytic anemia       1.00      0.93      0.96        14
              Thrombocytopenia       1.00      1.00      1.00        17

                      accuracy                           0.99       257
                     macro avg       0.96      0.99      0.97       257
                  weighted avg       1.00      0.99      0.99       257



Training models:  40%|████      | 2/5 [00:00<00:01,  2.66it/s]


Training RandomForest

RandomForest Classification Report:

                                precision    recall  f1-score   support

                       Healthy       0.99      0.98      0.98        81
        Iron deficiency anemia       1.00      1.00      1.00        34
                      Leukemia       1.00      0.80      0.89         5
Leukemia with thrombocytopenia       1.00      1.00      1.00         3
             Macrocytic anemia       0.00      0.00      0.00         1
 Normocytic hypochromic anemia       1.00      1.00      1.00        56
Normocytic normochromic anemia       0.98      1.00      0.99        46
       Other microcytic anemia       1.00      1.00      1.00        14
              Thrombocytopenia       0.89      1.00      0.94        17

                      accuracy                           0.98       257
                     macro avg       0.87      0.86      0.87       257
                  weighted avg       0.98      0.98      0.98       257



Training models:  60%|██████    | 3/5 [00:01<00:01,  1.73it/s]


Training SVC

SVC Classification Report:

                                precision    recall  f1-score   support

                       Healthy       0.91      0.90      0.91        81
        Iron deficiency anemia       0.90      0.76      0.83        34
                      Leukemia       0.33      0.20      0.25         5
Leukemia with thrombocytopenia       0.00      0.00      0.00         3
             Macrocytic anemia       0.00      0.00      0.00         1
 Normocytic hypochromic anemia       0.66      0.79      0.72        56
Normocytic normochromic anemia       0.58      0.78      0.67        46
       Other microcytic anemia       0.00      0.00      0.00        14
              Thrombocytopenia       0.81      0.76      0.79        17

                      accuracy                           0.75       257
                     macro avg       0.47      0.47      0.46       257
                  weighted avg       0.71      0.75      0.73       257



Training models:  80%|████████  | 4/5 [00:02<00:00,  1.71it/s]


Training KNN

KNN Classification Report:

                                precision    recall  f1-score   support

                       Healthy       0.78      0.94      0.85        81
        Iron deficiency anemia       0.70      0.68      0.69        34
                      Leukemia       0.00      0.00      0.00         5
Leukemia with thrombocytopenia       0.00      0.00      0.00         3
             Macrocytic anemia       0.00      0.00      0.00         1
 Normocytic hypochromic anemia       0.61      0.68      0.64        56
Normocytic normochromic anemia       0.74      0.70      0.72        46
       Other microcytic anemia       0.60      0.43      0.50        14
              Thrombocytopenia       1.00      0.53      0.69        17

                      accuracy                           0.72       257
                     macro avg       0.49      0.44      0.46       257
                  weighted avg       0.70      0.72      0.70       257



Training models: 100%|██████████| 5/5 [00:02<00:00,  2.00it/s]
