In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
df = pd.read_csv("../data/heart.csv")

In [43]:
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]

# Vorverarbeitung (z. B. OneHotEncoding für Kategorien)
X = pd.get_dummies(X)
#speichern der reihenfolge für API
with open("../ml/training_columns.txt", "w") as f:
    for col in X.columns:
        f.write(f"{col}\n")



In [44]:
print(X)

     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  Sex_F  Sex_M  \
0     40        140          289          0    172      0.0  False   True   
1     49        160          180          0    156      1.0   True  False   
2     37        130          283          0     98      0.0  False   True   
3     48        138          214          0    108      1.5   True  False   
4     54        150          195          0    122      0.0  False   True   
..   ...        ...          ...        ...    ...      ...    ...    ...   
913   45        110          264          0    132      1.2  False   True   
914   68        144          193          1    141      3.4  False   True   
915   57        130          131          0    115      1.2  False   True   
916   57        130          236          0    174      0.0   True  False   
917   38        138          175          0    173      0.0  False   True   

     ChestPainType_ASY  ChestPainType_ATA  ChestPainType_NAP  \
0          

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


First sklearn

In [46]:
model = RandomForestClassifier()

precision: $p=tp/(tp+fp)$,
recall: $r=tp/(tp+fn)$,
$F_\beta$: weighted precision and recall mean= $(1+\beta)^2 \frac{p \cdot r}{\beta^2 p+recall}$

In [47]:
model.fit(X_train, y_train)

print(classification_report(y_test, model.predict(X_test)))
joblib.dump(model, "../ml/model_sklearn.pkl")

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       112
           1       0.92      0.88      0.90       164

    accuracy                           0.88       276
   macro avg       0.88      0.89      0.88       276
weighted avg       0.89      0.88      0.88       276



['../ml/model_sklearn.pkl']

High scores on first try 

Now PyTorch


In [48]:
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import sys
import os
sys.path.append(os.path.abspath(".."))
from ml.model import HeartModel

In [49]:
X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
y_train_tensor = torch.tensor(y_train.values.astype(np.int64))

X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.int64))

In [50]:
model = HeartModel(input_dim=X_train.shape[1])
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Training Loop
for epoch in range(30):
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

Epoch 1: Loss = 0.3069
Epoch 2: Loss = 0.3144
Epoch 3: Loss = 0.2521
Epoch 4: Loss = 1.9700
Epoch 5: Loss = 0.3968
Epoch 6: Loss = 0.0826
Epoch 7: Loss = 0.0740
Epoch 8: Loss = 0.0183
Epoch 9: Loss = 0.5940
Epoch 10: Loss = 1.2592
Epoch 11: Loss = 1.1213
Epoch 12: Loss = 0.1922
Epoch 13: Loss = 0.2317
Epoch 14: Loss = 0.0348
Epoch 15: Loss = 0.5112
Epoch 16: Loss = 0.1751
Epoch 17: Loss = 0.4008
Epoch 18: Loss = 0.1209
Epoch 19: Loss = 0.4517
Epoch 20: Loss = 0.0738
Epoch 21: Loss = 0.1296
Epoch 22: Loss = 1.5518
Epoch 23: Loss = 0.0789
Epoch 24: Loss = 0.0435
Epoch 25: Loss = 0.7086
Epoch 26: Loss = 0.2028
Epoch 27: Loss = 0.1052
Epoch 28: Loss = 0.6733
Epoch 29: Loss = 0.0184
Epoch 30: Loss = 0.0742


Overfitting in epoch 20?

In [51]:
from sklearn.metrics import accuracy_score

In [52]:
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)

In [53]:
y_pred = predicted.numpy()
y_true = y_test_tensor.numpy()

In [54]:
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Classification Report:")
print(classification_report(y_true, y_pred))

Accuracy: 0.8297101449275363
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.94      0.82       112
           1       0.95      0.76      0.84       164

    accuracy                           0.83       276
   macro avg       0.84      0.85      0.83       276
weighted avg       0.86      0.83      0.83       276



In [55]:
torch.save(model.state_dict(), "../ml/model_pytorch.pt")

In [56]:
#Load model
#model = HeartModel(input_dim=X_test.shape[1])
#model.load_state_dict(torch.load("ml/heart_model.pt"))
#model.eval()