### U ovom projektu korišten je skup podataka koji sadrži informacije o pacijentima i govori nam da li je određeni pacijent imao srčani udar ili ne. 

### Na osnovu tih podataka gradi se neuronska mreža koja uči i predviđa je li pacijent imao srčani udar ili ne

In [15]:
import torch
import pandas as pd
import numpy as np


In [16]:
# Podaci su preuzeti sa: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download
stroke_df = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [17]:
stroke_df.groupby(["stroke"]).size()

stroke
0    4861
1     249
dtype: int64

In [18]:
stroke_df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [19]:
stroke_df["bmi"].mean()

28.893236911794666

In [20]:
# Popunjavanje NaN vrijednosti
stroke_df["bmi"].fillna(stroke_df["bmi"].mean(), inplace=True)
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [21]:
# Enkodiranje kategoričkih varijabli

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

X = stroke_df.drop(["id", "stroke"], axis=1)
y = stroke_df["stroke"]

categorical_columns = ["gender", "hypertension", "heart_disease", "ever_married", "work_type", "Residence_type", "smoking_status",]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                              one_hot,
                              categorical_columns)],
                              remainder="passthrough")
transformd_X = transformer.fit_transform(X)
transformd_X

# Podjela u setove za treniranje i testiranje
X_train, X_test, y_train, y_test = train_test_split(transformd_X, y, test_size=0.2)

In [22]:
# Skaliranje značajki da bi dobili svojstva standardne normalne distribucije 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Pretvaranje pd.Series u numpy objekt
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [23]:
# pretvaranje u PyTorch tenzore
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [24]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [25]:
# Gradnja neuronke mreže
import torch.nn as nn
import torch.nn.functional as F

class BinaryClassificationModel(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


input_dim = X_train.shape[1]
model = BinaryClassificationModel(input_dim)
criterion = nn.BCELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [26]:
# Treniranje modela 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/100, Loss: 0.31212570014758967
Epoch 2/100, Loss: 0.1683450294076465
Epoch 3/100, Loss: 0.16152892675017938
Epoch 4/100, Loss: 0.15721988504810724
Epoch 5/100, Loss: 0.15495648607611656
Epoch 6/100, Loss: 0.1528918921831064
Epoch 7/100, Loss: 0.15208406119199935
Epoch 8/100, Loss: 0.1496796555875335
Epoch 9/100, Loss: 0.1483071287511848
Epoch 10/100, Loss: 0.14694459254678804
Epoch 11/100, Loss: 0.1465168381400872
Epoch 12/100, Loss: 0.14491285663098097
Epoch 13/100, Loss: 0.14557859582419042
Epoch 14/100, Loss: 0.14321434503654018
Epoch 15/100, Loss: 0.1423710950257373
Epoch 16/100, Loss: 0.14185555739095435
Epoch 17/100, Loss: 0.140522708374192
Epoch 18/100, Loss: 0.13948870956664905
Epoch 19/100, Loss: 0.13919566036202013
Epoch 20/100, Loss: 0.1378356729983352
Epoch 21/100, Loss: 0.1369039027558756
Epoch 22/100, Loss: 0.13628718953987118
Epoch 23/100, Loss: 0.13380193621560466
Epoch 24/100, Loss: 0.13413939860765822
Epoch 25/100, Loss: 0.13222299216431566
Epoch 26/100, Loss:

In [27]:
# Evaulacija modela
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 93.84%


In [28]:
from sklearn.metrics import classification_report


model.eval()  

all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()  
        all_labels.extend(labels.cpu().numpy()) 
        all_predictions.extend(predicted.cpu().numpy()) 


report = classification_report(all_labels, all_predictions, target_names=['Class 0', 'Class 1'])
print(report)

              precision    recall  f1-score   support

     Class 0       0.97      0.97      0.97       979
     Class 1       0.24      0.21      0.22        43

    accuracy                           0.94      1022
   macro avg       0.60      0.59      0.60      1022
weighted avg       0.93      0.94      0.94      1022

