# Import library

In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Import data

In [2]:
data = pd.read_csv('diabetes.csv')

In [3]:
data

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Age,Class
0,6,148,72,35,0,33.6,50,positive
1,1,85,66,29,0,26.6,31,negative
2,8,183,64,0,0,23.3,32,positive
3,1,89,66,23,94,28.1,21,negative
4,0,137,40,35,168,43.1,33,positive
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,63,negative
764,2,122,70,27,0,36.8,27,negative
765,5,121,72,23,112,26.2,30,negative
766,1,126,60,0,0,30.1,47,positive


In [4]:
x = data.iloc[:, 0:-1].values
y_string = list(data.iloc[:, -1])

In [5]:
print(x.shape, len(y_string))

(768, 7) 768


# Encoding 

In [6]:
y_int = []

for s in y_string:
    if s == 'positive':
        y_int.append(1)
    else:
        y_int.append(0)

In [None]:
y_int

In [8]:
y = np.array(y_int, dtype='float64')

In [9]:
y

array([1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0.

# Preprocessing

In [10]:
# normalze the features
sc = StandardScaler()

x = sc.fit_transform(x)

In [11]:
x

array([[ 0.63994726,  0.84832379,  0.14964075, ..., -0.69289057,
         0.20401277,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.69289057,
        -0.68442195, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -0.69289057,
        -1.10325546, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ...,  0.27959377,
        -0.73518964, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.69289057,
        -0.24020459,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.69289057,
        -0.20212881, -0.87137393]])

In [12]:
# convert array to tensor
x = torch.tensor(x)
y = torch.tensor(y)

In [13]:
x.shape, y.shape

(torch.Size([768, 7]), torch.Size([768]))

In [14]:
# add 1 dimension to y
y = y.unsqueeze(1)

# Dataset and DataLoader

Dataset

In [15]:
class Dataset(Dataset):
    def __init__(self, x, y):
        """
        Args:
            x (torch.Tensor): Fitur
            y (torch.Tensor): Label/Target
        """
        self.x = x
        self.y = y

    def __getitem__(self, index):
        """
        Mengambil data berdasarkan indeks (fitur dan label)
        Args:
            index (int): Index dari data yang ingin diambil
        """
        return self.x[index], self.y[index]
    
    def __len__(self):
        """
        Mengembalikan panjang dataset
        """
        return len(self.x)

In [21]:
# buat dataset untuk X dan Y
dataset = Dataset(x, y)

len(dataset)

32

In [18]:
# load into DataLoader
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [20]:
# melihat data pada data loader
print("Ada {} batch di dataset".format(len(train_loader)))

for (x, y) in train_loader:
    print("Untuk satu iterasi (batch) ada :")
    print("Data : {}".format(x.shape))
    print("Label : {}".format(y.shape))
    break

Ada 24 batch di dataset
Untuk satu iterasi (batch) ada :
Data : torch.Size([32, 7])
Label : torch.Size([32, 1])


# Model

Arsitektur : 
- 7 neurons (input)
- 5 neurons
- 4 neurons
- 3 neurons
- 1 neurons (output)

In [22]:
class Model(nn.Module):
    def __init__(self, input_features, output_features):
        super(Model, self).__init__()
        """
        Args : 
            input_features (int) : jumlah fitur input
            output_features (int) : jumlah fitur output
        """
        # layer
        self.fc1 = nn.Linear(input_features, 5)
        self.fc2 = nn.Linear(5, 4)
        self.fc3 = nn.Linear(4, 3)
        self.fc4 = nn.Linear(3, output_features)
        # activation function
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def forward(self, x):
        """
        Proses forward pass
        """
        out = self.fc1(x)
        out = self.tanh(out)
        out = self.fc2(out)
        out = self.tanh(out)
        out = self.fc3(out)
        out = self.tanh(out)
        out = self.fc4(out)
        out = self.sigmoid(out)

        return out

In [23]:
# inisialisasi model
net = Model(input_features=7, output_features=1)

In [24]:
# inisialisasi loss function
criterion = nn.BCELoss(size_average=True) # loss akan dihitung rata-rata dari semua batch

# inisialisasi optimizer
optimizer = torch.optim.SGD(net.parameters(), lr = 0.1, momentum=0.9)



# Training

In [26]:
epochs = 10

for epoch in range(10):

    for inputs, labels in train_loader:
        '''
        Loop akan meeksekusi sebanyak jumlah batch pada DataLoader.
        Ada beberapa langkah pada loop ini : 
        1. Forward : menghitung output dari model
        2. Hitung loss : menghitung loss antara output model dengan label yang sebenarnya
        3. Backward pass : menghitung gradien dari loss terhadap parameter model, sebelumnya kembalikan graiden ke nol
        4. Update parameter : mengupdate parameter model berdasarkan gradien yang telah dihitung
        '''
        # make sure inputs and labels is float
        inputs = inputs.float()
        labels = labels.float()
        # 1. Forward
        outputs = net(inputs)
        # 2. Hitung loss
        loss = criterion(outputs, labels)
        # 3. Kembalikan gradien ke nol (buffer gradien)
        optimizer.zero_grad()
        # 4. Backward (backprop)
        loss.backward()
        # 5. Update bobot (parameter)
        optimizer.step()

    # Hitung akurasi
    output = (outputs > 0.5).float()
    accuracy = (output == labels).float().mean()

    # print
    print("Epoch {}/{}, Loss : {:.3f}, Akurasi : {:.3f}".format(epoch + 1, epochs, loss, accuracy))

Epoch 1/10, Loss : 0.748, Akurasi : 0.438
Epoch 2/10, Loss : 0.391, Akurasi : 0.875
Epoch 3/10, Loss : 0.656, Akurasi : 0.562
Epoch 4/10, Loss : 0.651, Akurasi : 0.688
Epoch 5/10, Loss : 0.530, Akurasi : 0.750
Epoch 6/10, Loss : 0.425, Akurasi : 0.844
Epoch 7/10, Loss : 0.428, Akurasi : 0.750
Epoch 8/10, Loss : 0.436, Akurasi : 0.781
Epoch 9/10, Loss : 0.474, Akurasi : 0.781
Epoch 10/10, Loss : 0.471, Akurasi : 0.781
