In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# load data
data = pd.read_csv("./data/traininingdata.txt", sep=";")

label_encoders = (
    {}
)  # to store label encoders for each column for potential reverse transformation

for column in data.columns:
    if data[column].dtype == "object":
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le


In [2]:
import torch
import numpy as np


class BiClassfication(torch.nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=100):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.linear(x)
        x = torch.relu(x)
        x = self.linear2(x)
        x = torch.sigmoid(x)
        return x


# transform pandas data to ndarray
data = data.values
data = data.astype(np.float32)
X = data[:, :-1]
y = data[:, -1]

# normalize data
X = (X - X.mean(axis=0)) / X.std(axis=0)

# train model
input_dim = data.shape[1] - 1
model = BiClassfication(input_dim, 1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

X = torch.from_numpy(X)
y = torch.from_numpy(y)
y = y.reshape(-1, 1)

for epoch in range(1500):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    print("epoch: ", epoch, " loss: ", loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# save model
torch.save(model.state_dict(), "./model/model.pth")

epoch:  0  loss:  0.6786839365959167
epoch:  1  loss:  0.6670499444007874
epoch:  2  loss:  0.6556861996650696
epoch:  3  loss:  0.6445948481559753
epoch:  4  loss:  0.6337763667106628
epoch:  5  loss:  0.6232300996780396
epoch:  6  loss:  0.6129554510116577
epoch:  7  loss:  0.6029512882232666
epoch:  8  loss:  0.5932157039642334
epoch:  9  loss:  0.5837465524673462
epoch:  10  loss:  0.5745415687561035
epoch:  11  loss:  0.565597653388977
epoch:  12  loss:  0.5569109916687012
epoch:  13  loss:  0.5484773516654968
epoch:  14  loss:  0.5402923226356506
epoch:  15  loss:  0.5323519110679626
epoch:  16  loss:  0.5246508121490479
epoch:  17  loss:  0.5171841382980347
epoch:  18  loss:  0.5099471211433411
epoch:  19  loss:  0.5029339790344238
epoch:  20  loss:  0.4961392283439636
epoch:  21  loss:  0.4895574152469635
epoch:  22  loss:  0.48318254947662354
epoch:  23  loss:  0.4770088791847229
epoch:  24  loss:  0.4710308015346527
epoch:  25  loss:  0.4652423858642578
epoch:  26  loss:  0.4

In [3]:
# test model
model = BiClassfication(input_dim, 1)
model.load_state_dict(torch.load("./model/model.pth"))
model.eval()

test_data = pd.read_csv("./data/testdata.txt", sep=";")
for column in test_data.columns:
    if test_data[column].dtype == "object":
        le = label_encoders[column]
        test_data[column] = le.transform(test_data[column])
        label_encoders[column] = le

test_data = test_data.values
test_data = test_data.astype(np.float32)
X_test = test_data[:, :-1]
# normalize data
X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)
y_test = test_data[:, -1]

X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)

y_pred = model(X_test)
y_pred = y_pred.detach().numpy()
y_pred = y_pred > 0.5
y_pred = y_pred.astype(np.int32)

from sklearn.metrics import accuracy_score

print("accuracy: ", accuracy_score(y_test, y_pred))

accuracy:  0.8991485126617274
