In [14]:
import pandas as pd
from torch.utils.data import Dataset


class WaterDataset(Dataset):
    def __init__(self, csv_path):
        super().__init__()
        # Load data to pandas DataFrame
        df = pd.read_csv(csv_path)
        # Convert data to a NumPy array and assign to self.data
        self.data = df.to_numpy().astype(float)

    # Implement __len__ to return the number of data samples
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        features = self.data[idx, :-1]
        # Assign last data column to label
        label = self.data[idx, -1]
        return features, label

In [27]:
from torch.utils.data import DataLoader

# Create an instance of the WaterDataset
dataset_train = WaterDataset('water_potability/water_train.csv')

# Create a DataLoader based on dataset_train
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2,
    shuffle=True,
)

# Get a batch of features and labels
features, labels = next(iter(dataloader_train))
print(features, labels)

tensor([[0.4077, 0.3888, 0.3353, 0.4676, 0.5752, 0.5531, 0.4969, 0.6225, 0.5339],
        [0.4026, 0.4616, 0.4695, 0.3764, 0.5797, 0.2016, 0.4102, 0.2095, 0.5361]],
       dtype=torch.float64) tensor([0., 1.], dtype=torch.float64)


In [28]:
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Define the three linear layers
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        # Pass x through linear layers adding activations
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [29]:
def train_model(optimizer, net, num_epochs):
    criterion = nn.BCELoss()
    for epoch in range(num_epochs):
        running_loss = 0.
        for features, labels in dataloader_train:
            optimizer.zero_grad()
            outputs = net(features.float())
            loss = criterion(outputs, labels.float().view(-1, 1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
    train_loss = running_loss / len(dataloader_train)
    print(f"Training loss after {num_epochs} epochs: {train_loss}")

In [30]:
import torch.optim as optim

net = Net()

# Define the SGD optimizer
optimizer = optim.SGD(net.parameters(), lr=0.001)

train_model(
    optimizer=optimizer,
    net=net,
    num_epochs=10,
)

Training loss after 10 epochs: 0.673655244889879


In [31]:
# Define the RMSprop optimizer
optimizer = optim.RMSprop(net.parameters(), lr=0.001)

train_model(
    optimizer=optimizer,
    net=net,
    num_epochs=10,
)

Training loss after 10 epochs: 0.6716785424584103


In [32]:
# Define the Adam optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001)

train_model(
    optimizer=optimizer,
    net=net,
    num_epochs=10,
)

Training loss after 10 epochs: 0.658821379237213


In [36]:
dataset_test = WaterDataset('water_potability/water_test.csv')

# Create a DataLoader based on dataset_test
dataloader_test = DataLoader(
    dataset_test,
    batch_size=2,
    shuffle=True,
)
next(iter(dataloader_test))

[tensor([[0.6762, 0.6174, 0.5584, 0.9035, 0.5294, 0.5296, 0.6810, 0.6309, 0.3726],
         [0.5324, 0.6141, 0.2567, 0.4551, 0.6904, 0.5070, 0.2701, 0.4640, 0.5566]],
        dtype=torch.float64),
 tensor([1., 1.], dtype=torch.float64)]

In [40]:
import torch
from torchmetrics import Accuracy

def compute_acc(dataloader, model):
    # Set up binary accuracy metric
    acc = Accuracy(task='binary')
    model.eval()
    with torch.no_grad():
        for features, labels in dataloader:
            # Get predicted probabilities for test data batch
            outputs = model(features.float())
            preds = (outputs >= 0.5).float()
            acc(preds, labels.view(-1, 1))
    
    # Compute total test accuracy
    test_accuracy = acc.compute()
    print(f"Test accuracy: {test_accuracy}")
compute_acc(dataloader_test, net)    

Test accuracy: 0.5944334268569946


In [38]:
from torch.nn import init


class ImprovedNet(nn.Module):
    def __init__(self):
        super(ImprovedNet, self).__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        # Add two batch normalization layers
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(8)
        # Apply He initialization
        init.kaiming_uniform_(self.fc1.weight)
        init.kaiming_uniform_(self.fc2.weight)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity='sigmoid')

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.functional.elu(x)

        # Pass x through the second set of layers
        x = self.fc2(x)
        x = self.bn2(x)
        x = nn.functional.elu(x)

        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [43]:
improved_net = ImprovedNet()
train_model(
    optimizer=optimizer,
    net=improved_net,
    num_epochs=100,
)

Training loss after 100 epochs: 0.6760912502001705


In [44]:
compute_acc(dataloader_test, improved_net)

Test accuracy: 0.5904572606086731
