In [67]:
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
from torchmetrics import Accuracy
from sklearn.model_selection import train_test_split

# PyTorch Dataset


Before model training can commence, you need to load the data and pass it to the model in the right format. In PyTorch, this is handled by Datasets and DataLoaders. Let's start with building a PyTorch Dataset for our water potability data.

In this exercise, you will define a class called WaterDataset to load the data from a CSV file. To do this, you will need to implement the three methods which PyTorch expects a Dataset to have:

* .__init__() to load the data,
* .__len__() to return data size,
* .__getitem()__ to extract features and label for a single sample.

In the .__init__() method, load the data from csv_path to a pandas DataFrame and assign it to df.

Convert df to a NumPy array and assign the result to self.data.

In the .__getitem__() method, get the label by slicing self.data to extract its last column for the index idx, similarly to how it's done for the features.

In [4]:
csv_path = "/kaggle/input/water-protability/water_potability.csv"

In [40]:
# class WaterDataset(Dataset):
#     def __init__(self, csv_path):
#         super().__init__()
#         # Load data to pandas DataFrame
# #         df = pd.read_csv(csv_path)
        
#         self.data = pd.read_csv(csv_path).fillna(0)  # Fill NaN values with 0
#         # Convert data to a NumPy array and assign to self.data
#         self.data = df.to_numpy()
    
#      # Implement __len__ to return the number of data samples
#     def __len__(self):
#         return self.data.shape[0]
    
#     def __getitem__(self, idx):
#         features = self.data[idx, :-1]
#          # Assign last data column to label
#         label = self.data[idx, -1]
#         return features, label

In [46]:
class WaterDataset(Dataset):
    def __init__(self, csv_path):
        super().__init__()
        # Load data to pandas DataFrame
        df = pd.read_csv(csv_path).fillna(0)  # Fill NaN values with 0
        # Convert data to a NumPy array and assign to self.data
        self.data = df.to_numpy()
    
    # Implement __len__ to return the number of data samples
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        features = self.data[idx, :-1]
        # Assign last data column to label
        label = self.data[idx, -1]
        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

Next, let's feed it to the DataLoader to serve training data to the model!

**PyTorch DataLoader**

The next step in preparing the training data is to set up a DataLoader. A PyTorch DataLoader can be created from a Dataset to load data, split it into batches, and perform transformations on the data if desired. Then, it yields a data sample ready for training.

In this exercise, you will build a DataLoader based on the WaterDataset.

1. Create an instance of WaterDataset from water_train.csv, assigning it to dataset_train.
2. Create dataloader_train based on dataset_train, using a batch size of two and shuffling the samples.
3. Get a batch of features and labels from the DataLoader and print them.

In [73]:
# Create an instance of the WaterDataset
dataset = WaterDataset("/kaggle/input/water-protability/water_potability.csv")

# Split dataset into train and test sets
train_size = 0.8  # 80% training, 20% testing
test_size = 1 - train_size
dataset_train, dataset_test = train_test_split(dataset, test_size=test_size, random_state=42)

# Create a DataLoader based on dataset_train
dataloader_train = DataLoader(
    dataset_train,
    batch_size= 4,
    shuffle=True
)

dataloader_test = DataLoader(
    dataset_test,
    batch_size=4,
    shuffle=False  # No need to shuffle test data
)

print("Train Datalaoder")
# Get a batch of features and labels
features, labels = next(iter(dataloader_train))
print(f"features : {features}, \nLabels :{labels}")

print("\nTest Datalaoder")
features, labels = next(iter(dataloader_test))
print(f"features : {features}, \nLabels :{labels}")

Train Datalaoder
features : tensor([[2.9252e+00, 2.6046e+02, 3.9233e+04, 7.6701e+00, 3.0340e+02, 4.0324e+02,
         1.1309e+01, 6.9238e+01, 3.6786e+00],
        [6.8479e+00, 2.3494e+02, 2.4456e+04, 5.5962e+00, 3.3487e+02, 3.3757e+02,
         9.5153e+00, 7.7297e+01, 4.9212e+00],
        [0.0000e+00, 2.0281e+02, 1.1900e+04, 8.9481e+00, 3.6753e+02, 3.7956e+02,
         1.3592e+01, 5.8823e+01, 3.1538e+00],
        [7.7750e+00, 1.9518e+02, 2.2330e+04, 6.1051e+00, 4.1695e+02, 3.7162e+02,
         1.0964e+01, 0.0000e+00, 4.6995e+00]]), 
Labels :tensor([0., 1., 0., 0.])

Test Datalaoder
features : tensor([[0.0000e+00, 1.8352e+02, 2.0461e+04, 7.3332e+00, 3.3312e+02, 3.5637e+02,
         2.0179e+01, 6.7020e+01, 4.8866e+00],
        [6.6432e+00, 1.8891e+02, 3.2874e+04, 6.7915e+00, 3.3385e+02, 3.3656e+02,
         1.4707e+01, 6.7845e+01, 4.5622e+00],
        [7.8461e+00, 2.2406e+02, 2.3264e+04, 5.9224e+00, 3.0040e+02, 3.8797e+02,
         1.3407e+01, 4.3075e+01, 2.4880e+00],
        [7.1605e+00

You now have the data loaded and served for model training. It's time to build the model itself!

**PyTorch Model**

You will use the OOP approach to define the model architecture. Recall that this requires setting up a model class and defining two methods inside it:

.__init__(), in which you define the layers you want to use;

forward(), in which you define what happens to the model inputs once it receives them; this is where you pass inputs through pre-defined layers.

Let's build a model with three linear layers and ReLU activations. After the last linear layer, you need a sigmoid activation instead, which is well-suited for binary classification tasks like our water potability prediction problem. 

* In the .__init__() method, define the three linear layers with dimensions corresponding to the model definition provided and assign them to self.fc1, self.fc2, and self.fc3, respectively.
* In the forward() method, pass the model input x through all the layers, remembering to add activations on top of them, similarly how it's already done for the first layer.

In [18]:
model = nn.Sequential(
  nn.Linear(9, 16),
  nn.ReLU(),
  nn.Linear(16, 8),
  nn.ReLU(),
  nn.Linear(8, 1),
  nn.Sigmoid(),
)

In [74]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Define the three linear layers
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        
    def forward(self, x):
        # Pass x through linear layers adding activations
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [75]:
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=9, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=1, bias=True)
)


That's a neat model definition. Next,  model training, evaluation, and optimizers.

In [76]:
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr = 0.0001)

for epoch in range(10):
    for features, labels in dataloader_train:
        
        features = features.float()  # Ensure features are Float
        labels = labels.float()      # Ensure labels are Float
        
        optimizer.zero_grad()
        outputs = model(features)
        
        # Skip this batch if there are NaN values in the outputs
        if torch.isnan(outputs).any():
            print("NaN value found in outputs, skipping this batch")
            continue
        
        # Debugging prints
        print(f'Outputs: {outputs}')
        print(f'Labels: {labels}')
        
        # Reshape labels to match the outputs
        loss = criterion(outputs, labels.view(-1, 1))
        
        # Skip this batch if there are NaN values in the loss
        if torch.isnan(loss).any():
            print("NaN value found in loss, skipping this batch")
            continue
        
        loss.backward()
        optimizer.step()
        
        print(f'Loss: {loss.item()}')

Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([1., 0., 0., 0.])
Loss: 25.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([0., 1., 0., 0.])
Loss: 25.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([0., 0., 1., 1.])
Loss: 50.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([0., 0., 1., 0.])
Loss: 25.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([1., 1., 1., 0.])
Loss: 75.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([0., 1., 0., 0.])
Loss: 25.0
Outputs: tensor([[0.],
        [0.],
        [0.],
        [0.]], grad_fn=<SigmoidBackward0>)
Labels: tensor([0., 0., 1., 0.])
Loss: 25.0
Outputs: tensor([[0.],
        [0.

**Optimizers**

It's time to explore the different optimizers that you can use for training your model.

A custom function called train_model(optimizer, net, num_epochs) has been defined for you. It takes the optimizer, the model, and the number of epochs as inputs, runs the training loops, and prints the training loss at the end.

In [77]:
# Define the train_model function
def train_model(optimizer, net, num_epochs):
    criterion = nn.BCELoss()
    
    for epoch in range(num_epochs):
        net.train()  # Set the model to training mode
        running_loss = 0.0
        
        for features, labels in dataloader_train:
            features = features.float()  # Ensure features are Float
            labels = labels.float()      # Ensure labels are Float
            
            optimizer.zero_grad()
            outputs = net(features)
            
            loss = criterion(outputs, labels.view(-1, 1))
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * features.size(0)
        
        epoch_loss = running_loss / len(dataset_train)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')
    
    print('Finished Training')

**Define the optimizer as Stochastic Gradient Descent.**

In [78]:
net = Net()

# Define the SGD optimizer
optimizer1 = optim.SGD(net.parameters(), lr=0.001)

train_model(
    optimizer=optimizer1,
    net=net,
    num_epochs=10,
)

Epoch [1/10], Loss: 46.5578
Epoch [2/10], Loss: 39.4171
Epoch [3/10], Loss: 39.4171
Epoch [4/10], Loss: 39.4171
Epoch [5/10], Loss: 39.4171
Epoch [6/10], Loss: 39.4171
Epoch [7/10], Loss: 39.4171
Epoch [8/10], Loss: 39.4171
Epoch [9/10], Loss: 39.4171
Epoch [10/10], Loss: 39.4171
Finished Training


**Define the optimizer as Root Mean Square Propagation (RMSprop), passing the model's parameters as its first argument.**

In [79]:
# Define the RMSprop optimizer
optimizer2 = optim.RMSprop(model.parameters(), lr=0.0001)

train_model(
    optimizer=optimizer2,
    net=net,
    num_epochs=10,
)

Epoch [1/10], Loss: 39.4171
Epoch [2/10], Loss: 39.4171
Epoch [3/10], Loss: 39.4171
Epoch [4/10], Loss: 39.4171
Epoch [5/10], Loss: 39.4171
Epoch [6/10], Loss: 39.4171
Epoch [7/10], Loss: 39.4171
Epoch [8/10], Loss: 39.4171
Epoch [9/10], Loss: 39.4171
Epoch [10/10], Loss: 39.4171
Finished Training


In [80]:
# Define the Adam optimizer
optimizer3 = optim.Adam(net.parameters(), lr = 0.001)

train_model(
    optimizer=optimizer3,
    net=net,
    num_epochs=10,
)

Epoch [1/10], Loss: 39.2059
Epoch [2/10], Loss: 39.2916
Epoch [3/10], Loss: 45.3930
Epoch [4/10], Loss: 60.5344
Epoch [5/10], Loss: 60.5344
Epoch [6/10], Loss: 60.5344
Epoch [7/10], Loss: 60.5344
Epoch [8/10], Loss: 60.5344
Epoch [9/10], Loss: 60.5344
Epoch [10/10], Loss: 60.5344
Finished Training


Model training has some randomness to it and each time you get slightly different results, but it's very likely that you saw RMSprop and Adam decreasing the loss more than a simple SGD even after just 10 training epochs.

# Model evaluation

You can now evaluate the model on test data. To do this, you will need to write the evaluation loop to iterate over the batches of test data, get the model's predictions for each batch, and calculate the accuracy score for it. Let's do it!

* Set up the evaluation metric as Accuracy for binary classification and assign it to acc.
* For each batch of test data, get the model's outputs and assign them to outputs.
* After the loop, compute the total test accuracy and assign it to test_accuracy.

In [81]:
# Set up binary accuracy metric
acc = Accuracy(task = "binary")

net.eval()
with torch.no_grad():
    for features, labels in dataloader_test:
        # Get predicted probabilities for test data batch
        outputs = net(features)
        preds = (outputs >= 0.5).float()
        acc(preds, labels.view(-1, 1))

# Compute total test accuracy
test_accuracy = acc.compute()
print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.37195122241973877
