In [13]:
#importing libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
#loading dataset
df = pd.read_csv('abalone.data.csv')

In [15]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [16]:
# Performing one-hot encoding for the 'Sex' column
df = pd.get_dummies(df, columns=['Sex'],dtype=int)

In [17]:
df

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,1,0,0


In [18]:
# Standardize continuous features
scaler = StandardScaler()
continuous_columns = ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']
df[continuous_columns] = scaler.fit_transform(df[continuous_columns])

In [19]:
#defining dependent and independent variables
X = df.drop("Rings", axis=1).values
y = df["Rings"].values

In [20]:
# splitting training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
#converting to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, requires_grad = True)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32,requires_grad = True)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32,requires_grad = True)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32,requires_grad = True)

In [38]:
# first nn model
class BPN(nn.Module):
    def __init__(self):
        super(BPN, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [39]:
# defining SGD optimiser and MSE criterion
model = BPN()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [40]:
# Function to train model
def train_model(model, criterion, optimizer, X_train, y_train, epochs=100):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs=model(X_train)
        loss= criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

#training model
train_model(model, criterion, optimizer, X_train_tensor, y_train_tensor)

Epoch [1/100], Loss: 111.34432983398438
Epoch [11/100], Loss: 13.832454681396484
Epoch [21/100], Loss: 6.908550262451172
Epoch [31/100], Loss: 5.8238301277160645
Epoch [41/100], Loss: 5.397177696228027
Epoch [51/100], Loss: 5.134884357452393
Epoch [61/100], Loss: 4.965994358062744
Epoch [71/100], Loss: 4.854366302490234
Epoch [81/100], Loss: 4.778977870941162
Epoch [91/100], Loss: 4.725654125213623


In [25]:
# evaluation of model
def evaluate(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        mse = nn.MSELoss()
        loss = mse(outputs, y_test)
        return loss.item()

mse = evaluate(model, X_test_tensor, y_test_tensor)
print("Mean Squared Error (MSE) on test set:", mse)

Mean Squared Error (MSE) on test set: 16.37147331237793


  return F.mse_loss(input, target, reduction=self.reduction)


In [26]:
# trying different hyperparameters
learning_rates = [0.01, 0.05, 0.07]
hidden_nodes = [32, 64, 128]
hidden_layers = [1, 2, 3]

results = []

for lr in learning_rates:
        for num_nodes in hidden_nodes:
            for num_layers in hidden_layers:
                model = nn.Sequential(
                    nn.Linear(10, num_nodes),
                    nn.ReLU(),
                )
                for _ in range(num_layers - 1):
                    model.add_module(f"hidden_{_+1}", nn.Linear(num_nodes, num_nodes))
                    model.add_module(f"activation_{_+1}", nn.ReLU())
                model.add_module("output", nn.Linear(num_nodes, 1))


                optimizer = optim.SGD(model.parameters(), lr=lr)
                criterion = nn.MSELoss()

                train_losses = []
                for epoch in range(100):
                    optimizer.zero_grad()
                    outputs = model(X_train_tensor)
                    loss = criterion(outputs.squeeze(), y_train_tensor)
                    loss.backward()
                    optimizer.step()
                    train_losses.append(loss.item())

                model.eval()
                with torch.no_grad():
                    outputs = model(X_test_tensor)
                    mse_test_loss = criterion(outputs.squeeze(), y_test_tensor).item()

                results.append({
                    'Learning Rate': lr,
                    'Hidden Nodes': num_nodes,
                    'Hidden Layers': num_layers,
                    'MSE Test Loss': mse_test_loss,
                    'Train Losses': train_losses
                })


results_df = pd.DataFrame(results)
print(results_df)

    Learning Rate  Hidden Nodes  Hidden Layers  MSE Test Loss  \
0            0.01            32              1       5.077224   
1            0.01            32              2       4.929773   
2            0.01            32              3       4.773942   
3            0.01            64              1       5.115112   
4            0.01            64              2       4.965305   
5            0.01            64              3       5.073771   
6            0.01           128              1       4.946971   
7            0.01           128              2       5.039757   
8            0.01           128              3       5.714552   
9            0.05            32              1       5.198081   
10           0.05            32              2       5.294216   
11           0.05            32              3       5.669549   
12           0.05            64              1       5.465923   
13           0.05            64              2       5.215132   
14           0.05        

We observe that the loss using learning rate 0.01 is the lowest. Loss is also low for 128 hidden nodes and 2 hidden layers.
If the learning rate is too low, the optimization process may be slow, and the model may take a long time to converge to the optimal solution. Increasing the learning rate can speed up the convergence process. Decreasing the learning rate can make the optimization process more stable and prevent overshooting or divergence.
Adding more hidden layers allows the neural network to learn increasingly complex patterns and relationships in the data. This can potentially improve the model's ability to capture intricate features and achieve higher performance.Decreasing the number of hidden layers may simplify the model and reduce its capacity to learn complex patterns. This can make the model more prone to underfitting, where it fails to capture important patterns in the data.

In [47]:
# Adagrad
model = BPN()
criterion = nn.MSELoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

In [50]:
#Adagrad
def train_model(model, criterion, optimizer, X_train, y_train, epochs=100):
  for epoch in range(epochs):
        # Forward pass
        model.train()
        optimizer.zero_grad()
        outputs=model(X_train)
        loss= criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

train_model(model, criterion, optimizer, X_train_tensor, y_train_tensor)

Epoch [1/100], Loss: 5.099472999572754
Epoch [11/100], Loss: 5.053106784820557
Epoch [21/100], Loss: 5.012213230133057
Epoch [31/100], Loss: 4.9745259284973145
Epoch [41/100], Loss: 4.939453125
Epoch [51/100], Loss: 4.907494068145752
Epoch [61/100], Loss: 4.878143787384033
Epoch [71/100], Loss: 4.851113796234131
Epoch [81/100], Loss: 4.825852870941162
Epoch [91/100], Loss: 4.802338123321533


In [51]:
# COmparing SGD and AdaGrad optimizers' performance
learning_rates = [0.01, 0.05]
hidden_nodes = [32, 64,]
hidden_layers = [2,5,10]

results_sgd = []
results_adagrad = []

for lr in learning_rates:
        for num_nodes in hidden_nodes:
            for num_layers in hidden_layers:
                model = nn.Sequential(
                    nn.Linear(10, num_nodes),
                    nn.ReLU()
                )
                for _ in range(num_layers - 1):
                    model.add_module(f"hidden_{_+1}", nn.Linear(num_nodes, num_nodes))
                    model.add_module(f"activation_{_+1}", nn.ReLU())
                model.add_module("output", nn.Linear(num_nodes, 1))

                # SGD
                optimizer_sgd = optim.SGD(model.parameters(), lr=lr)
                criterion = nn.MSELoss()

                train_losses_sgd = []
                for epoch in range(100):
                    model.train()
                    optimizer_sgd.zero_grad()
                    outputs = model(X_train_tensor)
                    loss = criterion(outputs.squeeze(), y_train_tensor)
                    loss.backward()
                    optimizer_sgd.step()
                    train_losses_sgd.append(loss.item())

                model.eval()
                with torch.no_grad():
                    outputs = model(X_test_tensor)
                    mse_test_loss_sgd = criterion(outputs.squeeze(), y_test_tensor).item()

                results_sgd.append({
                    'Learning Rate': lr,
                    'Hidden Nodes': num_nodes,
                    'Hidden Layers': num_layers,
                    'MSE Test Loss': mse_test_loss_sgd,
                    'Train Losses': train_losses_sgd
                })

                # Adagrad
                optimizer_adagrad = optim.Adagrad(model.parameters(), lr=lr)

                train_losses_adagrad = []
                for epoch in range(100):
                    model.train()
                    optimizer_adagrad.zero_grad()
                    outputs = model(X_train_tensor)
                    loss = criterion(outputs.squeeze(), y_train_tensor)
                    loss.backward()
                    optimizer_adagrad.step()
                    train_losses_adagrad.append(loss.item())


                model.eval()
                with torch.no_grad():
                    outputs = model(X_test_tensor)
                    mse_test_loss_adagrad = criterion(outputs.squeeze(), y_test_tensor).item()


                results_adagrad.append({
                    'Learning Rate': lr,
                    'Hidden Nodes': num_nodes,
                    'Hidden Layers': num_layers,
                    'MSE Test Loss': mse_test_loss_adagrad,
                    'Train Losses': train_losses_adagrad
                })


results_df_sgd = pd.DataFrame(results_sgd)
print("Results for SGD Optimizer:")
print(results_df_sgd)

results_df_adagrad = pd.DataFrame(results_adagrad)
print("Results for Adagrad Optimizer:")
print(results_df_adagrad)


Results for SGD Optimizer:
    Learning Rate  Hidden Nodes  Hidden Layers  MSE Test Loss  \
0            0.01            32              2       5.089727   
1            0.01            32              5       6.010038   
2            0.01            32             10      10.248506   
3            0.01            64              2       4.859236   
4            0.01            64              5       5.732605   
5            0.01            64             10      10.803572   
6            0.05            32              2       6.278651   
7            0.05            32              5       7.368018   
8            0.05            32             10      10.828102   
9            0.05            64              2            NaN   
10           0.05            64              5       6.014714   
11           0.05            64             10      10.827590   

                                         Train Losses  
0   [110.48181915283203, 105.25165557861328, 99.92...  
1   [104.083320

Adagrad is an adaptive learning rate optimization algorithm that adjusts the learning rates of each parameter based on the historical gradients. It has the advantage of automatically adapting the learning rates to different parameters, which can be beneficial for training deep neural networks. This is why we can observe the loss by using adagrad is lower than that of SGD.

In [30]:
#defining model with sigmoid activation function and 10 hidden layers
class AbaloneModelSigmoid(nn.Module):
    def __init__(self):
        super(AbaloneModelSigmoid, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 32)
        self.fc4 = nn.Linear(32, 32)
        self.fc5 = nn.Linear(32, 32)
        self.fc6 = nn.Linear(32, 32)
        self.fc7 = nn.Linear(32, 32)
        self.fc8 = nn.Linear(32, 32)
        self.fc9 = nn.Linear(32, 32)
        self.fc10 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.sigmoid(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        x = self.sigmoid(self.fc5(x))
        x = self.sigmoid(self.fc6(x))
        x = self.sigmoid(self.fc7(x))
        x = self.sigmoid(self.fc8(x))
        x = self.sigmoid(self.fc9(x))
        x = self.fc10(x)
        return x

model_sigmoid = AbaloneModelSigmoid()
optimizer_sigmoid = optim.SGD(model_sigmoid.parameters(), lr=0.01)

train_model(model_sigmoid, criterion, optimizer_sigmoid, X_train_tensor, y_train_tensor)


Epoch [1/100], Loss: 106.46713256835938
Epoch [11/100], Loss: 10.636360168457031
Epoch [21/100], Loss: 10.284177780151367
Epoch [31/100], Loss: 10.283974647521973
Epoch [41/100], Loss: 10.28397274017334
Epoch [51/100], Loss: 10.28397274017334
Epoch [61/100], Loss: 10.283971786499023
Epoch [71/100], Loss: 10.283971786499023
Epoch [81/100], Loss: 10.283971786499023
Epoch [91/100], Loss: 10.283971786499023


The loss observed by using sigmoid activation function is higher than that of ReLu. ReLU can overcome both of the disadvantages found in sigmoid functions. It avoids the vanishing gradient problem since it has a constant gradient of 1 for all the positive inputs. The flow of gradients backward during backpropagation becomes easier, and the training becomes more effective.
The vanishing gradient problem occurs when gradients become extremely small as they propagate backward through the layers of a deep neural network during training. Sigmoid activation functions, particularly when used in deeper networks, are prone to this issue because they saturate for large positive or negative inputs, resulting in gradients close to zero. Using sigmoid activation functions in deeper networks can exacerbate the vanishing gradient problem, leading to slower convergence and potentially poorer performance. We observe the vanishing gradient problem here since there is little to no change in the loss with each iteration.