## Importing the Dataset

In [181]:
from torchvision import datasets, transforms
import numpy as np

#
transform = transforms.ToTensor()
#
train_dataset = datasets.FashionMNIST (root='data', train=True, transform=transform, download=False)
#
test_dataset = datasets.FashionMNIST (root='data', train=False, transform=transform, download=False)



Checking properties of the datset

In [182]:
print("Training set size:", len(train_dataset))
print("Test set size:", len(test_dataset))
print("Image size:", train_dataset[0][0].shape)  # shape of the first image
print("Number of classes:", len(train_dataset.classes))
print("Class labels:", train_dataset.classes)

Training set size: 60000
Test set size: 10000
Image size: torch.Size([1, 28, 28])
Number of classes: 10
Class labels: ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


Trying with pytorch

In [183]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
from torch.utils.data import DataLoader, random_split
# import torch.nn.init as init

# Split train dataset into training and validation sets
# DataLoader for train, validation, and test sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# # Define the model (your previously defined FNN with 3 layers, batch normalization, ReLU, etc.)
# class FNN(nn.Module):
#     def __init__(self):
#         super(FNN, self).__init__()
#         self.layer1 = nn.Sequential(
#             nn.Linear(784, 256),
#             nn.BatchNorm1d(256),
#             nn.ReLU(),
#             nn.Dropout(0.2)
#         )
#         self.layer2 = nn.Sequential(
#             nn.Linear(256, 128),
#             nn.BatchNorm1d(128),
#             nn.ReLU(),
#             nn.Dropout(0.2)
#         )
#         self.layer3 = nn.Sequential(
#             nn.Linear(128, 10),
#             nn.Softmax(dim=1)
#         )

#         # Apply Xavier initialization
#         init.xavier_uniform_(self.layer1[0].weight)
#         init.xavier_uniform_(self.layer2[0].weight)
#         init.xavier_uniform_(self.layer3[0].weight)
        
#         # Apply bias initialization (optional but can improve training)
#         init.zeros_(self.layer1[0].bias)
#         init.zeros_(self.layer2[0].bias)
#         init.zeros_(self.layer3[0].bias)

#     def forward(self, x):
#         x = x.view(-1, 784)  # Flatten the images
#         x = self.layer1(x)
#         x = self.layer2(x)
#         x = self.layer3(x)
#         return x

# # Initialize model, loss function, and optimizer
# model = FNN()
# # Hyperparameters
# num_epochs = 10
# learning_rate = 0.001

# # Define loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# # Training loop
# for epoch in range(num_epochs):
#     model.train()  # Set the model to training mode
#     running_loss = 0.0
    
#     for images, labels in train_loader:
#         # Zero the parameter gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         outputs = model.forward(images)
#         loss = criterion(outputs, labels)
        
#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()
        
#         # Accumulate running loss
#         running_loss += loss.item()
    
#     # Print average loss for the epoch
#     average_loss = running_loss / len(train_loader)
#     print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")

# # Evaluate on test set
# model.eval()  # Switch to evaluation mode
# correct = 0
# total = 0
# test_loss = 0.0

# with torch.no_grad():  # Disable gradient computation
#     for images, labels in test_loader:
#         outputs = model.forward(images)
#         loss = criterion(outputs, labels)
#         test_loss += loss.item()
        
#         # Get predictions
#         _, predicted = torch.max(outputs, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

# # Calculate and print the test accuracy and loss
# average_test_loss = test_loss / len(test_loader)
# test_accuracy = correct / total
# print(f"Test Loss: {average_test_loss:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")

Implementation of dense layer

In [184]:
class DenseLayer:
    def __init__(self, nX, nY):
        self.W = np.random.randn(nY, nX)
        self.B = np.zeros(nY)

    def forward (self, X_in):
        self.X = X_in
        Y = np.matmul (self.X, np.transpose(self.W)) + self.B
        if np.isnan(Y).any():  # Check for NaNs
            print("NaN detected in DenseLayer forward pass")
        return Y
    
    def backward (self, G_y, alpha):
        G_x = np.matmul (G_y, self.W)
        G_w = np.matmul (np.transpose(G_y), self.X)
        G_b = np.sum (G_y, axis=0)

        # updating W and B
        self.W = self.W - alpha * G_w
        self.B = self.B = alpha * G_b

        if np.isnan(self.W).any() or np.isnan(self.B).any():  # Check for NaNs in parameters
            print("NaN detected in DenseLayer weights/biases after update")
        
        # passing derivative to prev layer
        return G_x
   


Implementation of ReLU Activation function

In [185]:
class ReLU:
    def forward (self, X_in):
        self.X = X_in
        return np.maximum (self.X, 0)
    
    def backward (self, G_y, alpha):
        G_x = G_y * (self.X >= 0)
        return G_x

Implementation of Softmax loss function

In [186]:
class Softmax:
    def forward (self, X_in):
        self.X = X_in
        shiftX = np.max(self.X, axis=1, keepdims=True)
        numerators = np.exp(self.X- shiftX)
        denominator = np.sum(numerators, axis=1, keepdims=True)
        self.output = numerators / denominator 
        return self.output

    def backward (self, Y, alpha):
        G_x = Y
        return G_x


Implementation of Dropout Layer

In [187]:
class Dropout:
    def __init__ (self, probability=0.35):
        self.probability = probability
    def forward (self, X, train=True):
        if train:
            prob_arr = np.random.rand(X.shape[0], X.shape[1])
            self.mask = (prob_arr >= self.probability)
            X_sel_scaled = X * self.mask / (1 - self.probability)
            return X_sel_scaled
        return X #when testing
    
    def backward (self, G_y, alpha):
        G_x = G_y * self.mask / (1 - self.probability)
        return G_x
        

Sequential layer stacking mechanism

In [188]:
class Sequential:
    def __init__ (self, *layers):
        self.layers = layers

    def forward (self, X):
        for layer in self.layers:
            X = layer.forward(X)
        return X
    
    def backward (self, G, alpha):
        for layer in reversed(self.layers):
            G = layer.backward (G, alpha)
        return G


FNN Class

In [189]:
class FNN:
    def __init__(self):
        self.model = Sequential(
            DenseLayer(784, 128),
            # MyBatchNorm(128),
            ReLU(),
            Dropout(0.5),
            DenseLayer(128, 64),
            # MyBatchNorm(64),
            ReLU(),
            Dropout(0.5),
            DenseLayer(64, 10),
            Softmax()
        )

    def forward(self, X):
        return self.model.forward(X)

    def backward(self, grad_output, learning_rate):
        return self.model.backward(grad_output, learning_rate)

In [190]:
# Hyperparameters
learning_rate = 0.01
epochs = 10

# Initialize the model and loss function
model = FNN()
loss_fn = lambda pred, true: -np.mean(np.sum(true * np.log(pred), axis=1))  # Cross-entropy loss

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.view(X_batch.size(0), -1).numpy()  # Flatten and convert to NumPy array
        y_batch_one_hot = np.eye(10)[y_batch.numpy()]  # One-hot encoding of labels

        # Forward pass
        preds = model.forward(X_batch)
        
        # Compute loss
        loss = loss_fn(preds, y_batch_one_hot)
        epoch_loss += loss
        
        # Backward pass
        grad_output = preds - y_batch_one_hot  # Gradient of cross-entropy loss wrt softmax output
        model.backward(grad_output, learning_rate)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

# Testing loop
correct = 0
total = 0
for X_batch, y_batch in test_loader:
    X_batch = X_batch.view(X_batch.size(0), -1).numpy()  # Flatten and convert to NumPy array
    preds = model.forward(X_batch)
    predictions = np.argmax(preds, axis=1)

    total += y_batch.size(0)
    correct += (predictions == y_batch.numpy()).sum()

# Print the test accuracy
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

  loss_fn = lambda pred, true: -np.mean(np.sum(true * np.log(pred), axis=1))  # Cross-entropy loss
  loss_fn = lambda pred, true: -np.mean(np.sum(true * np.log(pred), axis=1))  # Cross-entropy loss


  G_x = np.matmul (G_y, self.W)
  G_x = G_y * self.mask / (1 - self.probability)


NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer weights/biases after update
NaN detected in DenseLayer forward pass
NaN detected in DenseLayer forward pass
NaN detect