In [1]:
# Import Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import math

In [2]:
class ReLU_full_grad(torch.autograd.Function):
    """ ReLU activation function that passes through the gradient irrespective of its input value. """

    @staticmethod
    def forward(ctx, input):
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.clone()


class TDistributionActivation(torch.autograd.Function):
    """Custom activation function based on the negative log of the density under a t-distribution."""

    @staticmethod
    def forward(ctx, input, nu=10):
        """
        Forward pass of the t-distribution-based activation function.
        Args:
            input: Input tensor.
            nu: Degrees of freedom for the t-distribution (default: 10).
        Returns:
            Tensor after applying the activation function.
        """
        ctx.nu = nu
        # precompute constants
        constant_term = (
            math.log(math.sqrt(nu * math.pi)) +
            torch.lgamma(torch.tensor(nu / 2.0)) -
            torch.lgamma(torch.tensor(nu + 1.0) / 2.0)
        )
        # compute activation
        activation = constant_term + ((nu + 1) / 2.0) * torch.log(1 + (input ** 2) / nu)
        ctx.save_for_backward(input)
        return activation

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass of the activation function.
        Args:
            grad_output: Gradient of the loss with respect to the output.
        Returns:
            Gradient of the loss with respect to the input.
        """
        input, = ctx.saved_tensors
        nu = ctx.nu

        # Compute the gradient
        grad_input = grad_output * ((nu + 1) * input) / (nu + input ** 2)
        return grad_input, None  # Return None for `nu` since it's not trainable


In [3]:
# Define the Backpropagation Model
class BPModel(nn.Module):
    def __init__(self, input_dim=3072, hidden_dim=1000, num_layers=3, activation="relu", num_classes=10):
        super(BPModel, self).__init__()

        # Dynamically select activation function
        if activation == "relu":
            self.act_fn = nn.ReLU()
        elif activation == "leaky_relu":
            self.act_fn = nn.LeakyReLU(negative_slope=0.01)
        elif activation == "sigmoid":
            self.act_fn = nn.Sigmoid()
        elif activation == "tanh":
            self.act_fn = nn.Tanh()
        elif activation == "relu_full_grad":
            self.act_fn = ReLU_full_grad()
        elif activation == "t_distribution_activation":
            self.act_fn = TDistributionActivation()
        else:
            raise ValueError(f"Unknown activation function: {activation}")

        # Define network architecture
        self.layers = nn.ModuleList([nn.Linear(input_dim, hidden_dim)])
        for _ in range(1, num_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten input for linear layers
        for layer in self.layers:
            if isinstance(self.act_fn, torch.autograd.Function):
                x = self.act_fn.apply(layer(x))
            else:
                x = self.act_fn(layer(x))
        return self.classifier(x)

In [4]:
# Training Function
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics
        total_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(train_loader), accuracy

In [5]:
# Validation Function
def validate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Metrics
            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(test_loader), accuracy




In [14]:
# Main Function to Train and Evaluate with Different Activations
def run_experiment(activations, num_epochs=10, batch_size=128, hidden_dim=1000, num_layers=3, learning_rate=0.01):
    results = []

    # CIFAR-10 Data
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    ])
    train_dataset = datasets.CIFAR10(root="./datasets", train=True, transform=transform, download=True)
    val_dataset = datasets.CIFAR10(root="./datasets", train=False, transform=transform, download=True)  # For validation
    test_dataset = datasets.CIFAR10(root="./datasets", train=False, transform=transform, download=True)  # For test
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for activation in activations:
        print(f"Running experiment with activation: {activation}")
        model = BPModel(hidden_dim=hidden_dim, num_layers=num_layers, activation=activation).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=3e-4)

        train_losses, val_losses, test_losses = [], [], []
        train_accuracies, val_accuracies, test_accuracies = [], [], []

        for epoch in range(1, num_epochs + 1):
            train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, device)
            train_losses.append(train_loss)
            train_accuracies.append(train_acc)
            
            val_loss, val_acc = validate_model(model, test_loader, criterion, device)
            val_losses.append(val_loss)
            val_accuracies.append(val_acc)
            

            print(f"Epoch {epoch}/{num_epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
                  f"val_Loss: {val_loss:.4f}, val_Acc: {val_acc:.4f}")
        
        
        
        test_loss, test_acc = validate_model(model, test_loader, criterion, device)
        test_losses.append(test_loss)
        test_accuracies.append(test_acc)
        print(f"validation_loss: {sum(val_losses)/len(val_losses):.4f}, {sum(val_accuracies)/len(val_accuracies):.4f}")
        print(f"Final Test Metrics for activation {activation} - Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

        
        
        results.append({
            "activation": activation,
            "train_losses": train_losses,
            "val_losses": val_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "val_accuracies": val_accuracies,
            "test_accuracies": test_accuracies
        })

    return results


In [15]:
# # Plot Results
# def plot_results(results):
#     for result in results:
#         activation = result["activation"]
# 
#         # Plot Loss
#         plt.figure()
#         plt.plot(result["train_losses"], label="Train Loss")
#         plt.plot(result["val_losses"], label="Val Loss")
#         plt.title(f"Loss vs Epochs ({activation})")
#         plt.xlabel("Epochs")
#         plt.ylabel("Loss")
#         plt.legend()
#         plt.show()
# 
#         # Plot Accuracy
#         plt.figure()
#         plt.plot(result["train_accuracies"], label="Train Accuracy")
#         plt.plot(result["val_accuracies"], label="Val Accuracy")
#         plt.title(f"Accuracy vs Epochs ({activation})")
#         plt.xlabel("Epochs")
#         plt.ylabel("Accuracy")
#         plt.legend()
#         plt.show()
import plotly.graph_objects as go

def plot_results(results):
    """
    Generate interactive plots for loss and accuracy using Plotly.
    
    Args:
    - results: List of dictionaries containing results for each experiment.
               Each dictionary should have "activation", "train_losses", "val_losses",
               "train_accuracies", and "val_accuracies" keys.
    """
    # Loss Plot
    loss_fig = go.Figure()
    for result in results:
        activation = result["activation"]
        
        loss_fig.add_trace(
            go.Scatter(
                x=list(range(len(result["train_losses"]))),
                y=result["train_losses"],
                mode="lines",
                name=f"{activation} - Train Loss"
            )
        )
        loss_fig.add_trace(
            go.Scatter(
                x=list(range(len(result["val_losses"]))),
                y=result["val_losses"],
                mode="lines",
                name=f"{activation} - Val Loss"
            )
        )
    
    loss_fig.update_layout(
        title="Loss vs Epochs",
        xaxis_title="Epochs",
        yaxis_title="Loss",
        legend_title="Legend",
        template="plotly_dark"  # Optional: Use a dark theme for aesthetics
    )
    loss_fig.show()

    # Accuracy Plot
    accuracy_fig = go.Figure()
    for result in results:
        activation = result["activation"]
        
        accuracy_fig.add_trace(
            go.Scatter(
                x=list(range(len(result["train_accuracies"]))),
                y=result["train_accuracies"],
                mode="lines+markers",
                name=f"{activation} - Train Accuracy"
            )
        )
        accuracy_fig.add_trace(
            go.Scatter(
                x=list(range(len(result["val_accuracies"]))),
                y=result["val_accuracies"],
                mode="lines+markers",
                name=f"{activation} - Val Accuracy"
            )
        )
    
    accuracy_fig.update_layout(
        title="Accuracy vs Epochs",
        xaxis_title="Epochs",
        yaxis_title="Accuracy",
        legend_title="Legend",
        template="plotly_dark"  # Optional: Use a clean white theme
    )
    accuracy_fig.show()


In [16]:
# Run Experiments
activations_to_test = ["relu", "leaky_relu", "sigmoid", "tanh","relu_full_grad", "t_distribution_activation"]
experiment_results = run_experiment(activations=activations_to_test, num_epochs=20)


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Running experiment with activation: relu
Epoch 1/20 - Train Loss: 1.7467, Train Acc: 0.3803, val_Loss: 1.5189, val_Acc: 0.4627
Epoch 2/20 - Train Loss: 1.4491, Train Acc: 0.4877, val_Loss: 1.4096, val_Acc: 0.5052
Epoch 3/20 - Train Loss: 1.3171, Train Acc: 0.5382, val_Loss: 1.3487, val_Acc: 0.5225
Epoch 4/20 - Train Loss: 1.2205, Train Acc: 0.5714, val_Loss: 1.3441, val_Acc: 0.5228
Epoch 5/20 - Train Loss: 1.1260, Train Acc: 0.6047, val_Loss: 1.3009, val_Acc: 0.5408
Epoch 6/20 - Train Loss: 1.0495, Train Acc: 0.6294, val_Loss: 1.3038, val_Acc: 0.5437
Epoch 7/20 - Train Loss: 0.9664, Train Acc: 0.6581, val_Loss: 1.3395, val_Acc: 0.5395
Epoch 8/20 - Train Loss: 0.8923, Train Acc: 0.6851, val_Loss: 1.3944, val_Acc: 0.5421
Epoch 9/20 - Train Loss: 0.8194, Train Acc: 0.7116, val_Loss: 1.3243, val_Acc: 0.5556
Epoch 10/20 - Train Loss: 0.7373, Train Acc: 0.7402, val_Loss: 1.4025, 

  self.act_fn = ReLU_full_grad()


Epoch 1/20 - Train Loss: 1.8228, Train Acc: 0.3540, val_Loss: 1.7186, val_Acc: 0.3988
Epoch 2/20 - Train Loss: 1.7168, Train Acc: 0.3975, val_Loss: 1.6829, val_Acc: 0.4077
Epoch 3/20 - Train Loss: 1.7038, Train Acc: 0.4049, val_Loss: 1.7242, val_Acc: 0.4038
Epoch 4/20 - Train Loss: nan, Train Acc: 0.2363, val_Loss: nan, val_Acc: 0.1000
Epoch 5/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 6/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 7/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 8/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 9/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 10/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 11/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 12/20 - Train Loss: nan, Train Acc: 0.1000, val_Loss: nan, val_Acc: 0.1000
Epoch 13/20 - Train

  self.act_fn = TDistributionActivation()


Epoch 1/20 - Train Loss: 2.3644, Train Acc: 0.1866, val_Loss: 2.4759, val_Acc: 0.2411
Epoch 2/20 - Train Loss: 2.0202, Train Acc: 0.3178, val_Loss: 2.2369, val_Acc: 0.3190
Epoch 3/20 - Train Loss: 1.8375, Train Acc: 0.3887, val_Loss: 2.0409, val_Acc: 0.3252
Epoch 4/20 - Train Loss: 1.7217, Train Acc: 0.4332, val_Loss: 1.8130, val_Acc: 0.4195
Epoch 5/20 - Train Loss: 1.5958, Train Acc: 0.4747, val_Loss: 2.0049, val_Acc: 0.4698
Epoch 6/20 - Train Loss: 1.6102, Train Acc: 0.4964, val_Loss: 1.8420, val_Acc: 0.4296
Epoch 7/20 - Train Loss: 1.4602, Train Acc: 0.5251, val_Loss: 1.7307, val_Acc: 0.4883
Epoch 8/20 - Train Loss: 1.5148, Train Acc: 0.5294, val_Loss: 1.8387, val_Acc: 0.4768
Epoch 9/20 - Train Loss: 25406.0761, Train Acc: 0.4040, val_Loss: 167326.5376, val_Acc: 0.1000
Epoch 10/20 - Train Loss: 174741.3698, Train Acc: 0.1001, val_Loss: 151685.5094, val_Acc: 0.1000
Epoch 11/20 - Train Loss: 213352.1545, Train Acc: 0.0994, val_Loss: 196386.1637, val_Acc: 0.1000
Epoch 12/20 - Train Los

In [10]:
import pandas as pd
experiment_results = pd.DataFrame(experiment_results)
experiment_results.head()

Unnamed: 0,activation,train_losses,val_losses,test_losses,train_accuracies,val_accuracies,test_accuracies
0,relu,"[1.754270024311817, 1.449570551857619, 1.32029...","[1.5562874757790868, 1.4024028566819202, 1.373...",[],"[0.37574, 0.48812, 0.53532, 0.57078, 0.60292, ...","[0.46, 0.5023, 0.5164, 0.5315, 0.5323, 0.5324,...",[]
1,leaky_relu,"[1.7521490761081275, 1.4504411052864836, 1.318...","[1.5506830562519123, 1.4220768032194693, 1.356...",[],"[0.37638, 0.48856, 0.5329, 0.57092, 0.59952, 0...","[0.451, 0.4986, 0.5242, 0.531, 0.5315, 0.5435,...",[]
2,sigmoid,"[2.3350579305682952, 2.1570179450237537, 2.045...","[2.2905006710487075, 2.086325692225106, 1.9975...",[],"[0.10582, 0.16752, 0.21536, 0.2594, 0.2954, 0....","[0.1, 0.181, 0.2495, 0.2897, 0.2846, 0.3188, 0...",[]
3,tanh,"[1.7889854737254969, 1.6404871141819088, 1.555...","[1.6941428561753864, 1.6255174347116976, 1.590...",[],"[0.37308, 0.43228, 0.46314, 0.48692, 0.51022, ...","[0.4091, 0.4368, 0.4523, 0.4638, 0.4703, 0.478...",[]
4,relu_full_grad,"[1.8257401355392182, 1.7182722997177593, 1.708...","[1.7612814661822742, 1.6772233806078947, 1.734...",[],"[0.3526, 0.39994, 0.40214, 0.21452, 0.1, 0.1, ...","[0.3828, 0.4135, 0.397, 0.1, 0.1, 0.1, 0.1, 0....",[]


In [21]:
plot_results(experiment_results)