In [8]:
import torch
import random

# Fix the seeds of both RNGs used in this notebook (PyTorch and the stdlib
# `random` module) so repeated runs start from the same state.
torch.manual_seed(0)
random.seed(0)

# Turn on autograd anomaly detection for the rest of the session.
# NOTE(review): this is a debugging aid and, per the PyTorch docs, slows
# execution down; consider disabling it when timing or comparing runs.
# Being the last expression in the cell, the returned object is echoed
# as the cell's output.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f422c9f3940>

In [9]:
import os
print("Available cores:", os.cpu_count())

# Report the CUDA devices visible to PyTorch (this notebook does not use TensorFlow)
if not torch.cuda.is_available():
    print("No GPUs available.")
else:
    gpu_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(idx) for idx in range(gpu_count)]
    print("Available GPUs:", gpu_names)
    print("Num GPUs Available: ", gpu_count)

Available cores: 20
Available GPUs: ['NVIDIA GeForce RTX 4070 Laptop GPU']
Num GPUs Available:  1


In [10]:
# Load dataset
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset

# Settings
dataset = "MNIST"
# dataset = "FashionMNIST"
# dataset = "CIFAR10"

# One entry per supported dataset:
#   (torchvision dataset class, per-channel mean, per-channel std)
_dataset_specs = {
    "MNIST": (datasets.MNIST, (0.1307,), (0.3081,)),
    "FashionMNIST": (datasets.FashionMNIST, (0.5,), (0.5,)),
    "CIFAR10": (datasets.CIFAR10, (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
}

# Reject unknown names before touching the disk or the network
if dataset not in _dataset_specs:
    raise ValueError(f"Unsupported dataset: {dataset}")

_dataset_cls, _norm_mean, _norm_std = _dataset_specs[dataset]

# Same preprocessing for both splits: to tensor, then per-channel normalisation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])
train_dataset = _dataset_cls(root='../data', train=True, transform=transform, download=True)
test_dataset = _dataset_cls(root='../data', train=False, transform=transform, download=True)

# Read the input geometry off the first training sample and the label count
# off the dataset's class list, so later cells need no per-dataset constants.
_smp_data, _ = train_dataset[0]
num_channels, height, width = _smp_data.shape
num_classes = len(train_dataset.classes)

for _label, _value in (
    ("Dataset:", dataset),
    ("Number of Channels:", num_channels),
    ("Width:", width),
    ("Height:", height),
    ("Number of Classes:", num_classes),
):
    print(_label, _value)

Dataset: MNIST
Number of Channels: 1
Width: 28
Height: 28
Number of Classes: 10


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class DefaultLinear(nn.Module):
    """Fully connected layer ``y = x @ W.T + b`` trained with ordinary autograd.

    This is the reference layer of the notebook: the forward pass is
    ``F.linear`` and the gradients are whatever autograd derives for it.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        bias: When False the layer has no additive term and ``self.bias``
            is registered as None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.empty allocates uninitialised storage (the legacy
        # torch.Tensor(...) constructor is discouraged); the values are
        # overwritten by reset_parameters() right below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias from U(-1/sqrt(fan_in), +1/sqrt(fan_in))."""
        # For a 2-D (out_features, in_features) weight, fan_in is simply the
        # number of columns, so no private torch helper is needed.
        fan_in = self.weight.size(1)
        # A layer without inputs would otherwise divide by zero here.
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Return ``F.linear(input, weight, bias)``."""
        return F.linear(input, self.weight, self.bias)


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class USFGradFunction(torch.autograd.Function):
    """Linear op with an exact forward pass and a sign-based input gradient.

    Forward computes ``F.linear(input, weight, bias)``.  Backward differs
    from the true gradient only for ``input``: the weight matrix is replaced
    by ``sign(weight) / sqrt(in_features)``.  The gradients for ``weight``
    and ``bias`` are the exact ones.

    ``input`` may have any number of leading dimensions (as ``F.linear``
    allows); they are folded into a single row axis for the matrix products.
    """

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Save the operands for backward and return the linear output."""
        ctx.save_for_backward(input, weight, bias)
        return F.linear(input, weight, bias)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)``; entries that are
        not required stay None."""
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        out_features, in_features = weight.shape
        # Tensor.mm is strictly 2-D while the forward accepts extra leading
        # dims, so flatten everything but the feature axis first.
        grad_rows = grad_output.reshape(-1, out_features)

        if ctx.needs_input_grad[0]:
            # Exact gradient would be grad_rows.mm(weight); only the sign of
            # each weight is used, rescaled by 1/sqrt(in_features).
            grad_input = grad_rows.mm(weight.sign()) / math.sqrt(in_features)
            grad_input = grad_input.reshape(input.shape)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_rows.t().mm(input.reshape(-1, in_features))
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_rows.sum(0)

        return grad_input, grad_weight, grad_bias

class USFLinear(nn.Module):
    """Fully connected layer whose backward pass is ``USFGradFunction``.

    Parameters and initialisation mirror a plain linear layer; only the
    gradient computation differs, because ``forward`` routes through
    ``USFGradFunction.apply``.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        bias: When False the layer has no additive term and ``self.bias``
            is registered as None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.empty allocates uninitialised storage (the legacy
        # torch.Tensor(...) constructor is discouraged); the values are
        # overwritten by reset_parameters() right below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias from U(-1/sqrt(fan_in), +1/sqrt(fan_in))."""
        # For a 2-D (out_features, in_features) weight, fan_in is simply the
        # number of columns, so no private torch helper is needed.
        fan_in = self.weight.size(1)
        # A layer without inputs would otherwise divide by zero here.
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Apply the layer through the custom autograd function."""
        return USFGradFunction.apply(input, self.weight, self.bias)


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SBPGradFunction(torch.autograd.Function):
    """Linear op with an exact forward pass and sign-based gradients.

    Forward computes ``F.linear(input, weight, bias)``.  In backward both
    matrix products use only the sign of the saved operand, rescaled by
    ``1 / sqrt(in_features)``:

    * ``grad_input  = grad_output @ sign(weight) / sqrt(in_features)``
    * ``grad_weight = grad_output.T @ sign(input) / sqrt(in_features)``

    The bias gradient is the exact one.  ``input`` may have any number of
    leading dimensions (as ``F.linear`` allows); they are folded into a
    single row axis for the matrix products.
    """

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Save the operands for backward and return the linear output."""
        ctx.save_for_backward(input, weight, bias)
        return F.linear(input, weight, bias)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)``; entries that are
        not required stay None."""
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        out_features, in_features = weight.shape
        # Tensor.mm is strictly 2-D while the forward accepts extra leading
        # dims, so flatten everything but the feature axis first.
        grad_rows = grad_output.reshape(-1, out_features)
        # The feature count is taken from the weight: input.size(1) is only
        # the feature axis when the input happens to be 2-D.
        scale = math.sqrt(in_features)

        if ctx.needs_input_grad[0]:
            # Exact gradient would be grad_rows.mm(weight)
            grad_input = (grad_rows.mm(weight.sign()) / scale).reshape(input.shape)
        if ctx.needs_input_grad[1]:
            # Exact gradient would be grad_rows.t().mm(input_rows)
            input_rows = input.reshape(-1, in_features)
            grad_weight = grad_rows.t().mm(input_rows.sign()) / scale
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_rows.sum(0)

        return grad_input, grad_weight, grad_bias

class SBPLinear(nn.Module):
    """Fully connected layer whose backward pass is ``SBPGradFunction``.

    Parameters and initialisation mirror a plain linear layer; only the
    gradient computation differs, because ``forward`` routes through
    ``SBPGradFunction.apply``.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        bias: When False the layer has no additive term and ``self.bias``
            is registered as None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.empty allocates uninitialised storage (the legacy
        # torch.Tensor(...) constructor is discouraged); the values are
        # overwritten by reset_parameters() right below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias from U(-1/sqrt(fan_in), +1/sqrt(fan_in))."""
        # For a 2-D (out_features, in_features) weight, fan_in is simply the
        # number of columns, so no private torch helper is needed.
        fan_in = self.weight.size(1)
        # A layer without inputs would otherwise divide by zero here.
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Apply the layer through the custom autograd function."""
        return SBPGradFunction.apply(input, self.weight, self.bias)


In [17]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

class Network(nn.Module):
    """Three-layer MLP classifier built from a pluggable linear layer type.

    The input is flattened per sample, passed through two ReLU hidden layers
    and a final layer that returns raw logits (no softmax).

    Args:
        linear_layer: Callable ``(in_features, out_features) -> nn.Module``
            used to create all three layers.
        in_features: Flattened size of one input sample.  When None it is
            taken from the notebook-level ``num_channels * height * width``.
        out_features: Number of output logits.  When None it is taken from
            the notebook-level ``num_classes``.
        hidden_sizes: Widths of the two hidden layers, in order.
    """

    def __init__(self, linear_layer, in_features=None, out_features=None,
                 hidden_sizes=(512, 256)):
        super().__init__()
        # Fall back to the dataset-derived globals only when the caller did
        # not give explicit sizes, so Network(layer_cls) keeps working.
        if in_features is None:
            in_features = num_channels * height * width
        if out_features is None:
            out_features = num_classes
        hidden1, hidden2 = hidden_sizes
        self.layer1 = linear_layer(in_features, hidden1)
        self.layer2 = linear_layer(hidden1, hidden2)
        self.layer3 = linear_layer(hidden2, out_features)

    def forward(self, x):
        """Return logits of shape ``(x.size(0), out_features)``."""
        # reshape (unlike view) also accepts non-contiguous batches
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

def train_and_evaluate(model, train_loader, test_loader, device=None,
                       num_epochs=3, lr=0.01, momentum=0.9):
    """Train ``model`` with SGD and print test metrics after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(data, target)`` batches used for training.
        test_loader: Iterable of ``(data, target)`` batches used for
            evaluation; must expose ``.dataset`` so its length can be used
            to average the metrics.
        device: Device the model and every batch are moved to.  None leaves
            them where they are.
        num_epochs: Number of passes over ``train_loader``.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        None.  The model is updated in place; one summary line per epoch is
        printed with the mean cross-entropy loss and accuracy on the test set.
    """
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    for epoch in range(num_epochs):
        model.train()
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                # Sum (not mean) per batch so the final division by the
                # dataset size is exact even when the last batch is smaller.
                test_loss += F.cross_entropy(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        accuracy = 100. * correct / len(test_loader.dataset)
        print(f'Epoch {epoch + 1}: Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 512
# Only the training split is shuffled; the test split is read in fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# The three runs below share the same loaders and differ only in the linear
# layer class handed to Network.
# NOTE(review): no seed is reset between the runs, so each model is created
# from whatever RNG state the previous run left behind; the initial weights
# are therefore not identical across the three variants.

# Reference run: ordinary autograd gradients.
print("Training with DefaultLinear")
default_model = Network(DefaultLinear)
train_and_evaluate(default_model, train_loader, test_loader, device)

# Sign-based gradient for the layer input only (see USFGradFunction).
print("Training with USFLinear")
usf_model = Network(USFLinear)
train_and_evaluate(usf_model, train_loader, test_loader, device)

# Sign-based gradients for both the layer input and the weights
# (see SBPGradFunction).
print("Training with SBPLinear")
sbp_model = Network(SBPLinear)
train_and_evaluate(sbp_model, train_loader, test_loader, device)


Training with DefaultLinear
Epoch 1: Test set: Average loss: 0.1188, Accuracy: 96.28%
Epoch 2: Test set: Average loss: 0.0912, Accuracy: 97.02%
Epoch 3: Test set: Average loss: 0.0768, Accuracy: 97.50%
Training with USFLinear
Epoch 1: Test set: Average loss: 0.1541, Accuracy: 95.09%
Epoch 2: Test set: Average loss: 0.0993, Accuracy: 96.91%
Epoch 3: Test set: Average loss: 0.0760, Accuracy: 97.48%
Training with SBPLinear
Epoch 1: Test set: Average loss: 0.5307, Accuracy: 87.57%
Epoch 2: Test set: Average loss: 0.4068, Accuracy: 89.50%
Epoch 3: Test set: Average loss: 0.3631, Accuracy: 90.13%
