In [19]:
import torch
import torch.nn as nn

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
x = torch.ones(5)
print(x, x.shape)
y = torch.zeros(3)  # expected output
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = x @ w + b

loss = nn.functional.binary_cross_entropy_with_logits(z, y)

print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

print('-----------------------------Pytorch Jacobian-----------------------------')

loss.backward()
print(w.grad)
print(b.grad)

print('-----------------------------My own Jacobian-----------------------------')
epsilon = 1e-3 * 0.75 
loss_fn = nn.functional.binary_cross_entropy_with_logits

jacobian = torch.zeros_like(w)


for i in range(w.shape[0]):
    for j in range(w.shape[1]):
        w_pos = w.clone()
        w_neg = w.clone()
        
        w_pos[i, j] += epsilon
        w_neg[i, j] -= epsilon
        
        z_pos = x @ w_pos + b
        z_neg = x @ w_neg + b
        
        loss_pos = loss_fn(z_pos, y)
        loss_neg = loss_fn(z_neg, y)
        
        jacobian[i, j] = (loss_pos - loss_neg) / (2 * epsilon)

print("Jacobian matrix (dL/dW):")
print(jacobian)

tensor([1., 1., 1., 1., 1.]) torch.Size([5])
Gradient function for z = <AddBackward0 object at 0x7f6b574a3610>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f6b574b3970>
-----------------------------Pytorch Jacobian-----------------------------
tensor([[0.0689, 0.0125, 0.2136],
        [0.0689, 0.0125, 0.2136],
        [0.0689, 0.0125, 0.2136],
        [0.0689, 0.0125, 0.2136],
        [0.0689, 0.0125, 0.2136]])
tensor([0.0689, 0.0125, 0.2136])
-----------------------------My own Jacobian-----------------------------
Jacobian matrix (dL/dW):
tensor([[0.0689, 0.0124, 0.2136],
        [0.0689, 0.0124, 0.2136],
        [0.0689, 0.0124, 0.2136],
        [0.0689, 0.0124, 0.2136],
        [0.0689, 0.0124, 0.2136]], grad_fn=<CopySlices>)


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a multi-layer neural network with variable number of layers
class MultiLayerNet(nn.Module):
    def __init__(self, layer_sizes):
        super(MultiLayerNet, self).__init__()
        layers = []
        for i in range(len(layer_sizes) - 1):
            layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
            if i < len(layer_sizes) - 2:
                layers.append(nn.ReLU())  # Adding ReLU activation between layers except last
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

# MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# Create an instance of the network with 2 hidden layers
layer_sizes = [28*28, 64, 32, 10]  # Input size: 28*28 (MNIST images), Hidden sizes: 256, 128, Output size: 10 (classes)
model = MultiLayerNet(layer_sizes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, optimizer, criterion, train_loader, device):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.view(-1, 28*28).to(device), target.to(device)
        
        # Forward pass
        output = model(data)
        loss = criterion(output, target)
        
        # Backward pass using central difference gradient computation
        model.zero_grad()
        layer_gradients = compute_central_difference_gradients(model, data, target, output, loss, criterion)
        print(layer_gradients)
        # Update weights
        for param, grad in zip(model.parameters(), layer_gradients):
            param.grad = grad.to(device)
        
        optimizer.step()
        
        if batch_idx % 100 == 0:
            print(f'Train Epoch: [batch_idx={batch_idx}] Loss: {loss.item()}')

# Function to compute gradients using central difference
def compute_central_difference_gradients(model, data, target, output, loss, criterion, epsilon=1e-3):
    layer_gradients = [torch.zeros_like(param) for param in model.parameters()]
    
    # Ensure we are in training mode for dropout, batch norm, etc.
    model.train()

    # Iterate through layers
    for layer_idx, layer in enumerate(model.layers):
        if isinstance(layer, nn.Linear):
            print(layer_gradients[layer_idx].shape)
            for i in range(layer.weight.shape[0]):
                for j in range(layer.weight.shape[1]):
                    print(i, j)
                    # Perturb weights positively
                    with torch.no_grad():
                        layer.weight.data[i, j] += epsilon
                        z_pos = model(data)
                        loss_pos = criterion(z_pos, target)
                        
                    # Perturb weights negatively
                        layer.weight.data[i, j] -= 2 * epsilon
                        z_neg = model(data)
                        loss_neg = criterion(z_neg, target)
                        
                        # Central difference gradient approximation
                        central_difference_grad = (loss_pos - loss_neg) / (2 * epsilon)

                        
                        # Set the layer_gradients appropriately
                        layer_gradients[layer_idx][i, j] = central_difference_grad
                        

                        layer.weight.data[i, j] += epsilon
            print(layer_gradients[layer_idx])
    return layer_gradients


# Train the model
train(model, optimizer, criterion, train_loader, device)


torch.Size([64, 784])
0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
0 96
0 97
0 98
0 99
0 100
0 101
0 102
0 103
0 104
0 105
0 106
0 107
0 108
0 109
0 110
0 111
0 112
0 113
0 114
0 115
0 116
0 117
0 118
0 119
0 120
0 121
0 122
0 123
0 124
0 125
0 126
0 127
0 128
0 129
0 130
0 131
0 132
0 133
0 134
0 135
0 136
0 137
0 138
0 139
0 140
0 141
0 142
0 143
0 144
0 145
0 146
0 147
0 148
0 149
0 150
0 151
0 152
0 153
0 154
0 155
0 156
0 157
0 158
0 159
0 160
0 161
0 162
0 163
0 164
0 165
0 166
0 167
0 168
0 169
0 170
0 171
0 172
0 173
0 174
0 175
0 176
0 177
0 178
0 179
0 180
0 

KeyboardInterrupt: 