In [733]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.utils.data import DataLoader, Dataset
import copy
import statistics
from datetime import datetime
from sklearn.model_selection import train_test_split
import random

In [734]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Ensure deterministic algorithms
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Toy Example

## Baseline model
regular model as baseline

In [735]:
class Baseline(nn.Module):
    def __init__(self, hyperparams=dict):
        super().__init__()
        self.dimensions = hyperparams["input_dim"] + hyperparams["hidden_dim"] + hyperparams["output_dim"] # list of dimensions
        self.create_model()
    
    def forward(self, x):
        for i, layer in enumerate(self.model):
            x = layer(x)
            # Apply Sigmoid activation for all layers except the last one
            if i < len(self.model) - 1:  # Skip activation for the output layer
                # x = torch.relu(x)
                pass
        return x

    def create_model(self):
        self.model = nn.ModuleList() # initialize module list

        # Create only linear layers, activations handled in forward()
        for i in range(len(self.dimensions) - 1):
            self.model.append(nn.Linear(self.dimensions[i], self.dimensions[i + 1]))


## Custom gradient

In [736]:
# Custom Gradient Function for missing value prediction
class CustomGradient(Function):
    @staticmethod
    def missing(values):
        assert values.shape[0] == 2, "shape incorrect in missing function"
        return values[0] == values[1]

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors  # Retrieve input saved during forward pass
        n_domains = input.shape[1] // 2  # Find the number of domains
        grad_input = grad_output.clone()  # Copy to modify

        # Modify gradients without breaking computation graph
        for i in range(input.shape[0]):
            for j in range(0, n_domains, 2):  # Loop through the entire input matrix
                pair = input[i, j:j+2]  # Check pairs using tensor operations
                if CustomGradient.missing(pair):  # Modify gradient to be 0
                    grad_input[i, j] = 0
                    grad_input[i, j + 1] = 0

        return grad_input

custom_grad = CustomGradient.apply

In [737]:
## define a custom network based on the baseline model
class CustomNet(Baseline):
    def __init__(self, hyperparams=dict):
        super().__init__(hyperparams)

    def forward(self, x):
        x = custom_grad(x)
        for i, layer in enumerate(self.model):
            x = layer(x)
            # Apply Sigmoid activation for all layers except the last one
            if i < len(self.model) - 1:  # Skip activation for the output layer
                # x = torch.relu(x)
                pass
        return x

## testing
I wanna see how the two models will change and be different
- if there's no missing, it should be the same
- if there are missing, confirm how the gradient changes by hand

In [738]:
## set universal hyperparameters
size = 14
lr = 0.01
loss_function = torch.nn.MSELoss()

In [739]:
## create example input
x = torch.rand(1, size)
target_x = x.clone().detach()

x[0, 0] = 0
x[0, 1] = 0
x[0, 4] = 0
x[0, 5] = 0
x[0, 10] = 0
x[0, 11] = 0

gradient_x = x.clone().detach().requires_grad_(True)
baseline_x = x.clone().detach().requires_grad_(True)
target_x, baseline_x, gradient_x

(tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904, 0.6009, 0.2566, 0.7936, 0.9408,
          0.1332, 0.9346, 0.5936, 0.8694, 0.5677]]),
 tensor([[0.0000, 0.0000, 0.3829, 0.9593, 0.0000, 0.0000, 0.2566, 0.7936, 0.9408,
          0.1332, 0.0000, 0.0000, 0.8694, 0.5677]], requires_grad=True),
 tensor([[0.0000, 0.0000, 0.3829, 0.9593, 0.0000, 0.0000, 0.2566, 0.7936, 0.9408,
          0.1332, 0.0000, 0.0000, 0.8694, 0.5677]], requires_grad=True))

In [740]:
## define hyperparameters
hyperparameters = dict()
hyperparameters["input_dim"] = [size]
hyperparameters["output_dim"] = [size]
hyperparameters["hidden_dim"] = [10]

In [741]:
## define baseline model and associated thingies
torch.manual_seed(42)
baseline_model = Baseline(hyperparameters)

baseline_optimizer = torch.optim.Adam(baseline_model.parameters(), lr = lr)

In [742]:
## define custom model and associated thingies
torch.manual_seed(42)
custom_model = CustomNet(hyperparameters)
## loss function and optimizer definition

custom_optimizer = torch.optim.Adam(custom_model.parameters(), lr = lr)

### training

In [743]:
# Define hook function to amplify gradients
def hook_fn(grad):
    # print("Hooked Gradient:", grad)  # Debugging output
    return grad * 10  # Amplify gradients

# Attach hooks to weights and biases
for i, layer in enumerate(custom_model.model):
    if isinstance(layer, nn.Linear):  # Apply to Linear layers only
        layer.weight.register_hook(hook_fn)  # Hook on weights
        layer.bias.register_hook(hook_fn)    # Hook on biases

# # Debugging gradient flow
# def debug_hook(module, grad_input, grad_output):
#     print(f"Layer: {module}, Grad Input: {grad_input}, Grad Output: {grad_output}")

# # Attach debugging hooks to each layer
# for layer in custom_model.model:
#     if isinstance(layer, nn.Linear):
#         layer.register_backward_hook(debug_hook)

In [744]:
## one step training
def train_one_step(model, optimizer, loss_fn, input, target):
    model_output = model(input)
    loss = loss_fn(target, model_output)

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

    return model_output, loss

In [745]:
def train(n_steps, model, optimizer, loss_fn, input, target):
    output = []
    loss = []
    for i in range(n_steps):
        model_output_, loss_ = train_one_step(model, optimizer, loss_fn, input, target)
        output.append(model_output_)
        loss.append(loss_)

    return output, loss

### testing

In [746]:
steps = 10

In [747]:
baseline_output, baseline_loss = train(steps, baseline_model, baseline_optimizer, loss_function, baseline_x, target_x)

In [748]:
print("Is Baseline Input Leaf?", baseline_x.is_leaf)
print("Baseline Input Gradient:", baseline_x.grad)

Is Baseline Input Leaf? True
Baseline Input Gradient: tensor([[-0.0568, -0.0400, -0.0411,  0.0521,  0.2190,  0.0415, -0.4059, -0.0967,
         -0.3956, -0.5788, -0.4435, -0.2656, -0.0733, -0.1300]])


In [749]:
custom_output, custom_loss = train(steps, custom_model, custom_optimizer, loss_function, gradient_x, target_x)

In [750]:
print("Is Custom Input Leaf?", gradient_x.is_leaf)
print("Custom Input Gradient:", gradient_x.grad)

Is Custom Input Leaf? True
Custom Input Gradient: tensor([[ 0.0000,  0.0000, -0.0411,  0.0521,  0.0000,  0.0000, -0.4059, -0.0967,
         -0.3956, -0.5788, -0.4435, -0.2656, -0.0733, -0.1300]])


## Sanity checks - how do these models behave

In [751]:
for step in range(steps):
    print(f"Step {step+1}: Output Difference: {(baseline_output[step] - custom_output[step]).abs().max().item()}")

Step 1: Output Difference: 0.0
Step 2: Output Difference: 1.1175870895385742e-07
Step 3: Output Difference: 1.7881393432617188e-07
Step 4: Output Difference: 1.341104507446289e-07
Step 5: Output Difference: 1.1920928955078125e-07
Step 6: Output Difference: 2.384185791015625e-07
Step 7: Output Difference: 3.2782554626464844e-07
Step 8: Output Difference: 4.0978193283081055e-07
Step 9: Output Difference: 2.980232238769531e-07
Step 10: Output Difference: 3.5762786865234375e-07


In [752]:
baseline_loss

[tensor(0.6675, grad_fn=<MseLossBackward0>),
 tensor(0.5744, grad_fn=<MseLossBackward0>),
 tensor(0.4944, grad_fn=<MseLossBackward0>),
 tensor(0.4233, grad_fn=<MseLossBackward0>),
 tensor(0.3577, grad_fn=<MseLossBackward0>),
 tensor(0.2957, grad_fn=<MseLossBackward0>),
 tensor(0.2368, grad_fn=<MseLossBackward0>),
 tensor(0.1821, grad_fn=<MseLossBackward0>),
 tensor(0.1332, grad_fn=<MseLossBackward0>),
 tensor(0.0920, grad_fn=<MseLossBackward0>)]

In [753]:
custom_loss

[tensor(0.6675, grad_fn=<MseLossBackward0>),
 tensor(0.5744, grad_fn=<MseLossBackward0>),
 tensor(0.4944, grad_fn=<MseLossBackward0>),
 tensor(0.4233, grad_fn=<MseLossBackward0>),
 tensor(0.3577, grad_fn=<MseLossBackward0>),
 tensor(0.2957, grad_fn=<MseLossBackward0>),
 tensor(0.2368, grad_fn=<MseLossBackward0>),
 tensor(0.1821, grad_fn=<MseLossBackward0>),
 tensor(0.1332, grad_fn=<MseLossBackward0>),
 tensor(0.0920, grad_fn=<MseLossBackward0>)]

In [754]:
print("Outputs Comparison:")
print("Baseline Output:", baseline_output[-1])
print("Gradient Output:", custom_output[-1])
print("Difference:", (baseline_output[-1] - custom_output[-1]).abs().max())

Outputs Comparison:
Baseline Output: tensor([[0.8508, 0.6859, 0.3533, 0.2704, 0.4322, 0.4066, 0.2286, 0.4641, 0.3173,
         0.0766, 0.6813, 0.8757, 0.6688, 0.3809]], grad_fn=<AddmmBackward0>)
Gradient Output: tensor([[0.8508, 0.6859, 0.3533, 0.2704, 0.4322, 0.4066, 0.2286, 0.4641, 0.3173,
         0.0766, 0.6813, 0.8757, 0.6688, 0.3809]], grad_fn=<AddmmBackward0>)
Difference: tensor(3.5763e-07, grad_fn=<MaxBackward1>)


In [755]:
# Compare input gradients
print("Gradients Comparison:")
print("Baseline Input Gradient:", baseline_x.grad)
print("Gradient Input Gradient:", gradient_x.grad)
print("Gradient Difference:", (baseline_x.grad - gradient_x.grad).abs().max())

Gradients Comparison:
Baseline Input Gradient: tensor([[-0.0568, -0.0400, -0.0411,  0.0521,  0.2190,  0.0415, -0.4059, -0.0967,
         -0.3956, -0.5788, -0.4435, -0.2656, -0.0733, -0.1300]])
Gradient Input Gradient: tensor([[ 0.0000,  0.0000, -0.0411,  0.0521,  0.0000,  0.0000, -0.4059, -0.0967,
         -0.3956, -0.5788, -0.4435, -0.2656, -0.0733, -0.1300]])
Gradient Difference: tensor(0.2190)


In [756]:
# Enable gradients for inputs
baseline_input = baseline_x.clone().detach().requires_grad_(True)
gradient_input = gradient_x.clone().detach().requires_grad_(True)

for i, (b_layer, g_layer) in enumerate(zip(baseline_model.model, custom_model.model)):
    # Forward pass
    baseline_input = b_layer(baseline_input)
    gradient_input = g_layer(gradient_input)

    # Retain gradients for intermediate outputs
    baseline_input.retain_grad()
    gradient_input.retain_grad()

    # Print output differences layer-by-layer
    print(f"Layer {i} Output Difference: {(baseline_input - gradient_input).abs().max().item()}")

# Compute MSE Loss
baseline_loss = F.mse_loss(baseline_input, target_x)
gradient_loss = F.mse_loss(gradient_input, target_x)

# Backward pass for gradients
baseline_loss.backward(retain_graph=True)
gradient_loss.backward(retain_graph=True)

# Compare gradients for weights and biases layer-by-layer
for j, (b_layer, g_layer) in enumerate(zip(baseline_model.model, custom_model.model)):
    if isinstance(b_layer, nn.Linear):  # Only compare Linear layers
        # Weight gradient difference
        weight_diff = (b_layer.weight.grad - g_layer.weight.grad).abs().max().item()
        # Bias gradient difference
        bias_diff = (b_layer.bias.grad - g_layer.bias.grad).abs().max().item()

        print(f"Layer {j} Weight Gradient Difference: {weight_diff}")
        print(f"Layer {j} Bias Gradient Difference: {bias_diff}")


Layer 0 Output Difference: 4.172325134277344e-07
Layer 1 Output Difference: 3.8743019104003906e-07
Layer 0 Weight Gradient Difference: 0.9944490194320679
Layer 0 Bias Gradient Difference: 1.0366342067718506
Layer 1 Weight Gradient Difference: 1.7068126201629639
Layer 1 Bias Gradient Difference: 1.6445785760879517


In [757]:
baseline_weight = baseline_model.model[0].weight
gradient_weight = custom_model.model[0].weight

baseline_bias = baseline_model.model[0].bias
gradient_bias = custom_model.model[0].bias

print("Weight Difference:", (baseline_weight - gradient_weight).abs().max().item())
print("Bias Difference:", (baseline_bias - gradient_bias).abs().max().item())

Weight Difference: 2.5331974029541016e-07
Bias Difference: 8.940696716308594e-08


Bad pipe message: %s [b'nc\x12H\xd7k\x0fmm\xfe~\x94\r\xd2.\xa8\xc1O\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00=\x00>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00\x9e\x00\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xba\x00\xbb\x00\xbc\x00\xbd\x00\xbe\x00\xbf\x00\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x13\x01\x13\x02\x13\x03\x13']
Bad pipe message: %s [b'\x05\xc0\x01', b"\xc0\x03\xc0\x04\xc0\x05\xc0\x06\xc0\x07\xc0\x08\xc0\t\xc0\n\xc0\x0b\xc0\x0c\xc0\r\xc0\x0e\xc0\x0f\xc0\x10\xc0\x11\xc0\x12\xc0\x13\xc0\x14\xc0\x15\xc0\x16\xc0\x17\xc0\x18\xc0\x19\xc0#\xc0$\xc