In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [55]:
from numpy.random import randn
import numpy as np
import os

In [56]:
N, D_in, H, D_out = 16, 6, 5, 2

In [57]:
mid = N//2

In [58]:
np.random.seed(1)
x = randn(N, D_in)
np.random.seed(2)
y = randn(N, D_out)

x1, x2 = x[:mid], x[mid:]
y1, y2 = y[:mid], y[mid:]

In [95]:
np.random.seed(3)
w1 = randn(D_in, H) 
np.random.seed(23)
b1 = randn(H)
np.random.seed(4)
w2 = randn(H, D_out)
np.random.seed(24)
b2 = randn(D_out)
w1.shape, b1.shape, w2.shape, b2.shape

((6, 5), (5,), (5, 2), (2,))

In [117]:
use_bias = True
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(6, 5, bias=use_bias)
        self.l2 = nn.Linear(5, 2, bias=use_bias)
        
    def forward(self, x):
        x = torch.sigmoid(self.l1(x))
        x= self.l2(x)
        return x

In [97]:
class DAD(nn.Module):
    def __init__(self, model):
        super(DAD, self).__init__()
        self.grads = []
        self.activations = []
        self.model = model
        
    def forward(self, *inputs, **kwargs):
        childrens = dict(self.model.named_children())
        for k, ch in childrens.items():
            ch.register_forward_hook(self.hook_wrapper('forward', k))
            ch.register_backward_hook(self.hook_wrapper('backward', k))
        return self.model(*inputs, **kwargs)
    
    def hook_wrapper(self, hook_type, layer):
        def hook_save(a, in_grad, out_grad):
#             print(hook_type, layer, a, in_grad, out_grad)
            if hook_type.lower() == 'forward':
                for i, b in enumerate(in_grad):
                    if b is not None:
                        self.activations.append(b)
                    break
            if hook_type.lower()=='backward':
                for i, c in enumerate(out_grad):
                    if c is not None:
                        self.grads.append(c)
                    break
        return hook_save

In [118]:
net = DAD(Net())

In [122]:
ks = list(dict(net.named_parameters()).keys())

In [130]:
ks[::2]

['model.l1.weight', 'model.l2.weight']

In [132]:
ks[::-2]

['model.l2.bias', 'model.l1.bias']

In [134]:
ks

['model.l1.weight', 'model.l1.bias', 'model.l2.weight', 'model.l2.bias']

In [136]:
list(net.model.named_children())

[('l1', Linear(in_features=6, out_features=5, bias=True)),
 ('l2', Linear(in_features=5, out_features=2, bias=True))]

In [100]:
for (n, m), w in zip(net.named_parameters(), [w1, w2]):
    if 'bias' in n:
        m.data = torch.FloatTensor(w)
    elif 'weight' in n:
        m.data = torch.FloatTensor(w.T)

In [101]:
optim = torch.optim.Adam(net.parameters(), lr=0.1)
optim.zero_grad()

In [102]:
o = net(torch.FloatTensor(x))

In [103]:
loss = torch.square(o-torch.Tensor(y)).sum()

In [104]:
loss.backward()

In [105]:
net.activations

[tensor([[ 1.6243, -0.6118, -0.5282, -1.0730,  0.8654, -2.3015],
         [ 1.7448, -0.7612,  0.3190, -0.2494,  1.4621, -2.0601],
         [-0.3224, -0.3841,  1.1338, -1.0999, -0.1724, -0.8779],
         [ 0.0422,  0.5828, -1.1006,  1.1447,  0.9016,  0.5025],
         [ 0.9009, -0.6837, -0.1229, -0.9358, -0.2679,  0.5304],
         [-0.6917, -0.3968, -0.6872, -0.8452, -0.6712, -0.0127],
         [-1.1173,  0.2344,  1.6598,  0.7420, -0.1918, -0.8876],
         [-0.7472,  1.6925,  0.0508, -0.6370,  0.1909,  2.1003],
         [ 0.1202,  0.6172,  0.3002, -0.3522, -1.1425, -0.3493],
         [-0.2089,  0.5866,  0.8390,  0.9311,  0.2856,  0.8851],
         [-0.7544,  1.2529,  0.5129, -0.2981,  0.4885, -0.0756],
         [ 1.1316,  1.5198,  2.1856, -1.3965, -1.4441, -0.5045],
         [ 0.1600,  0.8762,  0.3156, -2.0222, -0.3062,  0.8280],
         [ 0.2301,  0.7620, -0.2223, -0.2008,  0.1866,  0.4101],
         [ 0.1983,  0.1190, -0.6707,  0.3776,  0.1218,  1.1295],
         [ 1.1989,  0.185

In [106]:
net.grads

[tensor([[-0.1465, -2.9575],
         [ 2.5708, -5.2243],
         [ 0.8088, -0.8860],
         [-1.7431,  2.8321],
         [ 0.4652, -0.4630],
         [-2.0848, -6.4180],
         [-2.4813,  1.5732],
         [-3.1565,  0.3753],
         [-1.3973, -2.6701],
         [-1.0906,  0.6543],
         [-0.7350, -0.8989],
         [-3.1005,  0.2585],
         [-1.2938, -2.5556],
         [-0.1695,  1.5346],
         [ 1.7610,  0.2945],
         [-0.9949, -6.1776]]),
 tensor([[-1.1275e-02, -4.1676e-01,  1.4981e-01, -7.6879e-02,  7.5620e-01],
         [-1.3308e-01, -1.5006e+00,  1.5181e-01, -9.4760e-01,  1.4349e+00],
         [-9.0469e-02, -2.8171e-01,  5.7864e-02, -1.0170e-01,  1.4345e-01],
         [ 3.0594e-01,  5.5226e-01, -4.4375e-01,  5.4646e-01, -2.3724e-01],
         [-1.6758e-02, -1.3668e-01,  8.7033e-02, -2.0798e-02,  1.1853e-01],
         [-6.6561e-01, -5.8186e-01,  2.7518e+00, -5.2305e-01,  7.1794e-01],
         [ 1.5528e-02,  8.9019e-01, -3.6311e-01,  4.7791e-03, -6.5439e-01],
  

In [934]:
# [p.grad for p in net.parameters()]

In [908]:
# g = grad_h * h * (1-h)
# g.sum(0)

In [909]:
# b1, b2

In [107]:
for t in range(10):
    h = 1/(1+np.exp(-x.dot(w1)))
    print('h:', h.shape)
    
    y_pred = h.dot(w2)
    loss = np.square(y_pred-y).sum()
#     print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    print('grad_y_pred:', grad_y_pred.shape)
    
    grad_w2 = h.T.dot(grad_y_pred)
    print('grad_w2: ', grad_w2.shape)
    
    
    grad_h = grad_y_pred.dot(w2.T)
    print('grad_h: ', grad_h.shape)
    grad_w1 = x.T.dot(grad_h * h * (1-h))
    print('grad_w1:', grad_w1.shape)
    
    w1 -= 1e-4 * grad_w1
    w2 -= 1e-4 * grad_w2
    break

h: (16, 5)
grad_y_pred: (16, 2)
grad_w2:  (5, 2)
grad_h:  (16, 5)
grad_w1: (6, 5)


In [109]:
grad_y_pred.shape

(16, 2)

In [111]:
h.shape

(16, 5)

In [112]:
[p.shape for p in net.parameters()]

[torch.Size([5, 6]), torch.Size([2, 5])]

In [115]:
net.activations[0].shape

torch.Size([16, 6])

In [935]:
net.activations

[tensor([[ 1.6243, -0.6118, -0.5282, -1.0730,  0.8654, -2.3015],
         [ 1.7448, -0.7612,  0.3190, -0.2494,  1.4621, -2.0601],
         [-0.3224, -0.3841,  1.1338, -1.0999, -0.1724, -0.8779],
         [ 0.0422,  0.5828, -1.1006,  1.1447,  0.9016,  0.5025],
         [ 0.9009, -0.6837, -0.1229, -0.9358, -0.2679,  0.5304],
         [-0.6917, -0.3968, -0.6872, -0.8452, -0.6712, -0.0127],
         [-1.1173,  0.2344,  1.6598,  0.7420, -0.1918, -0.8876],
         [-0.7472,  1.6925,  0.0508, -0.6370,  0.1909,  2.1003],
         [ 0.1202,  0.6172,  0.3002, -0.3522, -1.1425, -0.3493],
         [-0.2089,  0.5866,  0.8390,  0.9311,  0.2856,  0.8851],
         [-0.7544,  1.2529,  0.5129, -0.2981,  0.4885, -0.0756],
         [ 1.1316,  1.5198,  2.1856, -1.3965, -1.4441, -0.5045],
         [ 0.1600,  0.8762,  0.3156, -2.0222, -0.3062,  0.8280],
         [ 0.2301,  0.7620, -0.2223, -0.2008,  0.1866,  0.4101],
         [ 0.1983,  0.1190, -0.6707,  0.3776,  0.1218,  1.1295],
         [ 1.1989,  0.185

In [742]:
0.32010551 - 0.3265

-0.00639449000000003

In [786]:
1/0.0048000000000000265

208.33333333333218

In [143]:
parm = dict(net.model.named_parameters())
parm.keys()

dict_keys(['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias'])

In [141]:
child = list(dict(net.model.named_children()).keys())
child

['l1', 'l2']

In [148]:
for c in child[::-1]:
    print(parm.get(f"{c}.bias"))

Parameter containing:
tensor([0.0944, 0.2803], requires_grad=True)
Parameter containing:
tensor([ 0.1736,  0.0088, -0.3068,  0.2007, -0.0979], requires_grad=True)


In [31]:
t1 = torch.randn(16, 784)
t2 = torch.randn(16, 784)