In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [17]:
from numpy.random import randn
import numpy as np
import os

In [18]:
# Problem dimensions, one per line:
N = 16      # number of samples (rows of x and y)
D_in = 6    # input features (columns of x, rows of w1)
H = 5       # hidden units (columns of w1, rows of w2)
D_out = 2   # output features (columns of w2 and y)

In [19]:
# Each array is drawn right after reseeding NumPy's global RNG with its own
# fixed seed, so x and y are reproducible independently of one another.
np.random.seed(1)
x = np.random.randn(N, D_in)    # inputs, shape (N, D_in)

np.random.seed(2)
y = np.random.randn(N, D_out)   # targets, shape (N, D_out)

In [20]:
def _seeded_randn(seed, *shape):
    """Reseed NumPy's global RNG with `seed`, then draw a standard-normal array of `shape`."""
    np.random.seed(seed)
    return randn(*shape)


# Same seed/draw order as before: w1 (3), b1 (23), w2 (4), b2 (24).
w1 = _seeded_randn(3, D_in, H)
b1 = _seeded_randn(23, H)
w2 = _seeded_randn(4, H, D_out)
b2 = _seeded_randn(24, D_out)

# Cell output: the four shapes, in the order w1, b1, w2, b2.
tuple(arr.shape for arr in (w1, b1, w2, b2))

((6, 5), (5,), (5, 2), (2,))

In [218]:
use_bias = False


class Net(nn.Module):
    """Two-layer perceptron: Linear -> sigmoid -> Linear.

    Args:
        d_in: number of input features (default 6).
        hidden: number of hidden units (default 5).
        d_out: number of output features (default 2).
        bias: whether both linear layers get a bias term. When left as
            ``None`` the module-level ``use_bias`` flag is read at
            construction time, so ``Net()`` behaves exactly as before.
    """

    def __init__(self, d_in=6, hidden=5, d_out=2, bias=None):
        super().__init__()
        if bias is None:
            # Looked up here (not in the signature) so that changing the
            # global flag between instantiations is still honoured.
            bias = use_bias
        self.l1 = nn.Linear(d_in, hidden, bias=bias)
        self.l2 = nn.Linear(hidden, d_out, bias=bias)

    def forward(self, x):
        """Return ``l2(sigmoid(l1(x)))`` for an input whose last dim is ``d_in``."""
        hidden = torch.sigmoid(self.l1(x))
        return self.l2(hidden)

In [223]:
class DAD(nn.Module):
    """Wrap a model and record the activations of its direct children.

    During each ``forward`` call a forward hook is attached to every direct
    child of ``model``; the hook prints the child's input and output and
    stores them. The hooks are removed again when the call finishes, so
    repeated calls record each child exactly once per call (previously a new
    set of hooks was stacked on every call and never removed).

    Args:
        model: the ``nn.Module`` whose direct children are observed.
    """

    def __init__(self, model):
        super(DAD, self).__init__()
        # Per-child inputs (tuples, as passed to the hook) and outputs, in the
        # order the children ran. Both lists keep growing across forward
        # calls; nothing in this class clears them.
        self.in_activations = []
        self.out_activations = []
        self.model = model

    def forward(self, *inputs, **kwargs):
        """Run the wrapped model with recording hooks attached for this call only."""
        handles = [
            child.register_forward_hook(self.hook_wrapper('forward', name))
            for name, child in self.model.named_children()
        ]
        try:
            return self.model(*inputs, **kwargs)
        finally:
            # Always detach, even if the model raises, so hooks never pile up.
            for handle in handles:
                handle.remove()

    def hook_wrapper(self, hook_type, layer):
        """Build the forward hook that prints and records one child's activations.

        ``hook_type`` and ``layer`` are accepted for the caller's benefit but
        are not used by the returned hook.
        """
        def fw_hook(a, in_act, out_act):
            print(f'----IN----------------------')
            print(in_act)
            print(f'{out_act.shape}----OUT----------------------')
            print(out_act)
            self.in_activations.append(in_act)
            self.out_activations.append(out_act)

        return fw_hook

In [233]:
# Instantiate the two-layer network with its default configuration.
net = Net()

In [230]:
# Display the network's parameter tensors.
list(net.parameters())

[Parameter containing:
 tensor([[ 0.2008,  0.2095,  0.0986,  0.0346,  0.1878,  0.3414],
         [-0.2846,  0.3429,  0.1499, -0.3898,  0.0625, -0.2702],
         [-0.3645, -0.0525,  0.3633,  0.2582,  0.2732, -0.3415],
         [-0.1669, -0.3569,  0.3770, -0.0248, -0.0676, -0.1150],
         [ 0.0028,  0.0976,  0.1065,  0.3120, -0.0106,  0.0151]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.1608, -0.4034,  0.0235,  0.2031, -0.1082],
         [-0.1566, -0.2567,  0.1993,  0.0855, -0.3974]], requires_grad=True)]

In [225]:
# for (n, m), w in zip(net.named_parameters(), [w1, w2]):
#     if 'bias' in n:
#         m.data = torch.FloatTensor(w)
#     elif 'weight' in n:
#         m.data = torch.FloatTensor(w.T)

In [234]:
# Adam optimizer over all of net's parameters, learning rate 0.1.
# NOTE(review): no optim.step() appears in this part of the notebook.
optim = torch.optim.Adam(net.parameters(), lr=0.1)
# Reset the parameters' gradients before the backward pass.
optim.zero_grad()

In [241]:
# NOTE(review): this cell's execution count (241) is later than the forward (239)
# and backward (240) cells that appear below it. Run top-to-bottom on a fresh
# kernel, no backward pass has happened yet at this point, so these .grad
# values would presumably be None rather than the tensors shown.
[p.grad for p in net.parameters()]

[tensor([[ 0.7320, -0.7529, -0.7772, -0.6544,  0.0859, -0.8904],
         [ 0.4953, -0.4150, -0.5639, -0.4966,  0.0444, -0.4534],
         [-0.9351,  0.5704,  0.3056,  0.7422, -0.3246,  0.8923],
         [-0.7102,  0.4459, -0.1479,  0.6183, -0.4304,  0.7054],
         [ 0.8702, -0.5116,  0.1916, -0.5230,  0.5253, -0.9120]]),
 tensor([[ 7.7468,  7.6690,  6.5837,  4.5025,  7.6624],
         [-4.8424, -4.6252, -4.7298, -2.6999, -2.7696]])]

In [239]:
# Forward pass: wrap the NumPy inputs in a FloatTensor and run the network.
o = net(torch.FloatTensor(x))

In [240]:
# Sum-of-squared-errors loss between the network output and the targets y.
loss = torch.square(o-torch.Tensor(y)).sum()
# Backpropagate to populate .grad on the network's parameters.
loss.backward()

In [194]:
# NOTE(review): `out_activations` is defined by DAD, not by Net, so this only
# works when `net` is a DAD-wrapped model; with the plain `net = Net()` from the
# earlier cell this attribute lookup would raise AttributeError.
net.out_activations

[tensor([[-3.0949e-01,  6.7701e-01, -2.7613e-01,  6.1981e-01, -1.1317e-01],
         [-5.4146e-01,  9.5455e-01, -4.4751e-02,  7.4017e-01,  1.1374e-01],
         [-5.1835e-02, -6.1387e-01, -4.1699e-01, -7.7695e-02,  1.9880e-01],
         [-3.8336e-01,  8.9481e-01, -1.8748e-01,  2.8392e-01, -3.6653e-01],
         [-4.2680e-01, -3.5746e-01, -1.2701e-01,  6.0326e-01, -2.2469e-01],
         [ 5.0284e-02, -4.0308e-01, -5.9826e-01, -9.1199e-02, -5.0808e-01],
         [-1.6316e-01, -1.7522e-01, -4.6604e-02, -3.2974e-01,  2.5828e-01],
         [-6.3542e-02, -8.3247e-01, -5.3156e-01, -1.4438e-01,  3.1628e-01],
         [-2.6280e-01, -5.6114e-01,  1.3881e-01,  1.3879e-01,  4.9511e-03],
         [-4.7197e-01,  2.0723e-04,  1.0304e-01,  1.7678e-01,  1.8024e-01],
         [ 3.0157e-03, -2.6942e-01, -4.8579e-01, -2.8131e-01,  3.6532e-01],
         [-4.4944e-01, -1.5067e+00,  5.8814e-01,  3.4188e-01,  9.8081e-01],
         [-4.4859e-02, -1.1928e+00, -5.0857e-01,  9.1092e-02,  3.5961e-01],
         [-2

In [195]:
# Hand-written forward/backward pass for the bias-free sigmoid MLP in NumPy.
# The trailing `break` stops after the first of the 10 scheduled iterations.
for t in range(10):
    # Forward: sigmoid hidden layer followed by a linear output layer.
    h = 1 / (1 + np.exp(-(x @ w1)))
    print('h:', h.shape)
    y_pred = h @ w2
    loss1 = np.sum(np.square(y_pred - y))

    # d(loss)/d(y_pred) for the sum-of-squares loss.
    grad_y_pred = 2.0 * (y_pred - y)
    print('grad_y_pred:', grad_y_pred.shape)

    # Back through the output layer.
    grad_w2 = h.T @ grad_y_pred
    print('grad_w2: ', grad_w2.shape)

    # Back through the sigmoid, whose derivative is h * (1 - h).
    grad_h = grad_y_pred @ w2.T
    print('grad_h: ', grad_h.shape)
    grad_w1 = x.T @ (grad_h * h * (1 - h))
    print('grad_w1:', grad_w1.shape)

    # In-place gradient-descent update with step size 1e-4.
    w1 -= 1e-4 * grad_w1
    w2 -= 1e-4 * grad_w2
    break

h: (16, 5)
grad_y_pred: (16, 2)
grad_w2:  (5, 2)
grad_h:  (16, 5)
grad_w1: (6, 5)


In [196]:
# NOTE(review): calling grad() with no arguments raises the TypeError shown in
# this cell's output; it requires at least `outputs` and `inputs`.
torch.autograd.grad()

TypeError: grad() missing 2 required positional arguments: 'outputs' and 'inputs'

In [199]:
# Gradient of the loss with respect to every recorded activation, walking the
# recorded list from its last entry back to its first.
grads = []
for op in reversed(net.out_activations):
    # retain_graph=True keeps the graph alive for the next iteration's query.
    grads.append(
        torch.autograd.grad(outputs=loss, inputs=op, retain_graph=True)[0]
    )
    print('------------')
grads

------------
------------


[tensor([[-0.3580,  0.9204],
         [ 3.1076, -2.4235],
         [ 2.3793,  2.2546],
         [-2.0858,  3.2493],
         [ 0.8612,  2.5231],
         [-2.2808, -4.0159],
         [-1.1597,  2.7714],
         [-2.3025,  1.7549],
         [-1.1387, -1.7888],
         [ 0.3678,  0.6395],
         [ 0.6054,  0.8864],
         [-1.7900,  2.5088],
         [-0.6257,  1.0459],
         [ 0.0955,  3.0421],
         [ 1.6844,  1.0190],
         [-0.6838, -3.7083]]),
 tensor([[-0.0482,  0.0227, -0.0753,  0.1077,  0.0257],
         [ 0.0352,  0.0271,  0.2937, -0.4783, -0.1163],
         [-0.2594,  0.1949, -0.0482, -0.0638, -0.0102],
         [-0.1343,  0.0424, -0.3032,  0.4970,  0.1059],
         [-0.2043,  0.1506, -0.1320,  0.1136,  0.0287],
         [ 0.3788, -0.2783,  0.1579, -0.0986, -0.0241],
         [-0.1442,  0.0722, -0.2346,  0.3555,  0.0781],
         [-0.0251, -0.0227, -0.2005,  0.3996,  0.0835],
         [ 0.1711, -0.1247,  0.0713, -0.0305, -0.0087],
         [-0.0573,  0.0464, -0

In [92]:
# Display the gradient w.r.t. the predictions from the manual NumPy step
# (presumably to compare against the autograd result above — confirm).
grad_y_pred

array([[-0.14128343, -2.91890857],
       [ 2.57515734, -5.18890603],
       [ 0.82365203, -0.85523361],
       [-1.73909922,  2.8385638 ],
       [ 0.47695332, -0.42796761],
       [-2.07359372, -6.39049049],
       [-2.46923772,  1.58436757],
       [-3.14404445,  0.38541964],
       [-1.38512863, -2.64496736],
       [-1.07988552,  0.66171606],
       [-0.72291186, -0.88400441],
       [-3.08388612,  0.29422503],
       [-1.28009607, -2.52365231],
       [-0.16070638,  1.55309877],
       [ 1.76819895,  0.30772287],
       [-0.9863842 , -6.1461239 ]])

In [157]:
def a(_):
    """Print the given value to standard output; returns None."""
    print(_)
    return None

In [159]:
# Call the helper with 10; it prints the value.
a(10)

10


In [165]:
# NOTE(review): the output below shows `net` as a wrapper whose only child is
# `model` (a Net with bias=True). That differs from the `net = Net()` and
# `use_bias = False` cells earlier in the notebook, so this output appears to
# come from a different kernel state — confirm before relying on it.
list(net.named_children())

[('model',
  Net(
    (l1): Linear(in_features=6, out_features=5, bias=True)
    (l2): Linear(in_features=5, out_features=2, bias=True)
  ))]

In [185]:
# Collect the direct child modules of the wrapped model. This needs `net` to
# expose a `.model` attribute (as the DAD wrapper does); a bare Net does not.
m = list(net.model.children())