# Does nn.Conv2d init work well?

In [1]:
#export
from exp.matmul import *

In [2]:
#export
def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    
    return map(tensor, (x_train, y_train, x_valid, y_valid))


def normalize(x, mu, sig): return (x - mu) / sig

In [3]:
x_train, y_train, x_valid, y_valid = get_data()

In [4]:
train_mean, train_sig = x_train.mean(), x_train.std()

In [5]:
x_train = normalize(x_train, train_mean, train_sig)
x_valid = normalize(x_valid, train_mean, train_sig)

In [6]:
x_train = x_train.view(-1, 1, 28, 28)
x_valid = x_valid.view(-1, 1, 28, 28)

x_train.shape, x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [7]:
n, *_ = x_train.shape
c = y_train.max() + 1
nh = 32
n, c

(50000, tensor(10))

In [8]:
l1 = torch.nn.Conv2d(1, nh, 5)  # 5 x 5 kernel

In [9]:
x = x_valid[:100]  # Let's grab the first 100 images

In [10]:
x.shape

torch.Size([100, 1, 28, 28])

In [11]:
def stats(x): return x.mean(), x.std()

In [12]:
stats(l1.weight), stats(l1.bias)

((tensor(0.0018, grad_fn=<MeanBackward0>),
  tensor(0.1127, grad_fn=<StdBackward0>)),
 (tensor(0.0015, grad_fn=<MeanBackward0>),
  tensor(0.1120, grad_fn=<StdBackward0>)))

In [13]:
#export
from torch.nn import init
from torch import nn

In [14]:
t = l1(x)

In [15]:
stats(t)

(tensor(0.0062, grad_fn=<MeanBackward0>),
 tensor(0.6202, grad_fn=<StdBackward0>))

The mean is 0, but the std isn't 1. Let's compare this to the Normal Kaiming init.

In [16]:
init.kaiming_normal_(l1.weight, a=1.)
stats(l1(x))

(tensor(0.0050, grad_fn=<MeanBackward0>),
 tensor(1.1172, grad_fn=<StdBackward0>))

In [17]:
import torch.nn.functional as F

In [18]:
def f1(x, a=0): return F.leaky_relu(l1(x), a)

In [20]:
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5419, grad_fn=<MeanBackward0>),
 tensor(1.0342, grad_fn=<StdBackward0>))

The variance is about 1, which is good! But if we initialize with the default initializer, then the std isn't 1.

In [21]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [22]:
# Get the receptive filter size
rec_fs = l1.weight[0, 0].numel()  # should get us 5 x 5
rec_fs

25

In [25]:
nf, ni, *_ = l1.weight.shape
nf, ni

(32, 1)

In [26]:
fan_in = ni * rec_fs
fan_out = nf * rec_fs
fan_in, fan_out

(25, 800)

In [27]:
def gain(a): return math.sqrt(2.0 / (1 + a ** 2))

In [29]:
(gain(1),  # linear, no activation
gain(0),  # standard relu--this was shown in the Kaiming He paper
gain(0.01), # leaky relu
gain(0.1), 
gain(math.sqrt(5)))  # PyTorch init--it's quite far away

(1.0,
 1.4142135623730951,
 1.4141428569978354,
 1.4071950894605838,
 0.5773502691896257)

But PyTorch uses Kaiming uniform, not Kaiming normal. Now if you take a uniform[-1, 1], then the standard deviation is $\frac{1}{\sqrt{3}}$. So part of the reason for the difference is that in PyTorch, they needed a gain to handle uniform random numbers rather than just normal random numbers. But it still doesn't quite account for the difference.

In [30]:
1 / math.sqrt(3.)

0.5773502691896258

In [33]:
def kaiming2(x, a, use_fan_out=False):
    nf, ni, *_ = x.shape
    rec_fs = x[0, 0].numel()
    fan = nf * rec_fs if use_fan_out else ni * rec_fs
    std = gain(a) / math.sqrt(fan)  # so far the same as above
    
    bound = math.sqrt(3.) * std  # because of the uniform random numbers
    x.data.uniform_(-bound, bound)

In [34]:
kaiming2(l1.weight, a=0);
stats(f1(x))

(tensor(0.5482, grad_fn=<MeanBackward0>),
 tensor(1.0469, grad_fn=<StdBackward0>))

This looks good.

In [35]:
kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))

(tensor(0.2288, grad_fn=<MeanBackward0>),
 tensor(0.4199, grad_fn=<StdBackward0>))

And now you get the same as the PyTorch default. So what does this mean? Let's throw together a quick conv net.

In [36]:
class Flatten(nn.Module):
    def forward(self,x): return x.view(-1)

In [37]:
m = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)

In [38]:
y = y_valid[:100].float()

In [39]:
t = m(x)
stats(t)

(tensor(-0.0515, grad_fn=<MeanBackward0>),
 tensor(0.0104, grad_fn=<StdBackward0>))

So what happens if we put the inputs through a 4-layer conv net? We get a std of 0.01. That sounds like a really big problem--there's so little variation between the elements. And there's a big difference between the first layer and the last layer--that's the really big issue. 

In [41]:
#export
def mse(output, target): return (output.squeeze(-1) - target).pow(2).mean()

In [42]:
l = mse(t,y)
l.backward()

Now let's get the stats on the gradients for the first layer weights:

In [43]:
stats(m[0].weight.grad)

(tensor(-0.0071), tensor(0.0277))

So let's try using Kaiming uniform:

In [44]:
init.kaiming_uniform_??

In [45]:
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [46]:
t = m(x)
stats(t)

(tensor(-0.4416, grad_fn=<MeanBackward0>),
 tensor(0.2714, grad_fn=<StdBackward0>))

It's not 1, but it's a lot better than 0.01

In [47]:
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(0.0344), tensor(0.4495))

In [48]:
!python3 notebook2script.py 02a_why_sqrt5.ipynb

Converted 02a_why_sqrt5.ipynb to why.py
