# Code sample from the article

This notebook contains the code used in the "How to initialize a neural net?" part of the article.

In [1]:
import torch
import torch.nn.functional as F
import math

Why it's important to initialize a neural net properly

In [2]:
# Input vector: 512 values drawn from a standard normal distribution (mean 0, std 1)
x = torch.randn(512)

# A single fully connected layer with 512 inputs and 512 outputs (512x512 weight matrix)
linear = torch.nn.Linear(512, 512)

# Overwrite the layer's weights with draws from a normal distribution (mean 0, std 1)
torch.nn.init.normal_(linear.weight, mean=0.0, std=1.0)

# Simulate the forward pass of a 50-layer net by applying the same layer 50 times
for i in range(50):
    pre_activation = linear(x)
    x = F.relu(pre_activation)

    # Only report the statistics every 10th layer
    if i % 10 != 0:
        continue
    print(f'Layer {i}: {x.mean()},{x.std()} ')

# The activations blow up during the forward pass and end up as nan (not a number)
(x.mean(), x.std())

Layer 0: 8.602901458740234,12.32577133178711 
Layer 10: 10889875947520.0,16345840222208.0 
Layer 20: 1.4060994929618463e+25,2.119802061207111e+25 
Layer 30: inf,2.1571706605529397e+37 
Layer 40: nan,nan 


(tensor(nan, grad_fn=<MeanBackward0>), tensor(nan, grad_fn=<StdBackward0>))

In [3]:
# x is an input vector (random with mean of 0 and standard deviation of 1)
x = torch.randn(512)

# linear is a fully connected layer with 512 inputs and 512 outputs (512x512 weight matrix)
linear = torch.nn.Linear(512, 512)
# First draw the weights from a normal distribution (mean 0, std 1) ...
torch.nn.init.normal_(linear.weight, mean=0, std=1)

# ... then multiply them by math.sqrt(2/512), i.e. sqrt(2 / number of inputs).
# The scaling is done in place under no_grad so that `linear.weight` stays the
# same Parameter object instead of being replaced by a new one.
with torch.no_grad():
    linear.weight.mul_(math.sqrt(2/512))

# Optimizer over the layer's parameters; it is not stepped anywhere in this cell.
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01, momentum=0.9)


# We simulate the forward pass of a neural net with 50 layers
for i in range(50):
    x = F.relu(linear(x))

    # We check the stats at some layers
    if i % 10 == 0:
        print(f'Layer {i}: {x.mean()},{x.std()} ')

# With the rescaled weights the activations stay finite (no inf / nan results)
x.mean(),x.std()

Layer 0: 0.5577335953712463,0.7815624475479126 
Layer 10: 0.5380088686943054,0.7883701920509338 
Layer 20: 0.3430079519748688,0.5078677535057068 
Layer 30: 0.205487459897995,0.30858924984931946 
Layer 40: 0.12987178564071655,0.19700129330158234 


(tensor(0.0898, grad_fn=<MeanBackward0>),
 tensor(0.1362, grad_fn=<StdBackward0>))

In [4]:
# torch.randn draws numbers from a standard normal distribution (mean of 0 and std of 1)
x = torch.randn(512)
W = torch.randn(512, 512)

# Each element of y is a sum of 512 products; every product pairs one input of x
# with one weight of W (both independent and drawn from a standard normal distribution)
y = torch.matmul(x, W)

# What is the variance of a sum of 512 such products (each with a variance of 1)?
# Answer: it should be around 512
y_variance = y.var()
print(f'Variance of y vector: {y_variance}')

# The standard deviation is the square root of the variance,
# so the standard deviation of y should be close to the square root of 512
y_std = y.std()
expected_std = math.sqrt(512)
print(f'Standard deviation of y vector: {y_std}.')
print(f'Should be close to square root of vector size: {expected_std}')

Variance of y vector: 496.4143371582031
Standard deviation of y vector: 22.280357360839844.
Should be close to square root of vector size: 22.627416997969522
