<a href="https://colab.research.google.com/github/usm-cos-432/InClass/blob/master/Chapter5/Initialization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code sample from the article

This notebook contains the code used in the "How to initialize a neural net?" part of the article.

In [1]:
import torch
import torch.nn.functional as F
import math

Why it's important to initialize a neural net

In [2]:
# Input activations: 512 values drawn from a standard normal (mean 0, std 1)
x = torch.randn(512)

# One fully connected 512 -> 512 layer (512x512 weight matrix);
# the same layer is applied repeatedly below to stand in for every layer of the net
linear = torch.nn.Linear(in_features=512, out_features=512)

# Overwrite the layer's weights with unscaled draws from N(0, 1)
torch.nn.init.normal_(linear.weight, mean=0, std=1)

# Simulate the forward pass of a 50-layer ReLU network
for i in range(50):
    pre_activation = linear(x)
    x = F.relu(pre_activation)

    # Report the activation statistics on every 10th layer
    if not i % 10:
        print(f'Layer {i}: {x.mean()},{x.std()} ')

# With unscaled weights the activations blow up to inf and then nan.
# (Only a forward pass is run here, so these are exploding activations,
# not gradients.)
x.mean(), x.std()

Layer 0: 10.136242866516113,14.793567657470703 
Layer 10: 6122244669440.0,9260095242240.0 
Layer 20: 3.071439381426366e+24,4.396173860335744e+24 
Layer 30: inf,2.1618063320329143e+36 
Layer 40: nan,nan 


(tensor(nan, grad_fn=<MeanBackward0>), tensor(nan, grad_fn=<StdBackward0>))

In [3]:
# x is an input vector (random, with mean 0 and standard deviation 1)
x = torch.randn(512)

# linear is a fully connected layer with 512 inputs and 512 outputs (512x512 weight matrix)
linear = torch.nn.Linear(512, 512)
# Start from unscaled weights drawn from N(0, 1)
torch.nn.init.normal_(linear.weight, mean=0, std=1)

# Rescale the weights by sqrt(2 / fan_in), i.e. sqrt(2 / 512) for this layer.
# The scaling is done in place under no_grad, so the layer keeps its existing
# Parameter object, and the fan-in is read from the layer instead of being
# hard-coded a second time.
with torch.no_grad():
    linear.weight.mul_(math.sqrt(2 / linear.in_features))

# NOTE(review): this optimizer is never stepped in this cell; it is kept only
# in case a later cell relies on it - remove it if nothing does.
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01, momentum=0.9)


# Simulate the forward pass of a neural net with 50 layers
for i in range(50):
    x = F.relu(linear(x))

    # Check the stats at some layers
    if i % 10 == 0:
        print(f'Layer {i}: {x.mean()},{x.std()} ')

# With the rescaled weights the activations stay finite (no inf / nan),
# unlike the unscaled N(0, 1) initialization
x.mean(),x.std()

Layer 0: 0.5697970390319824,0.850481390953064 
Layer 10: 0.7701798677444458,1.2404462099075317 
Layer 20: 1.2086127996444702,1.9186478853225708 
Layer 30: 2.131618022918701,3.409432888031006 
Layer 40: 3.767212390899658,5.999336242675781 


(tensor(6.3043, grad_fn=<MeanBackward0>),
 tensor(10.0289, grad_fn=<StdBackward0>))

In [5]:
# torch.randn draws numbers from a standard normal distribution (mean 0, std 1)
inputSize = 512
x = torch.randn(inputSize)
W = torch.randn(inputSize, inputSize)

# Each element of the y vector is the sum of inputSize products
# (each product pairs one weight of W with one input of x; both are independent
# draws from the standard normal distribution)
y = x @ W

# What is the variance of a sum of inputSize such products (each with a variance of 1)?
# Answer: it should be around inputSize (512 here)
print(f'Variance of y vector: {y.var()}')
print(f'Mean of y vector: {y.mean()}')

# The standard deviation is the square root of the variance,
# so the standard deviation of y should be close to the square root of inputSize
print(f'Standard deviation of y vector: {y.std()}.')
# The reference value is derived from inputSize (not a literal 512) so it stays
# correct if the vector size is changed
print(f'Should be close to square root of vector size: {math.sqrt(inputSize)}')

Variance of y vector: 477.5995788574219
Mean of y vector: 0.07702547311782837
Standard deviation of y vector: 21.85405158996582.
Should be close to square root of vector size: 22.627416997969522
