In [2]:
import numpy as np
import matplotlib.pyplot as plt

# 8-4. Initializers

To avoid vanishing or exploding gradients, we want the scale (variance) of each layer's activations to stay close to 1, neither much smaller nor much larger.

A careful choice of initialization method can help with this.

## Xavier Initialization

$$
W \sim U \left[ -\frac{\sqrt{6}}{\sqrt{n_j + n_{j+1}}} , \frac{\sqrt{6}}{\sqrt{n_j + n_{j+1}}} \right]
$$

Works well with tanh activation. Sometimes called Glorot initialization, after its first author, Xavier Glorot.

X. Glorot, and Y. Bengio. Understanding the difficulty of training deep feedforward neural networks. Proc. AISTATS, volume 9, pp. 249-256, 2010.

In [94]:
def xavier_init(n_in, n_out):
    """Sample an (n_in, n_out) weight matrix with Xavier/Glorot uniform init.

    Each entry is drawn uniformly from [-bound, bound), where
    bound = sqrt(6 / (n_in + n_out)).

    Parameters
    ----------
    n_in : int
        Number of input units (rows of the returned matrix).
    n_out : int
        Number of output units (columns of the returned matrix).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_in, n_out).
    """
    fan_sum = n_in + n_out
    bound = np.sqrt(6.0 / fan_sum)
    return np.random.uniform(-bound, bound, (n_in, n_out))

#### Not careful initialization

In [95]:
# Inputs: a 4 x 10 array of standard-normal draws.
x = np.random.randn(4, 10)
# Naive weights: standard-normal draws shrunk by a fixed factor of 0.01.
w = 0.01 * np.random.randn(10, 5)
# Pre-activation followed by the tanh non-linearity.
z = x.dot(w)
a = np.tanh(z)

In [96]:
# Display the tanh activations obtained with the 0.01-scaled random weights.
a

array([[-0.04507395,  0.01255039,  0.01269483, -0.01700625, -0.01022171],
       [ 0.01336227,  0.0051835 ,  0.03422804,  0.02580452, -0.01032969],
       [ 0.0144389 , -0.02422666,  0.03621563,  0.06837236, -0.01462751],
       [ 0.0003048 ,  0.01203136,  0.00232223, -0.02757834,  0.01105016]])

In [97]:
# Sample mean and variance of the inputs.
print(x.mean())
print(x.var())

-0.0014452073883
0.880298669146


In [98]:
# Sample mean and variance of the weights.
print(w.mean())
print(w.var())

-0.000543260635673
0.00011873470013


In [99]:
# Sample mean and variance of the activations.
print(a.mean())
print(a.var())

0.00497474219852
0.000621362876611


#### Xavier initialization

In [101]:
# Inputs: a 4 x 10 array of standard-normal draws.
x = np.random.randn(4, 10)
# Weights for a 10 -> 5 layer drawn with Xavier initialization.
w = xavier_init(n_in=10, n_out=5)
# Pre-activation followed by the tanh non-linearity.
z = x.dot(w)
a = np.tanh(z)

In [102]:
# Display the tanh activations obtained with Xavier-initialized weights.
a

array([[ 0.63471777,  0.98118756,  0.18944596, -0.8199876 , -0.45919988],
       [-0.99015396,  0.17578035,  0.47659541, -0.83468998,  0.99182153],
       [ 0.99635009,  0.87748178,  0.05048396,  0.00991787, -0.98682464],
       [ 0.82586428,  0.1167327 , -0.6691873 ,  0.88485741, -0.97895723]])

In [103]:
# Sample mean and variance of the inputs.
print(x.mean())
print(x.var())

0.236280798853
0.968990220414


In [104]:
# Sample mean and variance of the weights.
print(w.mean())
print(w.var())

0.01104697889
0.14875860035


In [105]:
# Sample mean and variance of the activations.
print(a.mean())
print(a.var())

0.0736118047532
0.535959863578


## He Initialization

$$
W \sim \mathcal{N}\left(0, \sigma^2\right), \quad \sigma = \sqrt{2 / n_j}
$$

K. He, X. Zhang, S. Ren, and J. Sun. Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. IEEE International Conference on Computer Vision (ICCV), 2015.

In [92]:
def he_init(n_in, n_out):
    """Sample an (n_in, n_out) weight matrix with He initialization.

    Each entry is drawn from a zero-mean normal distribution with standard
    deviation sqrt(2 / n_in).

    Parameters
    ----------
    n_in : int
        Number of input units (rows of the returned matrix).
    n_out : int
        Number of output units (columns of the returned matrix).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_in, n_out).
    """
    # Use a float literal so the ratio is always a true division; with an
    # integer numerator, interpreters that floor int/int would produce 0 and
    # therefore all-zero weights. This also matches the 6.0 in xavier_init.
    std = np.sqrt(2.0 / n_in)
    return np.random.randn(n_in, n_out) * std

#### Not careful initialization

In [153]:
# Inputs: a 4 x 10 array of standard-normal draws.
x = np.random.randn(4, 10)
# Naive weights: standard-normal draws shrunk by a fixed factor of 0.01.
w = 0.01 * np.random.randn(10, 5)
# Pre-activation followed by ReLU (element-wise max with 0).
z = x.dot(w)
a = np.maximum(0, z)

In [154]:
# Display the ReLU activations obtained with the 0.01-scaled random weights.
a

array([[ 0.0024203 ,  0.0085116 ,  0.02099162,  0.        ,  0.        ],
       [ 0.        ,  0.00639004,  0.        ,  0.00015616,  0.01880949],
       [ 0.        ,  0.00519537,  0.03535715,  0.        ,  0.05381484],
       [ 0.00416783,  0.02985453,  0.00887663,  0.        ,  0.        ]])

In [155]:
# Sample mean and variance of the inputs.
print(x.mean())
print(x.var())

0.0635551887776
1.11468786193


In [156]:
# Sample mean and variance of the weights.
print(w.mean())
print(w.var())

-0.0006936920337
8.03739166255e-05


In [157]:
# Sample mean and variance of the activations.
print(a.mean())
print(a.var())

0.00972727695541
0.000209091187891


#### He initialization

In [158]:
# Inputs: a 4 x 10 array of standard-normal draws.
x = np.random.randn(4, 10)
# Weights for a 10 -> 5 layer drawn with He initialization.
w = he_init(n_in=10, n_out=5)
# Pre-activation followed by ReLU (element-wise max with 0).
z = x.dot(w)
a = np.maximum(0, z)

In [159]:
# Display the ReLU activations obtained with He-initialized weights.
a

array([[ 2.83892649,  0.12540959,  1.8773216 ,  0.        ,  0.        ],
       [ 3.12595268,  0.        ,  2.02350429,  0.        ,  0.        ],
       [ 0.79231206,  0.83048718,  0.        ,  0.        ,  0.0616421 ],
       [ 0.        ,  0.        ,  0.66078818,  3.10196686,  0.76032443]])

In [160]:
# Sample mean and variance of the inputs.
print(x.mean())
print(x.var())

-0.0462626760197
1.281288575


In [161]:
# Sample mean and variance of the weights.
print(w.mean())
print(w.var())

0.0381517311581
0.227150939144


In [162]:
# Sample mean and variance of the activations.
print(a.mean())
print(a.var())

0.80993177273
1.21520637866
