In [1]:
import numpy as np

In [2]:
# Dimensions of the simulated LSTM output: (batch_size, timesteps, feature_dim).
batch_size = 32     # sequences per batch
timesteps = 60      # timesteps per sequence (the maximum sequence length)
feature_dim = 128   # length of the output vector h_t at each timestep
# Number of attention steps. The scores are reshaped to (-1, steps_dim) and the
# bias holds one entry per step, so this must always equal timesteps; derive it
# instead of repeating the literal so the two values cannot drift apart.
steps_dim = timesteps

In [3]:
# Attention weight vector W_a: one random integer weight in [0, 5) for each
# of the feature_dim features.
W = np.random.randint(0, 5, size=feature_dim)
W.shape

(128,)

\begin{align}
e_{t} & = \tanh(W_a*h_t + b_a) \\
\alpha_{t} & = \frac{\exp(e_t)}{\sum_{k=1}^{T} \exp(e_k)} \\
c & = \sum_{k=1}^{T} \alpha_k * h_k \\
\end{align}

#### Step 1

\begin{align}
e_{t} & = \tanh(W_a*h_t + b_a) \\
\end{align}

1. The bidirectional LSTM is configured with return_sequences=True, so it outputs a sequence: one output vector per timestep.
2. The maximum sequence length is set to 60, so the number of timesteps is 60.
3. Each timestep is a 128-dimensional vector.
4. The output at each timestep is a 128-dimensional vector.
5. With 60 timesteps, h for a single sequence has shape (60, 128).
6. For a batch of 32 sequences it is (32, 60, 128).
7. The output from the LSTM layer is therefore (batch_size, timesteps, feature_dim) ==> (32, 60, 128).

In [4]:
# Stand-in for the LSTM output h: random integers in [0, 5) with shape
# (batch_size, timesteps, feature_dim).
input_vector = np.random.randint(
    low=0,
    high=5,
    size=(batch_size, timesteps, feature_dim),
)
print("Shape of input vector:", input_vector.shape)
# Preview the first two sequences of the batch.
input_vector[:2]

Shape of input vector: (32, 60, 128)


array([[[3, 4, 3, ..., 4, 0, 1],
        [4, 3, 0, ..., 2, 3, 1],
        [0, 4, 2, ..., 2, 3, 1],
        ...,
        [4, 0, 1, ..., 3, 2, 1],
        [0, 0, 2, ..., 1, 4, 4],
        [0, 2, 3, ..., 3, 1, 4]],

       [[3, 2, 0, ..., 2, 2, 3],
        [2, 3, 0, ..., 0, 1, 1],
        [4, 4, 3, ..., 2, 3, 0],
        ...,
        [2, 4, 1, ..., 4, 2, 0],
        [2, 1, 4, ..., 4, 2, 2],
        [0, 3, 4, ..., 4, 3, 2]]])

In [5]:
# Flatten the batch and time axes together so that every row is one h_t
# (the output at timestep t): (batch_size, timesteps, feature_dim) becomes
# (batch_size * timesteps, feature_dim), i.e. 32 * 60 = 1920 rows.
batch_input_h_t = input_vector.reshape(-1, feature_dim)
batch_input_h_t.shape

(1920, 128)

In [6]:
# Turn the weight vector into a column: (feature_dim,) -> (feature_dim, 1),
# so it can be matrix-multiplied with the (N, feature_dim) matrix of h_t rows.
W_reshaped = W.reshape(feature_dim, 1)
W_reshaped.shape

(128, 1)

In [7]:
# Score every h_t against the weights (the W_a*h_t term):
# (1920, feature_dim) @ (feature_dim, 1) -> (1920, 1), one scalar per timestep.
eij = batch_input_h_t @ W_reshaped
eij.shape

(1920, 1)

In [8]:
# Restore the batch structure, one row of scores per sequence:
# (batch_size * timesteps, 1) -> (batch_size, steps_dim) ==> (32, 60)
eij = np.reshape(eij, (-1, steps_dim))
print(eij.shape)

(32, 60)


In [9]:
# Bias b_a: one random integer in [0, 5) for each of the steps_dim timesteps.
bias = np.random.randint(0, 5, size=steps_dim)
bias

array([0, 1, 4, 0, 1, 2, 0, 3, 1, 4, 3, 0, 4, 0, 1, 0, 3, 2, 4, 2, 0, 2,
       3, 1, 0, 2, 0, 3, 1, 0, 2, 2, 1, 3, 2, 3, 2, 3, 0, 1, 3, 1, 4, 3,
       0, 2, 4, 1, 0, 0, 3, 1, 0, 2, 3, 3, 3, 4, 1, 0])

In [10]:
# Add the bias to the scores. bias has shape (steps_dim,), so it broadcasts
# across the batch axis: the same per-timestep bias is added to every sequence.
eij = np.add(eij, bias)
print(eij.shape)

(32, 60)


In [11]:
# Squash the biased scores element-wise with tanh: e_t = tanh(W_a*h_t + b_a).
# tanh is applied per element, so the shape stays (batch_size, timesteps)
# and every score ends up in the range [-1, 1].
eij = np.tanh(eij)
print(eij.shape)

(32, 60)


#### Step 2

\begin{align}
\alpha_{t} & = \frac{\exp(e_t)}{\sum_{k=1}^{T} \exp(e_k)} \\
\end{align}

In [12]:
# Numerator of the softmax: exponentiate every score e_t element-wise.
# The shape is unchanged: (batch_size, timesteps).
a = np.exp(eij)
print(a.shape)

(32, 60)


In [13]:
# Per-sequence normaliser: total of the exponentials over the timestep axis.
# keepdims=True keeps the result as (batch_size, 1) so it broadcasts against a.
a_sum = a.sum(axis=1, keepdims=True)
print(a_sum.shape)

(32, 1)


In [14]:
# Normalise to attention weights alpha_t: each row of a is divided by its own
# sum, so the weights of every sequence add up to 1.
alpha_t = np.divide(a, a_sum)
print(alpha_t.shape)

(32, 60)


In [15]:
# Append a trailing axis of length 1, (batch_size, timesteps) ->
# (batch_size, timesteps, 1), so alpha_t can broadcast over the feature axis.
alpha_t = alpha_t[..., np.newaxis]
print(alpha_t.shape)

(32, 60, 1)


#### Step 3

\begin{align}
c & = \sum_{k=1}^{T} \alpha_k * h_k \\
\end{align}

In [16]:
# Weight each h_t by its attention weight alpha_t. alpha_t is
# (batch_size, timesteps, 1), so it broadcasts across all feature_dim features.
alpha_t_h_t = np.multiply(input_vector, alpha_t)
print(alpha_t_h_t.shape)

(32, 60, 128)


In [17]:
# Context vector c: add up the weighted h_t over the timestep axis,
# leaving one feature_dim vector per sequence -> (batch_size, feature_dim).
c = alpha_t_h_t.sum(axis=1)
print(c.shape)

(32, 128)
