In [1]:
import numpy as np

In [2]:
# One-hot codes for the 4-token English vocabulary, in the order <SOS>, Let, to, go.
# Row i of the 4x4 identity matrix is the one-hot vector of token i.
word_SOS, word_Let, word_to, word_go = np.eye(4, dtype=int).tolist()

In [3]:
# English embedding table: row i holds the 2-dimensional embedding of token i.
word2vec = np.array([
    [1.16, -0.77],
    [-0.27, 0.82],
    [-2.19, 0.89],
    [3.5, -1.74],
])
# Transposed copy of the table (2 x 4).
word2vecT = word2vec.T

In [4]:
# Encoder input sentence as stacked one-hot rows, one row per token: SOS, Let, go.
sentence_matrix = np.vstack((word_SOS, word_Let, word_go))

# Encoder

### Step 1: Word embedding

In [5]:
# Multiplying one-hot rows by the embedding table picks out each token's embedding row.
result1_1 = sentence_matrix @ word2vec
result1_1.round(2)

array([[ 1.16, -0.77],
       [-0.27,  0.82],
       [ 3.5 , -1.74]])

### Step 2: Add position

In [6]:
# Positional encoding, one row per token position.
# The values equal (sin(pos), cos(pos)) for pos = 0, 1, 2, rounded to 2 decimals.
position_matrix = np.array([
    [0, 1],
    [0.84, 0.54],
    [0.91, -0.42],
])
# Inject position information into the embeddings by element-wise addition.
result1_2 = position_matrix + result1_1
result1_2.round(2)

array([[ 1.16,  0.23],
       [ 0.57,  1.36],
       [ 4.41, -2.16]])

### Step 3: Transformer

In [7]:
# Encoder self-attention projection weights (each 2 x 2).
# Query projection.
W_Q = np.array([[2.22, 0.41], [0.17, -0.51]])
# Key projection.
W_K = np.array([[-1.82, 0.57], [1.36, -0.38]])
# Value projection.
W_V = np.array([[-0.43, -0.59], [1.33, -2.15]])

In [8]:
# Project the position-aware embeddings into queries, keys and values.
result1_3_Q = result1_2 @ W_Q
result1_3_K = result1_2 @ W_K
result1_3_V = result1_2 @ W_V

In [9]:
# Scaled dot-product scores: every query against every key, divided by sqrt(2)
# (2 is the width of the key vectors produced by the 2 x 2 W_K).
result1_3_QK = (result1_3_Q @ result1_3_K.T) / np.sqrt(2)

In [10]:
def softmax(row):
    """Return the softmax of a 1-D sequence of scores.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the quotient into ``nan``) for large scores.

    Parameters
    ----------
    row : array_like
        Scores to normalise. ``-np.inf`` entries (masked positions) receive
        a weight of exactly 0.

    Returns
    -------
    numpy.ndarray
        Non-negative weights with the same shape as ``row``; they sum to 1
        provided at least one score is finite.
    """
    row = np.asarray(row)
    if row.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(row)
    # Shift so the largest exponent is exp(0) == 1, which cannot overflow.
    exp_row = np.exp(row - np.max(row))
    return exp_row / np.sum(exp_row)
# Turn each row of scores into attention weights (softmax applied row by row).
result1_3_QK_softmax = np.apply_along_axis(softmax, axis=1, arr=result1_3_QK)
result1_3_QK_softmax.round(2)

array([[0.01, 0.99, 0.  ],
       [0.05, 0.95, 0.  ],
       [0.  , 1.  , 0.  ]])

In [11]:
# Attention output: each row is a weighted sum of the value vectors.
result1_3 = result1_3_QK_softmax @ result1_3_V
result1_3.round(2)

array([[ 1.55, -3.24],
       [ 1.48, -3.16],
       [ 1.56, -3.26]])

### Step 4: Add residual connection

In [12]:
# Residual connection: add the attention input back onto the attention output.
result1_4 = result1_2 + result1_3
result1_4.round(2)

array([[ 2.71, -3.01],
       [ 2.05, -1.8 ],
       [ 5.97, -5.42]])

# Decoder

In [None]:
# One-hot codes for the 5-token Spanish vocabulary, in the order SOS, ir, vamos, y, EOS.
word_SOS2 = [1,0,0,0,0]
word_ir = [0,1,0,0,0]
word_vamos = [0,0,1,0,0]
word_y = [0,0,0,1,0]
word_EOS2 = [0,0,0,0,1]

# Spanish embedding table: row i holds the 2-dimensional embedding of token i.
word2vec_spanish = np.array(
    [[-2.53, -0.97], 
     [1.27,2.17],
     [0.71, 0.73],
     [0.58, 0.69],
     [-1.39, 0.76]]
)
# Transpose of the Spanish table itself (5 x 2 -> 2 x 5), not of the English `word2vec`.
word2vec_spanishT = np.transpose(word2vec_spanish)

In [15]:
# Decoder input as stacked one-hot rows, one row per token: SOS, vamos.
sentence_matrix_spanish = np.vstack((word_SOS2, word_vamos))

### Step 1: Word embedding

In [16]:
# One-hot rows times the Spanish embedding table select each token's embedding row.
result2_1 = sentence_matrix_spanish @ word2vec_spanish
result2_1.round(2)

array([[-2.53, -0.97],
       [ 0.71,  0.73]])

### Step 2: Add position

In [17]:
# Positional encoding for the two decoder positions.
# The values equal (sin(pos), cos(pos)) for pos = 0, 1, rounded to 2 decimals.
position_matrix2 = np.array([
    [0, 1],
    [0.84, 0.54],
])
# Inject position information into the decoder embeddings.
result2_2 = position_matrix2 + result2_1
result2_2.round(2)

array([[-2.53,  0.03],
       [ 1.55,  1.27]])

### Step 3: Transformer

In [30]:
# Decoder self-attention projection weights (each 2 x 2).
# Query projection.
W_Q2 = np.array([[-0.19, 0.24], [0.64, 1.47]])
# Key projection.
W_K2 = np.array([[-0.08, 0.38], [1.18, 0.67]])
# Value projection.
W_V2 = np.array([[1.26, 1.10], [-0.71, 0.05]])

In [42]:
# Project the decoder inputs into queries, keys and values.
result2_3_Q = result2_2 @ W_Q2
result2_3_K = result2_2 @ W_K2
result2_3_V = result2_2 @ W_V2

# Scaled dot-product scores, divided by sqrt(2) (the key vectors have 2 components).
result2_3_QK = (result2_3_Q @ result2_3_K.T) / np.sqrt(2)
result2_3_QK.round(2)

array([[ 0.46, -0.09],
       [-1.4 ,  2.78]])

In [43]:
## mask the -0.09 score (position 0 looking at position 1) with -inf so its softmax weight becomes 0, to avoid cheating
# Additive mask: 0 keeps a score, -inf removes it (row 0 may not see column 1).
result2_mask = np.array([
    [0, -np.inf],
    [0, 0],
])
result2_3_QK_masked = result2_mask + result2_3_QK
result2_3_QK_masked.round(2)

array([[ 0.46,  -inf],
       [-1.4 ,  2.78]])

In [44]:
# Row-wise softmax of the masked scores; entries masked with -inf get weight 0.
result2_3_QK_softmax = np.apply_along_axis(softmax, axis=1, arr=result2_3_QK_masked)
result2_3_QK_softmax.round(2)

array([[1.  , 0.  ],
       [0.01, 0.99]])

In [41]:
# Masked self-attention output: weighted sum of the decoder value vectors.
result2_3 = result2_3_QK_softmax @ result2_3_V
result2_3.round(2)

array([[-3.21, -2.78],
       [ 0.99,  1.7 ]])

### Step 4: Add residual connection

In [45]:
# Residual connection: add the decoder attention input back onto its output.
result2_4 = result2_2 + result2_3
result2_4.round(2)

array([[-5.74, -2.75],
       [ 2.54,  2.97]])

# Encoder-Decoder

In [47]:
# Encoder-decoder attention projection weights (each 2 x 2).
# Query projection.
W_Q3 = np.array([[0.9, 1.32], [1, 0.38]])
# Key projection.
W_K3 = np.array([[0.94, 1.28], [-0.7, -0.97]])
# Value projection.
W_V3 = np.array([[-1.03, 1.73], [1.11, -1.49]])

### Step 1: Transformer

In [54]:
# Queries come from the decoder output; keys and values come from the encoder output.
result3_3_Q = result2_4 @ W_Q3
result3_3_K = result1_4 @ W_K3
result3_3_V = result1_4 @ W_V3

# Scaled scores of every decoder position against every encoder position.
result3_3_QK = (result3_3_Q @ result3_3_K.T) / np.sqrt(2)
result3_3_QK.round(2)

array([[ -64.96,  -44.52, -131.34],
       [  37.5 ,   25.7 ,   75.82]])

In [55]:
# Row-wise softmax turns the cross-attention scores into weights.
result3_3_QK_softmax = np.apply_along_axis(softmax, axis=1, arr=result3_3_QK)
result3_3_QK_softmax.round(2)

array([[0., 1., 0.],
       [0., 0., 1.]])

In [56]:
# Cross-attention output: weighted sum of the encoder-derived value vectors.
result3_3 = result3_3_QK_softmax @ result3_3_V
result3_3.round(2)

array([[ -4.11,   6.23],
       [-12.17,  18.41]])

### Step 2: Add residual connection

In [59]:
# Residual connection: add the decoder state (the query source) to the cross-attention output.
result3_4 = result2_4 + result3_3
result3_4.round(2)

array([[-9.85,  3.48],
       [-9.63, 21.38]])

Put them in a neural network and expect the output:

word_vamos = [0,0,1,0,0]

word_EOS2 = [0,0,0,0,1]

# CNN

In [63]:
# Weights of the output layer: 2 input features -> 5 scores.
cnn_matrix = np.array([
    [0.44, -0.14, -1.73, 0.67, 0.3],
    [-0.37, -1.69, -0.53, -0.39, 1.8],
])
# Bias kept as a 1 x 5 row so it broadcasts over every input row.
cnn_bias = np.array([[-1.01, 0.21, 0.44, -1.42, 0.15]])

In [64]:
# Linear layer: one row of 5 scores per decoder position.
result4_1 = result3_4 @ cnn_matrix + cnn_bias
result4_1.round(2)

array([[ -6.63,  -4.3 ,  15.64,  -9.38,   3.46],
       [-13.16, -34.58,   5.77, -16.21,  35.75]])

In [65]:
# Row-wise softmax converts the 5 scores of each position into probabilities.
result4_1_softmax = np.apply_along_axis(softmax, axis=1, arr=result4_1)
result4_1_softmax.round(2)

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])