# Init

In [96]:
import os
import math

import numpy as np
import fasttext.util

import nn
import utils

%load_ext autoreload
%autoreload 2

MODEL_DIM = 256
INNER_DIM = 1024 

NUM_HEADS = 8

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
embeddings = utils.get_embeddings(["en", "fr"], dim=MODEL_DIM) # using dim 256 instead of 512

en_emb = embeddings["en"]
fr_emb = embeddings["fr"]

print(en_emb.get_dimension())
print(fr_emb.get_dimension())



256
256


In [104]:
class TranslationDataset():
    def __init__(self, inputs, targets, embeddings):
        self.inputs = inputs 
        self.targets = targets 
        
        # Encoders for both languages
        en_i = embeddings["en"]
        en_t = embeddings["fr"]
        
        self.input_em = []
        for seq in inputs:
            self.input_em.append(np.array([en_i.get_word_vector(w) for w in seq.split()]))
            
        self.target_em = []
        for seq in targets:
            self.target_em.append(np.array([en_t.get_word_vector(w) for w in seq]))

    def __len__(self):
        return (len(self.sequence))

    def __getitem__(self, idx):
        return({
            "input":self.inputs[idx],
            "target":self.targets[idx],
            "input_embedding":self.input_em[idx],
            "target_embedding":self.target_em[idx],
        })
    
data = ["the cat likes oranges", "hello my friend"]
target = ["le chat aime les oranges", "bonjour mon amie"]

dataset = TranslationDataset(data, target, embeddings) 

In [105]:
def pos_encoding(seq):
    """Adds positional encoding to a sequence of word vectors"""
    seq_len = seq.shape[0]
    d_model = seq.shape[1]
    
    encoding = []
    for i in range(seq_len):
        w = 1 / (10000 ** ((2 * i) / d_model))

        wi_s = [math.sin(p * w) * (i % 2) for p in range(d_model)]
        wi_c = [math.cos(p * w) * ((i + 1) % 2) for p in range(d_model)]
        
        encoding.append(np.add(wi_s, wi_c))
        
    encoding = np.array(encoding)
    
    return np.add(encoding, seq)

In [207]:
class AttentionHead():
    """Scaled dot product attention head. """
    def __init__(self, embed_dim, n_heads):
        self.embed_dim = embed_dim
        self.n_heads = n_heads 
        self.output_dim = embed_dim // n_heads
        
        self.V = nn.LinearLayer(embed_dim, self.output_dim)
        self.K = nn.LinearLayer(embed_dim, self.output_dim)
        self.Q = nn.LinearLayer(embed_dim, self.output_dim)
    
    def __call__(self, x, mask=None):
        """Attention forward pass"""
        d_k = self.embed_dim // self.output_dim
        scale = math.sqrt(d_k)
        
        scaled_dp = np.dot(self.Q(x.T), self.K(x.T).T) / scale
        
        if mask is not None:
            scaled_dp = np.add(mask, scaled_dp)
            
        return(np.matmul(nn.softmax(scaled_dp), self.V(x.T)))
        
        
class MultiheadAttention():
    """Multiheaded attention transformer block"""
    def __init__(self, embed_dim=MODEL_DIM, n_heads=NUM_HEADS, masked=False):
        self.heads = [AttentionHead(embed_dim, n_heads) for _ in range(n_heads)]
        self.O = nn.LinearLayer(embed_dim, embed_dim)
        
        self.attn_dim = embed_dim // n_heads
        self.masked = masked
    
    def __call__(self, x):
        if self.masked:
            mask = np.triu(np.ones((self.attn_dim, self.attn_dim)) * -np.inf, k=1)
            
        head_sum = np.concatenate(np.array([h(x, mask) for h in self.heads]))
        
        return(self.O(head_sum))

x = dataset[0]["input_embedding"]

multihead_attn = MultiheadAttention(masked=True)

print(multihead_attn(x))

[[-5.73479643e-03 -2.81575693e-02  6.38276504e-02  1.12585503e-02]
 [-7.80738386e-03  1.33358979e-01  2.14657868e-02 -2.85409773e-02]
 [ 8.26628697e-03  3.91261445e-02 -1.40601615e-02  1.20711907e-02]
 ...
 [-6.68998012e-02 -1.54699273e-02 -4.40439950e-02  8.19583891e-05]
 [-5.27294279e-02  5.81298962e-02 -3.27853882e-02  2.88939011e-03]
 [-3.31982013e-02  4.41935133e-02 -7.06124802e-02  1.82914396e-02]]


In [178]:
class FFN(nn.Net):
    """Position-wise feed forward nueral network"""
    def __init__(self, embed_dim=MODEL_DIM, inner_dim=INNER_DIM):
        self.L1 = nn.LinearLayer(embed_dim, inner_dim)
        self.L2 = nn.LinearLayer(inner_dim, embed_dim)
    
    def __call__(self, x):
        """ReLU(xW1 + b1)W2 + b2"""
        x = self.L2(np.maximum(self.L1(x), 0))
        
        return x

In [177]:
#TODO implement layer norm
def layer_norm(layer):
    std = np.std(layer)
    mean = np.mean(layer)
    print(std, mean)
    print(layer)
        
x = dataset[0]["input_embedding"]

multihead_attn = MultiheadAttention()
ffn = FFN()
x = multihead_attn(x)

x = ffn(x)

layer_norm(x)



UnboundLocalError: local variable 'mask' referenced before assignment

In [None]:

class EncoderBlock():
    def __init__(self):
        
    def __call__(self, x):