In [70]:
## Init

In [71]:
import os
import math

import numpy as np
import fasttext.util

import nn
import utils

%load_ext autoreload
%autoreload 2

MODEL_DIM = 256
NUM_HEADS = 8

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
embeddings = utils.get_embeddings(["en", "fr"], dim=MODEL_DIM) # using dim 256 instead of 512

en_emb = embeddings["en"]
fr_emb = embeddings["fr"]

print(en_emb.get_dimension())
print(fr_emb.get_dimension())



256
256


In [4]:
class TranslationDataset():
    def __init__(self, inputs, targets, embeddings):
        self.inputs = inputs 
        self.targets = targets 
        
        # Encoders for both languages
        en_i = embeddings["en"]
        en_t = embeddings["fr"]
        
        self.input_em = []
        for seq in inputs:
            self.input_em.append(np.array([en_i.get_word_vector(w) for w in seq.split()]))
            
        self.target_em = []
        for seq in targets:
            self.target_em.append(np.array([en_t.get_word_vector(w) for w in seq]))

    def __len__(self):
        return (len(self.sequence))

    def __getitem__(self, idx):
        return({
            "input":self.inputs[idx],
            "target":self.targets[idx],
            "input_embedding":self.input_em[idx],
            "target_embedding":self.target_em[idx],
        })
    
data = ["the cat likes oranges", "hello my friend"]
target = ["le chat aime les oranges", "bonjour mon amie"]

dataset = TranslationDataset(data, target, embeddings) 

print(dataset[0])

{'input': 'the cat likes oranges', 'target': 'le chat aime les oranges', 'input_embedding': array([[ 0.16699061, -0.1185919 ,  0.02268532, ...,  0.05452403,
         0.00290791, -0.0578087 ],
       [ 0.01792765, -0.1697452 , -0.2524293 , ...,  0.02038066,
        -0.03436632,  0.03158564],
       [ 0.1284513 , -0.01115857, -0.10007418, ...,  0.0344702 ,
        -0.01029698,  0.02521799],
       [-0.0449486 , -0.11437774, -0.08717595, ...,  0.00913693,
        -0.08598089, -0.07585137]], dtype=float32), 'target_embedding': array([[ 0.10709047, -0.5281988 ,  0.01931942, ...,  0.0100735 ,
         0.0801144 , -0.14083575],
       [-0.6774422 , -0.5310623 ,  0.30984998, ...,  0.35692137,
         0.05885062, -0.02483021],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.1296873 , -0.97862786,  0.44244942, ...,  0.15799554,
         0.14931174, -0.18530309],
       [-0.6774422 , -0.5310623 ,  0.30984998, ...,  0.3569

In [5]:
def pos_encoding(seq):
    """Adds positional encoding to a sequence of word vectors"""
    seq_len = seq.shape[0]
    d_model = seq.shape[1]
    
    encoding = []
    for i in range(seq_len):
        w = 1 / (10000 ** ((2 * i) / d_model))

        wi_s = [math.sin(p * w) * (i % 2) for p in range(d_model)]
        wi_c = [math.cos(p * w) * ((i + 1) % 2) for p in range(d_model)]
        
        encoding.append(np.add(wi_s, wi_c))
        
    encoding = np.array(encoding)
    
    return np.add(encoding, seq)
    

print(dataset[0]["input"])

pos_encoding(dataset[0]["input_embedding"])

the cat likes oranges


array([[ 1.16699061,  0.42171041, -0.39346152, ..., -0.04709166,
        -0.88911059, -0.92011231],
       [ 0.01792765,  0.63221659,  0.70571507, ...,  0.20405458,
        -0.71296226, -0.96284121],
       [ 1.1284513 ,  0.6367473 , -0.26051014, ...,  0.71485267,
         0.98875911,  0.63942415],
       [-0.0449486 ,  0.60703638,  0.91198825, ...,  0.32887116,
        -0.54810881, -1.03563637]])

In [None]:
class AttentionHead():
    """Scaled dot product attention head. """
    def __init__(self, embed_dim, n_heads):
        self.embed_dim = embed_dim
        self.n_heads = n_heads 
        self.output_dim = embed_dim // n_heads
        
        self.V = nn.LinearLayer(embed_dim, self.output_dim)
        self.K = nn.LinearLayer(embed_dim, self.output_dim)
        self.Q = nn.LinearLayer(embed_dim, self.output_dim)
    
    def __call__(self, x):
        """Attention forward pass"""
        d_k = self.embed_dim // self.output_dim
        scale = math.sqrt(d_k)
        
        scaled_dp = np.dot(self.Q(x), self.K(x).T) / scale
        
        return(np.matmul(nn.softmax(scaled_dp), self.V(x)))
        
        
class MultiheadAttention():
    """Multiheaded attention transformer block"""
    def __init__(self, embed_dim=MODEL_DIM, n_heads=NUM_HEADS, mask=None):
        self.heads = [AttentionHead(embed_dim, n_heads) for _ in range(n_heads)]
        self.O = nn.LinearLayer(embed_dim, embed_dim)
    
    def __call__(self, x):
        head_sum = np.concatenate(np.array([h(x) for h in self.heads]))
        
        return(self.O(head_sum.T))
            
        
x = dataset[0]["input_embedding"]

multihead_attn = MultiheadAttention()
multihead_attn(x)

In [None]:
class FFN():
    
