# Init

In [96]:
import os
import math

import numpy as np
import fasttext.util

import nn
import utils

%load_ext autoreload
%autoreload 2

MODEL_DIM = 256
INNER_DIM = 1024 

NUM_HEADS = 8

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
embeddings = utils.get_embeddings(["en", "fr"], dim=MODEL_DIM) # using dim 256 instead of 512

en_emb = embeddings["en"]
fr_emb = embeddings["fr"]

print(en_emb.get_dimension())
print(fr_emb.get_dimension())



256
256


In [218]:
class TranslationDataset():
    """Dataset for the position encoded and word embedded translations"""
    def __init__(self, inputs, targets, embeddings):
        self.inputs = inputs 
        self.targets = targets 
        
        # Encoders for both languages
        emb_in = embeddings["en"]
        emb_tgt = embeddings["fr"]
        
        self.input_em = []
        for seq in inputs:
            self.input_em.append(np.array([emb_in.get_word_vector(w) for w in seq.split()]))
            
        self.target_em = []
        for seq in targets:
            self.target_em.append(np.array([emb_tgt.get_word_vector(w) for w in seq]))

    def __len__(self):
        return (len(self.sequence))

    def __getitem__(self, idx):
        return({
            "input":self.inputs[idx],
            "target":self.targets[idx],
            "input_embedding":self.input_em[idx],
            "target_embedding":self.target_em[idx],
        })
    
data = ["the cat likes oranges", "hello my friend"]
target = ["le chat aime les oranges", "bonjour mon amie"]

dataset = TranslationDataset(data, target, embeddings) 

In [105]:
def pos_encoding(seq):
    """Adds positional encoding to a sequence of word vectors"""
    seq_len = seq.shape[0]
    d_model = seq.shape[1]
    
    encoding = []
    for i in range(seq_len):
        w = 1 / (10000 ** ((2 * i) / d_model))

        wi_s = [math.sin(p * w) * (i % 2) for p in range(d_model)]
        wi_c = [math.cos(p * w) * ((i + 1) % 2) for p in range(d_model)]
        
        encoding.append(np.add(wi_s, wi_c))
        
    encoding = np.array(encoding)
    
    return np.add(encoding, seq)

In [232]:
class AttentionHead():
    """Scaled dot product attention head. """
    def __init__(self, embed_dim, n_heads):
        self.embed_dim = embed_dim
        self.n_heads = n_heads 
        self.output_dim = embed_dim // n_heads
        
        self.V = nn.LinearLayer(embed_dim, self.output_dim)
        self.K = nn.LinearLayer(embed_dim, self.output_dim)
        self.Q = nn.LinearLayer(embed_dim, self.output_dim)
    
    def __call__(self, x, mask=None):
        """Attention forward pass"""
        d_k = self.embed_dim // self.output_dim
        scale = math.sqrt(d_k)
        
        scaled_dp = np.dot(self.Q(x.T), self.K(x.T).T) / scale
        
        if mask is not None:
            scaled_dp = np.add(mask, scaled_dp)
            
        return(np.matmul(nn.softmax(scaled_dp), self.V(x.T)))
        
        
class MultiheadAttention():
    """Multiheaded attention transformer block"""
    def __init__(self, embed_dim=MODEL_DIM, n_heads=NUM_HEADS, masked=False):
        self.heads = [AttentionHead(embed_dim, n_heads) for _ in range(n_heads)]
        self.O = nn.LinearLayer(embed_dim, embed_dim)
        
        self.attn_dim = embed_dim // n_heads
        
        # If the attention block is masked
        self.mask = None
        if masked:
            self.mask = np.ones((self.attn_dim, self.attn_dim)) * -np.inf
            self.mask = np.triu(self.mask, k=1)
    
    def __call__(self, x):
        h_cat = np.concatenate(np.array([h(x, self.mask) for h in self.heads]))
        
        return(self.O(h_cat))

In [252]:
class FFN(nn.Net):
    """Position-wise feed forward nueral network"""
    def __init__(self, embed_dim=MODEL_DIM, inner_dim=INNER_DIM):
        self.L1 = nn.LinearLayer(embed_dim, inner_dim)
        self.L2 = nn.LinearLayer(inner_dim, embed_dim)
        
        self.layers = [
            self.L1,
            self.L2,
        ]
    
    def __call__(self, x):
        """ReLU(xW1 + b1)W2 + b2"""
        x = self.L2(np.maximum(self.L1(x), 0))
        
        return x

In [221]:
#TODO implement layer norm
def layer_norm(layer):
    std = np.std(layer)
    mean = np.mean(layer)
    print(std, mean)
    print(layer)


0.04877990447589142 0.002209595803151198
[[ 0.03843306 -0.0500214   0.00269599 -0.02407348]
 [ 0.01275713  0.02781592 -0.00545702 -0.00700161]
 [ 0.04701671  0.05677232  0.04480049  0.07687742]
 ...
 [ 0.01060016  0.22381933  0.0216572   0.02098005]
 [-0.00321117 -0.0919823   0.00117171 -0.02113249]
 [ 0.01688006 -0.03274229  0.0186762   0.00636883]]


In [259]:
class EncoderBlock():
    def __init__(self, embed_dim=MODEL_DIM, n_heads=NUM_HEADS, inner_dim=INNER_DIM):
        self.embed_dim = embed_dim
        self.n_heads = n_heads 
        
        self.multihead_attn = MultiheadAttention(embed_dim, n_heads)
        self.feedforward = FFN(embed_dim, inner_dim)
    
    def temp_add_norm(self, x, y):
        # TODO replace with layer norm function
        x = np.add(x, y) 
        return (x / np.sqrt(np.sum(x**2))).T
        
    def __call__(self, x):
        y = self.multihead_attn(x)
        x = self.temp_add_norm(x, y.T)
        
        y = self.feedforward(x)
        x = self.temp_add_norm(x, y)
        
        return x
    
#x = dataset[0]["input_embedding"]
#enc_block = EncoderBlock()
#print(x)
#print(enc_block(x))

x = dataset[0]["input_embedding"]
t = dataset[0]["target_embedding"]

multihead_attn = MultiheadAttention()
ffn = FFN()
x = multihead_attn(x)

x = ffn(x)
x = ffn.backprop(x, t)

layer_norm(x)
enc_block = EncoderBlock()

[[ 0.10709047 -0.5281988   0.01931942 ...  0.0100735   0.0801144
  -0.14083575]
 [-0.6774422  -0.5310623   0.30984998 ...  0.35692137  0.05885062
  -0.02483021]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-1.1296873  -0.97862786  0.44244942 ...  0.15799554  0.14931174
  -0.18530309]
 [-0.6774422  -0.5310623   0.30984998 ...  0.35692137  0.05885062
  -0.02483021]
 [-0.19802126 -0.5308491   0.63903207 ... -0.0025325  -0.23603661
  -0.27881238]] [[ 0.06817749  0.21359808  0.02391365  0.07271023]
 [-0.00742031 -0.02742875 -0.01343396 -0.00785637]
 [ 0.02424399  0.14561284  0.05859317  0.04512634]
 ...
 [ 0.01618782 -0.03645     0.02920059 -0.02640793]
 [ 0.02562935 -0.07878374 -0.00713749  0.00048515]
 [ 0.00139517  0.01255289 -0.00135322 -0.01087015]]


ValueError: operands could not be broadcast together with shapes (1024,) (1024,4) 