In [266]:
import torch
from tqdm import tqdm

In [99]:
raw_names = open("names.txt").read().split("\n")

In [122]:
names = [list(n) for n in raw_names]
names[:5]

[['e', 'm', 'm', 'a'],
 ['o', 'l', 'i', 'v', 'i', 'a'],
 ['a', 'v', 'a'],
 ['i', 's', 'a', 'b', 'e', 'l', 'l', 'a'],
 ['s', 'o', 'p', 'h', 'i', 'a']]

In [123]:

vocab_size = len(vocab)

ch_to_i = {ch:idx for (idx, ch) in enumerate(vocab)}
i_to_ch = {idx:ch for (idx, ch) in enumerate(vocab)}

In [124]:
# char embeddings
d_model = torch.tensor(16)
embeddings = torch.randn((vocab_size, d_model), dtype=torch.float)

In [125]:
# positional encoding

def get_positional_encoding(pos, model_dims):
    v_dims = torch.arange(0, (model_dims+1)/2, step=1, dtype=torch.float)
    
    def get_pe(v):
        exponent = (2*v)/model_dims
        return pos/torch.pow(10000,exponent)
    
    pre_sinusoid = get_pe(v_dims)
    pe_even = pre_sinusoid.sin()
    pe_odd = pre_sinusoid.cos() 
    
    pe = torch.stack([pe_even, pe_odd])
    
    return pe.mT.reshape(-1)[:model_dims]

In [126]:
def encode_name(name):
    one_hot = torch.nn.functional.one_hot(torch.tensor([ch_to_i[ch] for ch in name]), vocab_size).float()
    
    embedded_chars = one_hot @ embeddings
    
    encoded_name = torch.stack([emb + get_positional_encoding(pos, d_model) for (pos, emb) in enumerate(embedded_chars)])
    
    return encoded_name

In [209]:
# single head attention
def single_head_attention(q_in, k_in, v_in, q_l, k_l, v_l, scale_dims, masked):
    # linear layers
    Q = q_in @ q_l
    K = k_in @ k_l
    V = v_in @ v_l
    
    # scaled dot-product attention
    mat_mul = Q @ K.T
    if(masked):
        mask = torch.tril(torch.ones(mat_mul.shape))
        mat_mul = torch.where(mask == 0, float("-inf"), mat_mul)
    scale = mat_mul/torch.sqrt(scale_dims)
    softmax = torch.nn.functional.softmax(scale, dim=None)
    return softmax @ V

In [238]:
def multi_head_attention(q_in, k_in, v_in, n_heads, params, masked=False):
    out_params = params[0]
    qkv_param_groups = params[1:]

    if(len(qkv_param_groups) != n_heads):
        raise Exception(f"Params doesn't match num of heads. Expected: {n_heads}. Found: {len(qkv_param_groups)}")
        
    head_dims = d_model / n_heads;
    
    head_results = torch.tensor([])
    for head_idx in range(n_heads):
        [q_l, k_l, v_l] = qkv_param_groups[head_idx]        
        head_res = single_head_attention(q_in, k_in, v_in, q_l, k_l, v_l, head_dims, masked)
        
        head_results = torch.cat([head_results, head_res], dim=1)
        
    # final linear layer
    return head_results @ out_params

In [211]:
def generate_qkv_mats(model_dims, head_dims):
    model_dims = model_dims.int()
    head_dims = head_dims.int()
    return [
        torch.randn((model_dims, head_dims)),
        torch.randn((model_dims, head_dims)),
        torch.randn((model_dims, head_dims)),
    ]

In [271]:
def layer_norm(inp, scale, bias):
    mean = inp.mean(dim=1).unsqueeze(dim=1)
    std = inp.std(dim=1).unsqueeze(dim=1)
    
    norm = (inp - mean)/std
    
    return (norm * scale) + bias

def transformer_encoder_single(d_model, n_heads, encoded_input, params):
    # separate params
    ff_params = params[:4]
    ln_params = params[4: 8]
    head_params = params[8:]
    
    [ln_s1, ln_b1, ln_s2, ln_b2] = ln_params

    # multi-head attention
    mha = multi_head_attention(encoded_input, encoded_input, encoded_input, n_heads, head_params, False)
    
    # residual connection
    mha_res = layer_norm(mha + encoded_input, ln_s1, ln_b1)
    
    # feed-forward
    [W1, b1, W2, b2] = ff_params
    ff = (((mha_res @ W1) + b1).relu() @ W2) + b2
    
    # residual connection and norm
    return layer_norm(ff + mha_res, ln_s2, ln_b2)

In [221]:
# output becomes values (V) and keys (K) to be queried (Q) by decoder input.

def transformer_encoder_stack(d_model, n_encoders, n_heads, encoded_input, param_groups):
    if(len(param_groups) != n_encoders):
        raise Exception("n_encoders not compatible with params")
        
    ff_h_size = d_model*4
    
    # feed through stack in sequence
    encoder_outputs = []
    res = encoded_input
    for param_group in param_groups:
        res = transformer_encoder_single(d_model, n_heads, res, param_group)
        encoder_outputs += [res]
    return encoder_outputs

In [259]:
def transformer_decoder_single(d_model, n_masked_heads, n_heads, encoded_input, encoder_output, params):
    # separate params
    
    # take masked head params
    masked_head_params = params[:n_masked_heads+1]
    params = params[n_masked_heads+1:]
    
    # take first layer norm params
    [ln_1_scale, ln_1_bias] = params[:2]
    params = params[2:]
    
    # take head params
    head_params = params[:n_heads+1]
    params = params[n_heads+1:]
    
    # take second layer norm params
    [ln_2_scale, ln_2_bias] = params[:2]
    params = params[2:]
    
    # feed-forward params
    [W1, b1, W2, b2] = params[:4]
    params = params[4:]
    
    # take third layer norm params
    [ln_3_scale, ln_3_bias] = params[:2]
    params = params[2:]
    
    if(len(params) != 0):
        raise Exception(f"Params mismatch, {len(params)} left")
    
    # masked multi-head
    masked_mha = multi_head_attention(encoded_input, encoded_input, encoded_input, n_masked_heads, masked_head_params, True)
    # residual connection
    masked_mha_res = masked_mha + encoded_input
    # layer norm
    masked_mha_out = layer_norm(masked_mha_res, ln_1_scale, ln_1_bias)
    
    # multi-head (Q - masked_mha_out, K and V - corresponding encoder output)
    mha = multi_head_attention(masked_mha_out, encoder_output, encoder_output, n_heads, head_params, False)
    # residual connection
    mha_res = mha + masked_mha_out
    # layer norm
    mha_out = layer_norm(mha_res, ln_2_scale, ln_2_bias)
    
    # feed-forward
    ff = (((mha_out @ W1) + b1).relu() @ W2) + b2
    # residual connection
    ff_res = ff + mha_out
    # layer norm
    return layer_norm(ff_res, ln_3_scale, ln_3_bias)
    

In [270]:
encoding = encode_name(names[3])

ff_h_size = d_model*4

encoder_params = [
    [
        # feed-foward params
        # W1
        torch.randn((d_model, ff_h_size)),
        # B1
        torch.randn((ff_h_size)),
        # W2
        torch.randn((ff_h_size, d_model)),
        # B4
        torch.randn((d_model)),
        # layer norm params
        torch.randn((d_model)),
        torch.randn((d_model)),
        torch.randn((d_model)),
        torch.randn((d_model)),
        # multi-head attention params
        torch.randn((d_model, d_model)),
        generate_qkv_mats(d_model, d_model/4),
        generate_qkv_mats(d_model, d_model/4),
        generate_qkv_mats(d_model, d_model/4),
        generate_qkv_mats(d_model, d_model/4)
    ],
]

decoder_params = [
    # masked_head_params
    torch.randn((d_model, d_model)),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    # layer norm params
    torch.randn((d_model)),
    torch.randn((d_model)),
    # head params
    torch.randn((d_model, d_model)),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    generate_qkv_mats(d_model, d_model/4),
    # layer norm params
    torch.randn((d_model)),
    torch.randn((d_model)),
    # feed-foward params
    # W1
    torch.randn((d_model, ff_h_size)),
    # B1
    torch.randn((ff_h_size)),
    # W2
    torch.randn((ff_h_size, d_model)),
    # B4
    torch.randn((d_model)),
    # layer norm params
    torch.randn((d_model)),
    torch.randn((d_model)),
]

encoder_result = transformer_encoder_stack(d_model, 1, 4, encoding, encoder_params)
decoder_result = transformer_decoder_single(d_model, 4, 4, encoding, encoder_result[0], decoder_params)

  softmax = torch.nn.functional.softmax(scale, dim=None)
