In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import math

Read the data in from the text file (`input.txt`).

In [2]:
# Load the corpus and normalise it in a single pass: trim surrounding
# whitespace (this also removes the newline), skip lines that end up empty,
# and lower-case what remains to keep the vocabulary small.
with open('input.txt', 'r') as file:
    lines = [
        text.lower()
        for text in (raw.strip() for raw in file)
        if text != ""
    ]

In [3]:
# Concatenate every cleaned line into one long string, separated by spaces.
data = ' '.join(lines)

# The vocabulary is the set of distinct characters, kept in sorted order so
# the ordering is deterministic.
chars = list(set(data))
chars.sort()

# Total number of characters in the corpus.
print(len(data))


1108166


In [4]:
# character -> integer id; ids follow the position of each character in `chars`.
tokenize = dict(zip(chars, range(len(chars))))
# integer id -> character (the inverse lookup of `tokenize`).
detokenize = {index: char for char, index in tokenize.items()}

In [5]:
# Encode the whole corpus as a list of integer ids, one per character.
tokens = [tokenize[char] for char in data]

In [6]:
# Width of each embedding vector.
embedding_dim = 50
# One embedding row per entry in the tokenizer's vocabulary.
num_embeddings = len(tokenize)

In [7]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding that is added to its input.

    A fixed ``(max_len, d_model)`` table is built once at construction time.
    Even columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)``, where ``w_i = exp(-2i * ln(10000) / d_model)``.
    The table is registered as the non-trainable buffer ``penc`` so that it
    follows the module across ``.to(device)`` calls and appears in
    ``state_dict()``.

    Args:
        d_model: size of the last dimension of the tensors passed to
            ``forward``. Odd values are supported.
        max_len: longest sequence length (dimension 1 of the input) that
            ``forward`` accepts.
    """

    def __init__(self, d_model, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len

        table = torch.zeros(max_len, d_model)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * div_term  # (max_len, ceil(d_model / 2))

        table[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even columns,
        # so the angle matrix is trimmed to the number of odd columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer: saved with the module and moved with it, but
        # never updated by an optimizer.
        self.register_buffer('penc', table)

    @property
    def pos_encoding(self):
        """The encoding table; kept as an alias of the ``penc`` buffer."""
        return self.penc

    def forward(self, x):
        """Return ``x`` with the positional encoding added.

        Args:
            x: tensor whose dimension 1 is the sequence length and whose last
                dimension is ``d_model`` (e.g. ``(batch, seq_len, d_model)``).

        Returns:
            A tensor of the same shape as ``x``.

        Raises:
            ValueError: if the sequence length exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        # unsqueeze(0) adds a leading axis so the table broadcasts over batch.
        return x + self.penc[:seq_len, :].unsqueeze(0)



In [72]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute ``softmax(q @ k^T / sqrt(depth)) @ v``.

    Args:
        q: query tensor; its last dimension must match the last dimension of
            ``k``.
        k: key tensor; the size of its last dimension is the ``depth`` used
            for scaling.
        v: value tensor; its second-to-last dimension must match that of
            ``k``.
        mask: optional tensor broadcastable to the shape of the attention
            logits (``q @ k^T``). A boolean mask marks positions that may be
            attended to with ``True``; ``False`` positions receive zero
            weight. Any other dtype is added to the logits before the
            softmax. ``None`` (the default) applies no masking.
            Note: a row that is entirely masked out yields NaN weights.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax over the last dimension of the (masked) logits and
        ``output`` is ``attention_weights @ v``.
    """
    depth = k.size(-1)
    # Scale by sqrt(depth) so the logits' magnitude does not grow with depth.
    logits = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(depth)

    if mask is not None:
        if mask.dtype == torch.bool:
            # -inf logits become exactly zero after the softmax.
            logits = logits.masked_fill(~mask, float('-inf'))
        else:
            logits = logits + mask

    attention_weights = torch.nn.functional.softmax(logits, dim=-1)
    output = torch.matmul(attention_weights, v)

    return output, attention_weights


class MHA(nn.Module):
    """Multi-head self-attention layer.

    The input is projected to queries, keys and values with three separate
    linear layers, split into ``n_heads`` equal slices along the feature
    dimension, attended per head, merged back to ``d_model`` features and
    passed through a final linear layer.

    Args:
        d_model: feature size of the input and output.
        n_heads: number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.n_heads = n_heads
        self.d_model = d_model

        # Projections are created in a fixed order: query, key, value, output.
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.final_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, n_heads, seq_len, d_model // n_heads)``."""
        per_head = self.d_model // self.n_heads
        heads_last = x.view(batch_size, -1, self.n_heads, per_head)
        # Swap the sequence and head axes so each head sees a full sequence.
        return heads_last.transpose(1, 2)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Returns:
            A tuple ``(output, attention_weights)``: the result of the final
            linear layer with ``d_model`` features, and the per-head weights
            returned by ``scaled_dot_product_attention``.
        """
        batch_size = x.shape[0]

        q, k, v = (
            self.split_heads(projection(x), batch_size)
            for projection in (self.wq, self.wk, self.wv)
        )

        per_head_out, attention_weights = scaled_dot_product_attention(q, k, v)

        # Move the head axis back next to the features, then flatten the
        # heads into a single d_model-sized feature dimension.
        merged = per_head_out.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.d_model)

        return self.final_linear(merged), attention_weights


In [83]:
class Model(nn.Module):
    """Token embedding followed by positional encoding and self-attention.

    Args:
        vocab_size: number of distinct token ids; sets the number of rows in
            the embedding table.
        embedding_dim: width of each embedding vector and of the attention
            layer.
        n_heads: number of attention heads; must divide ``embedding_dim``.
        max_len: ``max_len`` passed to the positional encoding.
    """

    def __init__(self, vocab_size, embedding_dim, n_heads=5, max_len=50):
        super(Model, self).__init__()
        # Size the table from the constructor argument rather than from a
        # notebook-level global, so the class works with any vocabulary.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.pos_enc = PositionalEncoding(d_model=embedding_dim, max_len=max_len)
        self.mha = MHA(d_model=embedding_dim, n_heads=n_heads)

    def forward(self, x):
        """Run the model on a tensor of token ids.

        Args:
            x: integer tensor of token ids, each in ``[0, vocab_size)``.

        Returns:
            The output of the attention layer; the attention weights are
            discarded.
        """
        x = self.embedding(x)
        # NOTE(review): this assumes `pos_enc` returns the embeddings with
        # positional information applied, not the bare encoding table — confirm.
        x = self.pos_enc(x)
        out, attn_weights = self.mha(x)
        return out
        
        

In [84]:
# Build the model for the character vocabulary and embedding width set above.
model = Model(vocab_size=num_embeddings, embedding_dim=embedding_dim)

In [86]:
# Smoke test: one batch of 7 random token ids drawn from [0, 9), pushed
# through the model to check the output shape.
x = torch.randint(low=0, high=9, size=(1, 7))
o = model(x)
o.shape

torch.Size([1, 7, 50])