In [1]:
import math
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
import re
from torch.nn import functional as F


In [2]:
# fraction of the encoded corpus used for training; the remainder is held out for validation
split_ratio = 0.9

# model params
batch_size = 50 # b: number of sequences per batch (to be changed)
sequence_l = 128 # n: context length, in tokens (characters)
d_model = 768 # d_model, embedding dim
num_layer = 12 # number of blocks stacked
number_head = 8 # number of heads in multi-head attention
d_ff = 2048 # feedforward dimension; NOTE(review): FeedForward sizes its hidden layer as 4*d_model and does not read this value - confirm intent
dropout = 0.2 # dropout probability applied to the embeddings in Model

# run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
def build_look_up(text):
    """Build character <-> integer-id lookup tables for ``text``.

    The vocabulary is the set of distinct characters in ``text``, sorted,
    so ids are assigned in sorted character order starting at 0.

    Returns:
        (stoi, itos): ``stoi`` maps character -> id, ``itos`` maps id -> character.
    """
    vocab = sorted(set(text))
    stoi = dict(zip(vocab, range(len(vocab))))
    itos = dict(enumerate(vocab))
    return stoi, itos

def encode(text, stoi):
    """Translate ``text`` into a list of ids, one per character, using ``stoi``.

    Raises:
        KeyError: if a character of ``text`` is not a key of ``stoi``.
    """
    ids = []
    for ch in text:
        ids.append(stoi[ch])
    return ids

def decode(idx, itos):
    """Translate an iterable of ids back into characters using ``itos``.

    Returns a list of characters (not a joined string); use ``''.join(...)``
    on the result to obtain text.

    Raises:
        KeyError: if an id in ``idx`` is not a key of ``itos``.
    """
    chars = []
    for token_id in idx:
        chars.append(itos[token_id])
    return chars

In [13]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Reads the notebook-level ``train`` / ``val`` tensors and the
    ``sequence_l``, ``batch_size`` and ``device`` settings.

    Arguments:
        split: ``'train'`` selects the training data; any other value
            selects the validation data.

    Returns:
        (x, y): two tensors of shape ``[batch_size, sequence_l]`` on ``device``.
        ``y`` is ``x`` shifted one position to the right, i.e. the
        next-token target for every position of ``x``.

    Raises:
        ValueError: if the chosen split is not longer than ``sequence_l``,
            so no full window plus its shifted target fits.
    """
    data = train if split == 'train' else val

    # Number of valid window starts: the target window needs one extra token.
    n_starts = len(data) - sequence_l
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} tokens; "
            f"need more than sequence_l={sequence_l}"
        )

    # Draw 1-D offsets and convert to plain ints so slicing does not depend on
    # implicit conversion of one-element tensors to indices.
    ix = torch.randint(n_starts, (batch_size,)).tolist()
    x = torch.stack([data[i:i + sequence_l] for i in ix])
    y = torch.stack([data[i + 1:i + sequence_l + 1] for i in ix])
    return x.to(device), y.to(device)

# get data and preprocess

In [14]:
# read the raw corpus
with open('data.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# lookup (char -> id) and inverse lookup (id -> char)
stoi, itos = build_look_up(text)

# encode the whole corpus as a 1-D tensor of ids
data = torch.tensor(encode(text, stoi), dtype=torch.long)

# split into train and validation: the first split_ratio of the ids are for training
split_at = int(split_ratio * len(data))
train, val = data[:split_at], data[split_at:]

# draw one batch of training data
x, y = get_batch('train')

# model 

In [15]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first input.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed and stored
    as a buffer: even feature indices hold sines, odd feature indices hold
    cosines of ``position * frequency``. Dropout is applied after the addition.

    Arguments:
        d_model: size of the feature dimension.
        dropout: dropout probability applied to the encoded output.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One frequency per (sin, cos) feature pair.
        freqs = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # angles: [max_len, len(freqs)], one row per position.
        angles = torch.arange(max_len).unsqueeze(1) * freqs

        # The middle axis of size 1 broadcasts over the batch axis of the input.
        table = torch.zeros(max_len, 1, d_model)
        table[:, 0, 0::2] = torch.sin(angles)
        table[:, 0, 1::2] = torch.cos(angles)

        self.register_buffer('pe', table)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape: ``x`` plus the encodings of the first
            ``seq_len`` positions, passed through dropout.
        """
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len])

In [16]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Projects the input to keys, queries and values of width ``head_size``
    (input width is the notebook-level ``d_model``) and lets every position
    attend only to itself and earlier positions.

    Arguments:
        head_size: output width of the key/query/value projections.
    """

    def __init__(self,head_size):
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(d_model,head_size,bias=False)
        self.query = nn.Linear(d_model,head_size,bias=False)
        self.value = nn.Linear(d_model,head_size,bias=False)
        # Lower-triangular matrix: entry (i, j) is 1 when position i may attend to j.
        self.register_buffer('tril', torch.tril(torch.ones(sequence_l, sequence_l)))

    def forward(self,x): # x:[batch, l_seq, d_model]
        """Return the attention output, shape ``[batch, l_seq, head_size]``.

        ``l_seq`` may be any length up to ``sequence_l``; longer inputs are not
        supported because the causal mask is only ``sequence_l`` wide.
        """
        seq_len = x.size(1)
        k = self.key(x) # k:[batch, l_seq, head_size]
        q = self.query(x) # q:[batch, l_seq, head_size]
        v = self.value(x) # v:[batch, l_seq, head_size]

        # Scaled dot-product scores: [batch, l_seq, l_seq]
        scores = q @ k.transpose(2, 1) / self.head_size ** 0.5

        # Crop the mask to the actual sequence length so inputs shorter than
        # sequence_l work, then hide future positions.
        causal = self.tril[:seq_len, :seq_len]
        scores = scores.masked_fill(causal == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)

        # [batch, l_seq, l_seq] @ [batch, l_seq, head_size] = [batch, l_seq, head_size]
        return weights @ v




In [25]:
class MultiHeadAttention(nn.Module):
    """Run several ``Head`` modules on the same input and merge their outputs.

    Arguments:
        number_head: how many attention heads to create.
        head_size: output width of each head.
    """

    def __init__(self,number_head,head_size):
        super().__init__()
        heads = []
        for _ in range(number_head):
            heads.append(Head(head_size))
        self.self_attention = nn.ModuleList(heads)
        # Projects the concatenated heads back to d_model so the result can be
        # added to the residual stream.
        self.w0 = nn.Linear(head_size * number_head, d_model)

    def forward(self,x):
        """Map ``[batch, l_seq, d_model]`` to ``[batch, l_seq, d_model]``."""
        per_head = tuple(attn(x) for attn in self.self_attention)
        merged = torch.cat(per_head, dim=-1) # [batch, l_seq, head_size*number_head]
        return self.w0(merged)
        

In [43]:
# The multi-head self-attention mechanism is followed by the two fully connected layers
# of the feed-forward block. The first (hidden) layer has 4 * d_model units and uses
# the ReLU activation function. The second layer projects back to d_model units,
# the dimension of the block's input, and uses no activation function.
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    The hidden layer is ``4 * d_model`` wide; the output has the same width
    as the input.

    Arguments:
        d_model: width of the input and output features.
    """

    def __init__(self,d_model):
        super().__init__()
        hidden = 4 * d_model
        expand = nn.Linear(d_model, hidden)
        project = nn.Linear(hidden, d_model)
        self.ff = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self,x):
        """Apply the network to the last dimension of ``x``."""
        return self.ff(x)

In [40]:
class Block(nn.Module):
    """One transformer block: self-attention then feed-forward, each with a residual.

    Layer normalisation is applied to the input of each sub-layer (pre-norm),
    and the sub-layer output is added back onto its un-normalised input.
    Sizes come from the notebook-level ``d_model`` and ``number_head``.
    """

    def __init__(self):
        super().__init__()
        per_head = d_model // number_head

        self.self_attention = MultiHeadAttention(number_head, per_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self,x):
        """Return a tensor with the same shape as ``x``."""
        attended = self.self_attention(self.norm1(x))
        x = x + attended
        transformed = self.ffn(self.norm2(x))
        return x + transformed





In [41]:
# Model
class Model(nn.Module):
    """Stack of transformer blocks over character embeddings.

    Token embeddings get sinusoidal position encodings and dropout, pass
    through ``num_layer`` ``Block`` modules and a final layer norm. Sizes
    come from the notebook-level ``stoi``, ``d_model``, ``dropout`` and
    ``num_layer``.

    NOTE(review): there is no projection to vocabulary logits and no loss yet;
    ``forward`` returns the final hidden states.
    """

    def __init__(self):
        super().__init__()

        self.tok_emb = nn.Embedding(len(stoi),d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.dropout1 = nn.Dropout(dropout)

        self.blocks = nn.Sequential(*[Block() for _ in range(num_layer)])
        self.norm_final = nn.LayerNorm(d_model)

    def forward(self, x,y):
        """Compute the final hidden states for a batch of token ids.

        Arguments:
            x: integer tensor of token ids, shape ``[batch, seq_len]``.
            y: target ids; accepted for interface compatibility but currently
                unused.

        Returns:
            Tensor of shape ``[batch, seq_len, d_model]``.
        """
        emb_x = self.tok_emb(x) # [batch, seq_len, d_model]

        # PositionalEncoding is sequence-first ([seq_len, batch, d_model]) and
        # indexes positions along dim 0, so swap to that layout and back.
        # Without the swap the encoding would vary with the batch index
        # instead of the token position.
        emb_x = self.pos_emb(emb_x.transpose(0, 1)).transpose(0, 1)
        emb_x = self.dropout1(emb_x)

        emb_x = self.blocks(emb_x)
        return self.norm_final(emb_x)
        

In [45]:
# Move the model to the same device as the batch: get_batch puts x and y on
# `device`, and a CPU model fed CUDA tensors raises a device-mismatch error.
model = Model().to(device)
model(x, y)


tensor([[[-1.3893e+00, -1.1934e+00, -1.5779e+00,  ...,  4.9770e-01,
          -2.4424e+00,  7.4357e-02],
         [-5.1745e-01,  1.0291e-01, -2.9417e+00,  ...,  2.9044e-01,
          -1.0203e+00,  6.7494e-01],
         [ 3.3403e-01, -7.6859e-01, -2.4088e+00,  ...,  9.8355e-01,
          -1.5803e+00,  1.5424e+00],
         ...,
         [-1.0217e+00,  1.2565e+00, -1.4113e+00,  ...,  1.0118e+00,
          -1.8616e+00,  1.5940e+00],
         [-9.0679e-01, -6.3130e-03, -1.7194e+00,  ...,  2.7299e-01,
          -2.6300e+00,  1.6786e+00],
         [-1.6497e+00, -1.0639e+00, -1.0715e+00,  ...,  9.6943e-01,
          -9.6611e-01,  2.5279e+00]],

        [[-3.1870e-01, -8.3321e-01, -1.3710e+00,  ...,  9.6635e-01,
          -8.1163e-01,  8.7484e-01],
         [-3.5019e-01, -1.4143e+00, -9.4153e-01,  ...,  1.0489e+00,
          -9.2517e-01,  1.1385e-01],
         [ 3.2331e-01, -8.1991e-01, -4.6288e-01,  ...,  8.4662e-01,
          -2.8322e+00,  7.0098e-01],
         ...,
         [-1.4333e-01,  4