In [57]:
import math
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader

import re


In [58]:
class Config:
    batch_size = 50 # b
    sequence_l = 128 # n
    d_model = 768 # d_model， embedding dim
    num_layer = 12 # number of block stacked
    number_head = 8 # multihead attention
    d_ff = 2048 # feedforward dimension

config_model = Config()

def read_data(): 
    text = open('data.txt', 'r').read()
    return re.sub('[^A-Za-z:]+',' ',text).strip().lower()

data = read_data()

# generate data

In [60]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, config, data):
        chars = sorted(list(set(data)))  # get characters from the input data
        self.stoi = {ch: i for i, ch in enumerate(chars)}  # map characters to integer indices
        self.itos = {i: ch for i, ch in enumerate(chars)}

        self.block_size = config.sequence_l
        self.data = data

    def get_vocab_size(self):
        return len(self.stoi)

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        idx_chunk = [self.stoi[c] for c in chunk]
        x = torch.tensor(idx_chunk[:-1], dtype=torch.long)
        # return the chunk and the shifted version as tensors
        y = torch.tensor(idx_chunk[1:], dtype=torch.long)

        return x, y
    
char_dataset = CharDataset(config_model, data)
# Create a DataLoader
data_loader = DataLoader(char_dataset, batch_size=config_model.batch_size,shuffle=True)


In [61]:
# just take first batch for testing embedding
for ind,(x,y) in enumerate(data_loader):
    if ind != 1:
        break

28

# 1. embedding:

In [62]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        # pe: [seq_lens * 1 * d_model] for each sample

        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [63]:
def create_causal_mask(config_model):
    mask = torch.full([config_model.sequence_l, config_model.sequence_l] , float('-inf'))
    mask = torch.triu(mask, diagonal=1)
    return mask

In [None]:
class LayerNormalization(nn.Module):


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):

In [None]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderBlock, self).__init__()   
        self.self_attention =  MultiHeadAttention(d_model,num_heads)
        self.norm1 = LayerNormalization()
        self.ffn 
        self.norm2 = LayerNormalization()


    def forward(self,x,y,mask):
        _x = x # used for skip connection
        x = self.self_attention(x,mask) 
        x = self.norm1(x+_x)

        _x = x
        x = self.ffn(x)
        x = self.norm2(x+_x)



In [73]:
class SequentialDecoder(nn.Sequential): # just un intermediate function for calling
    def forward(self, x,y,mask):
        for module in self._modules.values():
            y = module(x,y,mask)
            return y


In [74]:
mask =  create_causal_mask(config_model) #[sequence_l * sequence_l]

class Decoder(nn.Module):
    def __init__(self, d_model, src_vovab_size, ffn_hidden, num_heads, drop_prob, num_layers=1):
        super().__init__()
        
        self.tok_emb = nn.Embedding(src_vovab_size,d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        
        self.layers = SequentialDecoder(*[DecoderBlock(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])
    
    def forward(self, x,y,mask):

        emb_x = self.tok_emb(x)
        emb_x = self.pos_emb(emb_x) # x,y = emb = [batch size * sequence_l * d_model]
        emb_x = self.dropout1(emb_x)

        y = self.layers(emb_x,y,mask)

In [75]:
src_vovab_size = char_dataset.get_vocab_size()

dec = Decoder(config_model.d_model,src_vovab_size,config_model.d_ff,config_model.number_head,0.5)
output = dec(x)
print(output.shape)

NameError: name 'DecoderLayer' is not defined

In [None]:
class DotProductAttention(nn.Module): 
    """Scaled dot product attention."""
    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        