In [1]:
import math
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
import re

In [31]:
split_ratio = 0.9  # fraction of the encoded corpus used for training; the remainder is validation

# model params
batch_size = 50 # b, number of sequences sampled per batch (to be changed)
sequence_l = 128 # n, length of each sampled sequence
d_model = 768 # d_model, embedding dim
num_layer = 12 # number of blocks stacked
number_head = 8 # multihead attention: number of heads (d_model // number_head is the head size)
d_ff = 2048 # feedforward dimension
dropout = 0.2 # dropout probability used by the model

# Batches are moved to this device; use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
def build_look_up(text):
    """Build character <-> integer-id lookup tables for ``text``.

    The vocabulary is the set of distinct characters in ``text``, sorted, and
    ids are assigned in that sorted order starting at 0.

    Returns:
        (stoi, itos): ``stoi`` maps character -> id, ``itos`` maps id -> character.
    """
    vocab = sorted(set(text))
    stoi = dict(zip(vocab, range(len(vocab))))
    itos = dict(enumerate(vocab))
    return stoi, itos

def encode(text, stoi):
    """Translate each character of ``text`` into its id using ``stoi``.

    Returns a list of ids, one per character. A character missing from
    ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in text:
        ids.append(stoi[ch])
    return ids

def decode(idx, itos):
    """Translate each id in ``idx`` back into its character using ``itos``.

    Returns a list of characters (not a joined string), one per id. An id
    missing from ``itos`` raises ``KeyError``.
    """
    chars = []
    for token_id in idx:
        chars.append(itos[token_id])
    return chars

In [24]:
def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: ``'train'`` selects the module-level ``train`` tensor; any other
            value selects ``val``.

    Returns:
        (x, y): two stacked tensors of ``batch_size`` windows, each
        ``sequence_l`` long, moved to ``device``. ``y`` is the same window as
        ``x`` shifted one position to the right (the next-token targets).
    """
    source = train if split == 'train' else val
    # One random window start per batch element; the upper bound leaves room
    # for the one-step-shifted target window.
    starts = torch.randint(len(source) - sequence_l, (batch_size,1))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+sequence_l])
        targets.append(source[start+1:start+sequence_l+1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# get data and preprocess

In [30]:
# Read the whole corpus into memory as one string.
with open('data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Lookup (char -> id) and inverse lookup (id -> char) tables.
stoi, itos = build_look_up(text)

# Encode the corpus as a 1-D tensor of integer ids.
data = torch.tensor(encode(text, stoi), dtype=torch.long)

# Split into train and validation at the split_ratio boundary.
n_train = int(split_ratio * len(data))
train, val = data[:n_train], data[n_train:]

# Draw one training batch of inputs and targets.
x, y = get_batch('train')

torch.Size([50, 128])

# model 

In [32]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Even feature indices hold ``sin(position * freq)`` and odd indices hold
    ``cos(position * freq)``. The table is precomputed for ``max_len``
    positions and stored as the non-trainable buffer ``pe`` of shape
    ``[max_len, 1, d_model]``.

    Args:
        d_model: embedding dimension. Odd values are supported.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions precomputed; inputs must not be longer.
        batch_first: if True, ``forward`` expects
            ``[batch_size, seq_len, embedding_dim]``; the default (False)
            expects ``[seq_len, batch_size, embedding_dim]``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000,
                 batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd (cosine) column than even
        # (sine) column, so trim div_term to match; a no-op for even d_model.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # pe: [max_len, 1, d_model]; the singleton dim broadcasts over the batch.

        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``, or
               ``[batch_size, seq_len, embedding_dim]`` when ``batch_first`` is True.

        Returns:
            Tensor of the same shape as ``x`` with position encodings added
            and dropout applied.
        """
        if self.batch_first:
            # [seq_len, 1, d_model] -> [1, seq_len, d_model] to line up with dim 1.
            x = x + self.pe[:x.size(1)].transpose(0, 1)
        else:
            x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class MultiHeadAttention(nn.Module):
    """Stub for the multi-head attention layer.

    Only the constructor exists so far: it registers no sub-modules, and this
    class defines no ``forward`` method yet.
    TODO: implement the attention projections and ``forward``.
    """
    def __init__(self,number_head,head_size):
        # number_head and head_size are accepted but not used yet.
        super().__init__()
        

In [None]:
class FeedForward(nn.Module):
    """Stub for the feed-forward sub-layer.

    Only the constructor exists so far: it registers no sub-modules, and this
    class defines no ``forward`` method yet.
    TODO: implement the layers and ``forward``.

    Args:
        number_head: unused; kept only so existing positional/keyword calls
            keep working.
        head_size: unused; kept for the same reason.
    """
    def __init__(self, number_head=None, head_size=None):
        # Both arguments are optional because they are never read and the
        # layer is constructed without arguments (``FeedForward()``); with
        # required parameters that call raised TypeError.
        super().__init__()

In [None]:
class Block(nn.Module):
    """One transformer block: self-attention then feed-forward.

    Each sub-layer's output is added to its input (residual connection) and
    the sum is layer-normalised.
    NOTE(review): FeedForward is constructed with no arguments here; confirm
    its constructor accepts that.
    """

    def __init__(self):
        super().__init__()
        # Per-head width: the model dimension divided evenly among the heads.
        per_head_dim = d_model // number_head
        self.self_attention = MultiHeadAttention(number_head, per_head_dim)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward()
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply attention and feed-forward, each followed by add & norm."""
        attended = self.norm1(self.self_attention(x) + x)
        return self.norm2(self.ffn(attended) + attended)





In [38]:
# Model
class Model(nn.Module):
    """Embedding front-end of the language model.

    Applies token embedding, sinusoidal positional encoding and dropout.
    The transformer blocks are not wired in yet (see the TODO in
    ``__init__``), so ``forward`` currently returns the embedded input.
    """

    def __init__(self):
        super().__init__()

        # Vocabulary size comes from the module-level ``stoi`` lookup table.
        self.tok_emb = nn.Embedding(len(stoi), d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # TODO: enable once Block and its sub-layers are implemented.
        # self.blocks = nn.Sequential(*[Block() for _ in range(num_layer)])

    def forward(self, x, y):
        """Embed a batch of token ids.

        Args:
            x: integer token ids, shape ``[batch_size, sequence_l]``.
            y: target token ids; accepted but not used yet.

        Returns:
            Tensor of shape ``[batch_size, sequence_l, d_model]``.
        """
        emb_x = self.tok_emb(x)  # [batch_size, sequence_l, d_model]
        # PositionalEncoding indexes positions along dim 0 (its input is
        # [seq_len, batch_size, embedding_dim]), so switch to sequence-first
        # around the call; otherwise the encoding would follow the batch index
        # instead of the token position.
        emb_x = self.pos_emb(emb_x.transpose(0, 1)).transpose(0, 1)
        emb_x = self.dropout1(emb_x)
        # Return the embeddings so callers get a tensor instead of None.
        return emb_x
        

In [40]:
# get_batch moves x and y to `device`, so the model's parameters must live on
# the same device; otherwise the forward pass fails when CUDA is available.
model = Model().to(device)
x_emb = model(x,y)


torch.Size([50, 128, 768])