In [1]:
from torch.utils.data import Dataset,DataLoader
import torch
import re

# dataloader

In [3]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, config, data):
        chars = sorted(list(set(data)))  # get characters from the input data
        self.stoi = {ch: i for i, ch in enumerate(chars)}  # map characters to integer indices
        self.itos = {i: ch for i, ch in enumerate(chars)}

        self.block_size = config.block_size
        self.data = data

    def get_vocab_size(self):
        return len(self.stoi)

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        idx_chunk = [self.stoi[c] for c in chunk]
        x = torch.tensor(idx_chunk[:-1], dtype=torch.long)
        # return the chunk and the shifted version as tensors
        y = torch.tensor(idx_chunk[1:], dtype=torch.long)

        return x, y



# test preprocessing

In [6]:
class config:
    block_size = 128 #128
    batch_size = 128 #128
    char_emb_dim = 768
    num_head = 8
    
def read_data(): 
    text = open('data.txt', 'r').read()
    return re.sub('[^A-Za-z:]+',' ',text).strip().lower()

data = read_data()
config = config()

In [5]:
char_dataset = CharDataset(config, data)
# Create a DataLoader
data_loader = DataLoader(char_dataset, batch_size=config.block_size,shuffle=True)

# Iterate over batches
for ind,(x,y) in enumerate(data_loader):
    print(x.type)
    print(y.shape)
    print("===")

<built-in method type of Tensor object at 0x0000027F5723B890>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034D9DB0>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034DA850>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034DABC0>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034DAC60>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034DADF0>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F7C366670>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034D8870>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034DAE90>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F034D98B0>
torch.Size([128, 128])
===
<built-in method type of Tensor object at 0x0000027F7C37CAF0>
torch.Size([128, 128])
===
<built-in method type

In [24]:
data_loader


<torch.utils.data.dataloader.DataLoader at 0x1eb99283ad0>

# Decoder architecture

In [None]:
d_model = config.char_emb_dim
num_head = config.num_head
drop_prob = 0.1
ffn_hidden = 2048
num_layer = 12 

In [None]:
# prepare output
tok_emb = WTE(y) # token embeddings
pos_emb = WPE(pos) # position embeddings
x = Dropout(tok_emb + pos_emb)