In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Loading and Preprocessing

In this section I will:
- Load and tokenize the text
- Create a vocabulary
- Encode the text
- Create training sequences


In [72]:
# Read the corpus, drop blank lines and surrounding whitespace, and flatten it
# into one space-separated string. The encoding is given explicitly so the
# decoded text does not depend on the platform's default encoding.
with open("input.txt", encoding="utf-8") as f:
    data = " ".join(line.strip() for line in f if line.strip())

# TESTING WITH SMALLER DATASET: keep only the first 100 characters
data = data[:100]

In [73]:
class Tokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    Attributes set by ``__init__``:
        data: the text the vocabulary was built from.
        vocab: sorted list of the distinct characters in ``data``.
        char2idx: maps each character to its position in ``vocab``.
        idx2char: maps each position in ``vocab`` back to its character.
        vocab_size: number of distinct characters.
    """

    def __init__(self, data):
        self.data = data
        self.vocab = sorted(set(data))
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx2char = dict(enumerate(self.vocab))
        self.vocab_size = len(self.vocab)

    def encode(self, data):
        """Return a 1-D ``torch.long`` tensor of vocabulary indices for ``data``.

        Raises:
            KeyError: if ``data`` contains a character that is not in the vocabulary.
        """
        # Pin the dtype: torch.tensor([]) would otherwise come back as float32,
        # so an empty input would yield a tensor of a different type than usual.
        return torch.tensor([self.char2idx[char] for char in data], dtype=torch.long)

    def decode(self, data):
        """Map a sequence of vocabulary indices back to the string they spell.

        ``data`` may be a tensor (anything exposing ``tolist()``) or a plain
        iterable of integer indices.

        Raises:
            KeyError: if an index is not in the vocabulary.
        """
        indices = data.tolist() if hasattr(data, "tolist") else list(data)
        return "".join(self.idx2char[idx] for idx in indices)

In [74]:
class TextDataset(Dataset):
    """Sliding-window dataset of (input, target) pairs for next-token prediction.

    Item ``i`` is ``(data[i:i + seq_length], data[i + 1:i + seq_length + 1])``:
    the target is the input window shifted one position to the right.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: len() raises ValueError on a negative result, which is
        # what happened when data was shorter than seq_length.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` windows starting at position ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``. Without
                this, slicing silently returned truncated, mismatched windows and
                plain iteration over the dataset never stopped.
        """
        size = len(self)
        start = idx + size if idx < 0 else idx
        if not 0 <= start < size:
            raise IndexError(f"index {idx} is out of range for dataset of length {size}")
        stop = start + self.seq_length
        return (self.data[start:stop], self.data[start + 1:stop + 1])

In [75]:
# Build the character vocabulary from the raw text, then rebind `data` to the
# tensor of vocabulary indices.
# The isinstance guard keeps this cell safe to re-run: once `data` holds the
# encoded tensor it must not be handed back to Tokenizer, which expects text.
if isinstance(data, str):
    tokenizer = Tokenizer(data)
    data = tokenizer.encode(data)

In [76]:
# Windows of 5 tokens, each paired with the same window shifted by one token.
dataset = TextDataset(data=data, seq_length=5)
# Batches of 20 windows, drawn in shuffled order.
dataloader = DataLoader(dataset=dataset, batch_size=20, shuffle=True)

In [77]:
# some logic to see if the dataloader is working
# for i in range(5):
#     for x, y in dataloader:
#         print(x[i], y[i])
#         break
#     print()

# Setting up the Model

In [80]:
# input_size = tokenizer.vocab_size
# output_size = tokenizer.vocab_size

# setting up params per the nn.Transformer docs
d_model = 100
# nn.Transformer splits d_model evenly across the attention heads, so nhead
# must divide d_model; the previous value of 12 does not divide 100.
nhead = 10
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 2048
dropout = 0.1
activation = "relu"

# Fail here, next to the settings, rather than later when the model is built.
assert d_model % nhead == 0, "d_model must be divisible by nhead"



In [81]:
class Transformer(nn.Module):
    """``nn.Transformer`` encoder-decoder followed by a linear projection to vocabulary logits.

    Args:
        d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward,
        dropout, activation: forwarded unchanged to ``nn.Transformer``.
        vocab_size: width of the output projection. When omitted, falls back to
            the notebook-level ``tokenizer.vocab_size`` (the previous behavior),
            so existing calls keep working; pass it explicitly to use the class
            without that global.
    """

    def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, activation, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible fallback to the tokenizer built earlier in the notebook.
            vocab_size = tokenizer.vocab_size
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        """Run ``src``/``tgt`` through the transformer and project to ``vocab_size`` logits.

        ``src`` and ``tgt`` are handed to ``nn.Transformer`` as-is, so they must
        already be float feature tensors whose last dimension is ``d_model``.
        No attention masks are passed.

        NOTE(review): the integer index windows produced by ``TextDataset`` are
        not in that form; an embedding step looks to be needed before they can
        be fed to this model — confirm the intended design.
        """
        hidden = self.transformer(src, tgt)
        return self.linear(hidden)

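**TODO:** I need to take the time to understand the `nn.Transformer` class. The next cell runs the example from its documentation to check the input and output shapes.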

In [83]:
# Shape experiment with nn.Transformer: 16 heads, 12 encoder layers, every other
# setting left at the library default.
transformer_model = nn.Transformer(num_encoder_layers=12, nhead=16)

# Random inputs that differ only in their first dimension (10 vs. 20).
src = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)

# The result has the same shape as tgt: (20, 32, 512).
out = transformer_model(src, tgt)

In [85]:
out.shape

torch.Size([20, 32, 512])