In [108]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Loading and Preprocessing

In this section I will:
- Load and tokenize the text
- Create a vocabulary
- Encode the text
- Create training sequences


In [90]:
# Read the corpus as UTF-8 (stated explicitly so the result does not depend on
# the platform's default encoding), drop blank lines, and join the remaining
# stripped lines into one space-separated string.
with open("input.txt", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    data = " ".join(line for line in stripped_lines if line)

# TESTING WITH SMALLER DATASET: keep only the first 2000 characters.
data = data[:2000]

In [91]:
class Tokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    Attributes:
        data: The text the vocabulary was built from.
        vocab: Sorted list of the distinct characters in ``data``.
        char2idx: Maps each character to its position in ``vocab``.
        idx2char: Maps each position in ``vocab`` back to its character.
        vocab_size: Number of distinct characters.
    """

    def __init__(self, data):
        """Build the vocabulary and both lookup tables from ``data``."""
        self.data = data
        self.vocab = sorted(set(data))
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx2char = dict(enumerate(self.vocab))
        self.vocab_size = len(self.vocab)

    def encode(self, data):
        """Return a 1-D ``torch.long`` tensor of vocabulary indices for ``data``.

        Raises:
            KeyError: If ``data`` contains a character that is not in the vocabulary.
        """
        # The dtype is explicit because torch.tensor([]) would otherwise infer
        # float32 for empty input; non-empty input is int64 either way.
        return torch.tensor([self.char2idx[char] for char in data], dtype=torch.long)

    def decode(self, data):
        """Return the string spelled by the vocabulary indices in ``data``.

        ``data`` may be a 1-D tensor (anything with ``.tolist()``) or a plain
        iterable of integer indices.

        Raises:
            KeyError: If an index is not in the vocabulary.
        """
        indices = data.tolist() if hasattr(data, "tolist") else list(data)
        return "".join(self.idx2char[idx] for idx in indices)

In [92]:
class TextDataset(Dataset):
    """Sliding-window dataset of (input, target) pairs for next-token prediction.

    Item ``i`` is ``(data[i:i + seq_length], data[i + 1:i + seq_length + 1])``,
    i.e. the target is the input window shifted one position to the right.

    Args:
        data: Sliceable sequence of tokens (e.g. a 1-D tensor of indices).
        seq_length: Number of tokens in each input/target window.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of complete windows (0 if ``data`` is too short)."""
        # Clamp at zero: a negative value would make len() itself raise.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair that starts at position ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``. This
                also lets plain iteration over the dataset terminate instead
                of yielding ever-shorter windows forever.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for TextDataset of length {size}")
        stop = idx + self.seq_length
        return (self.data[idx:stop], self.data[idx + 1:stop + 1])

In [93]:
# `data` is overwritten with its encoded form below, so keep the raw text under
# its own name. The guard makes this cell safe to re-run: on a second run `data`
# is no longer a string, and the previously saved `text` is reused instead of
# passing the encoded tensor to Tokenizer.
if isinstance(data, str):
    text = data

tokenizer = Tokenizer(text)
data = tokenizer.encode(text)

In [94]:
# Cut the encoded corpus into 5-token windows and serve them in shuffled
# mini-batches of 20.
dataset = TextDataset(data, seq_length=5)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=20)

In [95]:
# Sanity check (disabled): print a few (input, target) pairs from a batch to see if the dataloader is working
# for i in range(5):
#     for x, y in dataloader:
#         print(x[i], y[i])
#         break
#     print()

# Setting up the Model

In [96]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    The table is computed once in ``__init__`` and stored as the buffer ``pe``
    of shape ``(1, max_len, d_model)``: even feature columns hold sines, odd
    feature columns hold cosines.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Longest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angles to the number of cosine slots actually available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer follows the module across devices and into state_dict
        # without being treated as a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        return x + self.pe[:, :seq_len]
    

In [104]:
# Hyperparameters for nn.Transformer (argument names follow its docs).

# Model width and attention heads: d_model must be divisible by nhead
# (108 / 12 = 9 features per head).
d_model = 108
nhead = 12

# Depth of the encoder and decoder stacks.
num_encoder_layers = 6
num_decoder_layers = 6

# Feed-forward sub-layer width, dropout rate and activation.
dim_feedforward = 2048
dropout = 0.1
activation = "relu"

# Number of distinct tokens, taken from the fitted tokenizer.
vocab_size = tokenizer.vocab_size



In [105]:
# Lookup table that maps each token index to a d_model-dimensional vector.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

In [110]:
# Encoder-decoder Transformer built from the hyperparameters set earlier in the
# notebook. batch_first=True selects the (batch, seq, feature) input layout.
transformer = nn.Transformer(
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    activation=activation,
    batch_first=True,
)

## Training Loop