In [4]:
import os
import math
import torch
import string
import random
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Global variables: Essentially the control panel for the notebook
DATASET_PATH = "data/jre/#1422.txt"  # should be a path to a txt file or a directory
MAX_SEQ_LEN = 5
BATCH_SIZE = 64
EPOCHS = 100


def preprocess_dataset(dataset_path):
    # check if dataset is a directory or a file
    # if it is a directory, then we will concatenate all the files in the directory into one file
    if dataset_path.endswith(".txt"):
        return dataset_path
    else:
        for file in os.listdir(dataset_path):
            with open(f"{dataset_path}/{file}", "r") as f:
                with open(f"{dataset_path}/dataset.txt", "a") as f2:
                    f2.write("\n" + f.read())
        return f"{dataset_path}/dataset.txt"


def init_charachter_based_vocab():
    # Instantiate the vocabulary
    with open(DATASET_PATH, "r") as f:
        vocab = sorted(set(f.read().replace("\n", "").lower()))
    return vocab


def init_word_based_vocab():
    # Instantiate the vocabulary
    with open(DATASET_PATH, "r") as f:
        vocab = sorted(
            set(
                f.read()
                .replace("\n", " ")
                .translate(str.maketrans("", "", string.punctuation))
                .lower()
                .split(" ")
            )
        )
    return vocab


DATASET_PATH = preprocess_dataset(DATASET_PATH)
VOCAB = init_charachter_based_vocab()
VOCAB += ["<PAD>"]
VOCAB += ["<START>"]
VOCAB += ["<END>"]
VOCAB_SIZE = len(VOCAB)


def decode_character_sequence(sequence: list):
    return "".join([VOCAB[i] for i in sequence])


def decode_word_sequence(sequence: list):
    return " ".join([VOCAB[i] for i in sequence])


def encode_sequence(sequence: list):
    return [VOCAB.index(c) for c in sequence]


# Logs
print(f"Using dataset: {DATASET_PATH}")
print("Vocabulary size is:", VOCAB_SIZE)
print("Vocabulary is:", VOCAB)


Using dataset: data/jre/#1422.txt
Vocabulary size is: 49
Vocabulary is: [' ', '$', '%', '&', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '<PAD>', '<START>', '<END>']


In [6]:
"""
Data
Inputs: are arbitrary sequences of length MAX_SEQ_LEN samples from the dataset
Targets: are the same sequences shifted by one character to the right ie: one character in the future
"""


class UnsupervisedCharacterDataset(Dataset):
    def __init__(self, dataset_path: str, seq_len: int):
        super().__init__()
        self.seq_len = seq_len
        with open(dataset_path, "r") as f:
            self.data = f.read().replace("\n", " ").lower()

    def __len__(self):
        """
        Total possible number of samples is every
        sequence length subset possible in the dataset
        """
        return len(self.data) - self.seq_len - 1

    def __getitem__(self, i: int):
        """
        Returns a randomly samples sequence of length seq_len (input)
        and then a target sequence which is the same sequence but shifted
        so it is one token ahead of the input sequence
        The model therefore learns to predict the next character
        """
        x = self.data[i : self.seq_len + i]
        y = self.data[i + 1 : self.seq_len + i + 1]
        # Tokenize
        x = encode_sequence([i for i in x])
        y = encode_sequence([i for i in y])
        return {
            "inputs": torch.tensor(x),
            "targets": torch.tensor(y),
        }


class UnsupervisedWordDataset(Dataset):
    def __init__(self, dataset_path: str, seq_len: int):
        super().__init__()
        self.seq_len = seq_len
        with open(dataset_path, "r") as f:
            self.data = (
                f.read()
                .replace("\n", " ")
                .translate(str.maketrans("", "", string.punctuation))
                .lower()
                .split(" ")
            )

    def __len__(self):
        """
        Total possible number of samples is every
        sequence length subset possible in the dataset
        """
        return len(self.data) - self.seq_len - 1

    def __getitem__(self, i: int):
        """
        Returns a randomly samples sequence of length seq_len (input)
        and then a target sequence which is the same sequence but shifted
        so it is one token ahead of the input sequence
        The model therefore learns to predict the next character
        """
        x = self.data[i : self.seq_len + i]
        y = self.data[i + 1 : self.seq_len + i + 1]
        # Tokenize
        x = encode_sequence(x)
        y = encode_sequence(y)
        return {
            "inputs": torch.tensor(x),
            "targets": torch.tensor(y),
        }


def load_dataloader(
    dataset_path: str, seq_len: int, batch_size: int, character_based: bool = True
):
    if character_based:
        dataset = UnsupervisedCharacterDataset(dataset_path, seq_len)
    else:
        dataset = UnsupervisedWordDataset(dataset_path, seq_len)
    return DataLoader(dataset, batch_size=batch_size)


def train_test_split(dataset_path: str, train_size: float):
    assert train_size <= 1.0, "Train size cannot be larger than 1"
    with open(dataset_path, "r") as f:
        dataset = f.read()
    train = dataset[: int(train_size * len(dataset))]
    test = dataset[int(train_size * len(dataset)) :]
    dataset_filename = dataset_path.split("/")[-1]
    dataset_root_path = "/".join(dataset_path.split("/")[:-1])
    with open(f"{dataset_root_path}/train_{dataset_filename}", "w") as f:
        f.write(train)
    with open(f"{dataset_root_path}/test_{dataset_filename}", "w") as f:
        f.write(test)
    print(
        f"Dataset split and written to: {dataset_root_path}/test_{dataset_filename} & {dataset_root_path}/train_{dataset_filename}"
    )
    return (
        f"{dataset_root_path}/train_{dataset_filename}",
        f"{dataset_root_path}/test_{dataset_filename}",
    )


def test_dataloader():
    train, test = train_test_split(DATASET_PATH, 0.9)
    train_dataset = load_dataloader(train, MAX_SEQ_LEN, 1)
    test_dataset = load_dataloader(test, MAX_SEQ_LEN, 1)
    batch = next(iter(train_dataset))
    x, y = batch["inputs"], batch["targets"]
    assert torch.all(
        x[:, 1:].eq(y[:, :-1])
    ), "Target values are not right shifted by one token!"
    print("Dataset is correct!")


In [37]:
"""
Model Transformer architecture: https://arxiv.org/abs/1706.03762
"""


class Seq2SeqTransformer(nn.Module):
    def __init__(self, n_heads, vocab_size, emb_dim, n_blocks) -> None:
        super().__init__()
        self.text_embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_embedding = nn.Embedding(vocab_size, emb_dim)

        self.encoder = nn.Sequential(
            *[EncoderBlock(emb_dim, n_heads) for i in range(n_blocks)]
        )
        self.decoder = nn.ModuleList(
            [DecoderBlock(emb_dim, n_heads) for i in range(n_blocks)]
        )
        self.out_fc = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, vocab_size),
        )

    def forward(self, encoder_input, decoder_input):
        enc_x = self.text_embedding(encoder_input)
        dec_x = self.text_embedding(decoder_input)
        enc_x = enc_x + self.positional_embedding(encoder_input)
        dec_x = dec_x + self.positional_embedding(decoder_input)
        encoder_output = self.encoder(enc_x)
        for block in self.decoder:
            dec_x = block(encoder_output, dec_x)
        dec_x = self.out_fc(dec_x)
        return dec_x

    def generate(self, prompt: str, max_len: int):
        """
        Used for inference
        """
        i = 0
        device = "cuda" if torch.cuda.is_available() else "cpu"
        prompt = torch.tensor([[encode_sequence(prompt)]]).to(device)
        while i < max_len:
            generated_output = self.forward(prompt)
            # Select the most probable token
            prompt = torch.cat(
                (prompt, generated_output.argmax(dim=-1).unsqueeze(0)), dim=1
            )
            i += 1
        return prompt


class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads: int, in_d: int, out_d: int) -> None:
        super().__init__()
        self.n_heads = n_heads

        # Parralelise the attention heads
        self.W_q = nn.Linear(in_d, in_d * n_heads, bias=False)
        self.W_k = nn.Linear(in_d, in_d * n_heads, bias=False)
        self.W_v = nn.Linear(in_d, in_d * n_heads, bias=False)
        self.fc_out = nn.Linear(in_d * n_heads, out_d, bias=True)

    def softmax_attention(self, q, k, v, mask=None):
        # Caculate dot product
        qk = q @ k.transpose(-2, -1)
        # Normalise by the square root of the dimension
        qk = qk / math.sqrt(q.shape[-1])
        # Apply mask if decoder
        if mask is not None:
            qk = qk.masked_fill(mask == 0, float("-inf"))
        # Calculate attention scores
        qk = F.softmax(qk, dim=-1)
        # Apply attention scores to values
        out = qk @ v
        return out

    def split_heads(self, x):
        # [batch_size, seq_len, emb_dim] -> [batch_size, n_heads, seq_len, emb_dim/n_heads]
        batch_size = x.shape[0]
        return (
            x.reshape(batch_size, -1, self.n_heads, x.shape[-1] // self.n_heads)
            .permute(0, 2, 1, 3)
            .reshape(batch_size * self.n_heads, -1, x.shape[-1] // self.n_heads)
        )

    def concat_heads(self, x):
        # [batch_size * n_heads, seq_len, emb_dim/n_heads] -> [batch_size, seq_len, emb_dim*heads]
        batch_size = x.shape[0] // self.n_heads
        return (
            x.reshape(batch_size, self.n_heads, -1, x.shape[-1])
            .permute(0, 2, 1, 3)
            .reshape(batch_size, -1, x.shape[-1] * self.n_heads)
        )

    def forward(self, q, k, v, mask=None):
        # Embed the input [batch_size, seq_len, emb_dim] -> [batch_size, seq_len, n_heads, emb_dim/n_heads]
        q = self.split_heads(self.W_q(q))
        k = self.split_heads(self.W_k(k))
        v = self.split_heads(self.W_v(v))
        # Calculate attention
        attention_score = self.softmax_attention(q, k, v, mask)
        # Concatenate the heads
        out = self.concat_heads(attention_score)
        # Apply final linear layer
        out = self.fc_out(out)
        return out


class FeedForwardBlock(nn.Module):
    def __init__(self, in_d, out_d) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_d, out_d),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.net(x)


class EncoderBlock(nn.Module):
    def __init__(self, in_d, n_heads) -> None:
        super().__init__()
        self.mha = MultiHeadedAttention(n_heads, in_d, in_d)
        # Note: FF block is sandwhiched between layer norms as res conenction occurs here
        self.ff = FeedForwardBlock(in_d, in_d)
        self.norm_1 = nn.LayerNorm(in_d)
        self.norm_2 = nn.LayerNorm(in_d)

    def forward(self, x):
        res = x
        x = self.mha(x, x, x)
        x = self.norm_1(x + res)
        res = x
        x = self.ff(x)
        x = self.norm_2(x + res)
        return x


class DecoderBlock(nn.Module):
    def __init__(self, in_d, n_heads) -> None:
        super().__init__()
        # Decoder only
        self.masked_mha = MultiHeadedAttention(n_heads, in_d, in_d)
        # For encoder and decoder
        self.cross_mha = MultiHeadedAttention(n_heads, in_d, in_d)
        # Note: FF block is sandwhiched between layer norms as res conenction occurs here
        self.ff = FeedForwardBlock(in_d, in_d)
        self.norm_1 = nn.LayerNorm(in_d)
        self.norm_2 = nn.LayerNorm(in_d)

    def make_mask(self, dec_x):
        # Create a mask for the decoder to prevent it from looking ahead
        mask = torch.tril(torch.ones(dec_x.shape[1], dec_x.shape[1]), diagonal=1)
        mask = mask.masked_fill(mask == 1, float("-inf"))
        return mask.unsqueeze(0).to(dec_x.device)

    def forward(self, enc_x, dec_x):
        # Layer 1
        res = dec_x
        mask = self.make_mask(dec_x)
        dec_x = self.masked_mha(dec_x, dec_x, dec_x, mask)
        dec_x = self.norm_1(dec_x + res)
        # Layer 2
        res = dec_x
        x = self.cross_mha(dec_x, enc_x, enc_x)
        x = self.norm_1(x + res)
        # Layer 3
        res = x
        x = self.ff(x)
        x = self.norm_2(x + res)
        return x


In [38]:
"""
Model Transformer architecture: https://arxiv.org/abs/1706.03762
"""


class GPTDecoderTransformer(nn.Module):
    """
    This is a decoder only transformer for next token prediction
    """

    def __init__(self, n_heads, vocab_size, emb_dim, n_blocks) -> None:
        super().__init__()
        self.text_embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_embedding = nn.Embedding(vocab_size, emb_dim)
        self.decoder = nn.ModuleList(
            [GPTDecoderBlock(emb_dim, n_heads) for i in range(n_blocks)]
        )
        self.out_fc = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, vocab_size),
        )

    def forward(self, x):
        x_emb = self.text_embedding(x)
        x = x_emb + self.positional_embedding(x)
        for block in self.decoder:
            x = block(x)
        x = self.out_fc(x)
        return x

    def generate(self, prompt: str, max_len: int):
        """
        Used for inference
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        prompt = torch.tensor([encode_sequence(prompt)]).to(device)
        while prompt.shape[-1] < max_len:
            generated_output = self.forward(prompt)
            # Select the most probable token
            output = F.softmax(generated_output[:,-1,:], dim=-1)
            next_token = torch.multinomial(output, num_samples=1)
            prompt = torch.cat((prompt, next_token), dim=1)
        output = prompt
        return output


class GPTDecoderBlock(nn.Module):
    def __init__(self, in_d, n_heads) -> None:
        super().__init__()
        # Decoder only
        self.masked_mha = MultiHeadedAttention(n_heads, in_d, in_d)
        # For encoder and decoder
        self.cross_mha = MultiHeadedAttention(n_heads, in_d, in_d)
        # Note: FF block is sandwhiched between layer norms as res conenction occurs here
        self.ff = FeedForwardBlock(in_d, in_d)
        self.norm_1 = nn.LayerNorm(in_d)
        self.norm_2 = nn.LayerNorm(in_d)

    def make_mask(self, dec_x):
        # Create a mask for the decoder to prevent it from looking ahead
        mask = torch.tril(torch.ones(dec_x.shape[1], dec_x.shape[1]))
        return mask.unsqueeze(0).to(dec_x.device)

    def forward(self, x):
        # Layer 1
        res = x
        mask = self.make_mask(x)
        x = self.masked_mha(x, x, x, mask)
        x = self.norm_1(x + res)
        # Layer 2
        res = x
        x = self.ff(x)
        x = self.norm_2(x + res)
        return x

In [39]:
class Trainer:
    def __init__(
        self,
        model: nn.Module,
        train_dl: DataLoader,
        val_dl: DataLoader,
        loss_fn: nn.Module,
        optimizer: torch.optim.Optimizer,
        character_based: bool = False,
    ) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.model.to(self.device)
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.character_based = character_based

    def run(self, epochs: int):
        self.model.train()
        print(f"Starting training for {epochs} epochs...")
        for epoch in range(epochs):
            train_loss = self.train_step()
            val_loss, output = self.val_step()
        # Save model
        torch.save(self.model.state_dict(), "model.pt")

    def train_step(self):
        """
        Performs one pass over the training data
        """
        train_loss = 0
        self.model.train()
        for batch in tqdm(self.train_dl, desc=f"Train Step", leave=False):
            x, y = batch["inputs"], batch["targets"]
            x, y = x.to(self.device), y.to(self.device)
            self.optimizer.zero_grad()
            out = self.model(x)
            loss: torch.Tensor = self.loss_fn(out.permute(0, 2, 1), y)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            train_loss /= 2
        return train_loss

    def val_step(self):
        """
        Performs one pass over the validation set
        """
        val_loss = 0
        self.model.eval()
        with torch.no_grad():
            for batch in tqdm(self.val_dl, desc="Val Step", leave=False):
                x, y = batch["inputs"], batch["targets"]
                x, y = x.to(self.device), y.to(self.device)
                out = self.model(x)
                loss: torch.Tensor = self.loss_fn(out.permute(0, 2, 1), y)
                val_loss += loss.item()
                return val_loss, out
            val_loss /= len(self.val_dl)


In [40]:
model = GPTDecoderTransformer(n_heads=4, vocab_size=len(VOCAB), emb_dim=256, n_blocks=6)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
train_path, val_path = train_test_split(DATASET_PATH, 0.99)
train_dl, val_dl = load_dataloader(
    "/home/j/Desktop/Programming/DeepLearning/picoGPT/data/jre/dataset.txt",
    128,
    8,
    character_based=True,
), load_dataloader(
    "/home/j/Desktop/Programming/DeepLearning/picoGPT/data/jre/dataset.txt",
    128,
    8,
    character_based=True,
)
Trainer(model, train_dl, val_dl, loss_fn, optimizer, True).run(epochs=100)


Dataset split and written to: data/jre/test_#1422.txt & data/jre/train_#1422.txt
Starting training for 100 epochs...
Epoch: 0


                                                           

Epoch: 1


                                                           

Epoch: 2


                                                           

Epoch: 3


                                                           

Epoch: 4


                                                           

Epoch: 5


                                                           

Epoch: 6


                                                           

Epoch: 7


                                                           

Epoch: 8


                                                           

Epoch: 9


                                                           

Epoch: 10


                                                           

Epoch: 11


                                                           

Epoch: 12


                                                           

Epoch: 13


                                                           

Epoch: 14


                                                           

Epoch: 15


                                                           

Epoch: 16


                                                           

Epoch: 17


                                                           

Epoch: 18


                                                           

Epoch: 19


                                                           

Epoch: 20


                                                           

Epoch: 21


                                                           

Epoch: 22


                                                           

Epoch: 23


                                                           

Epoch: 24


                                                           

Epoch: 25


                                                           

Epoch: 26


                                                           

Epoch: 27


                                                           

Epoch: 28


                                                           

Epoch: 29


                                                           

Epoch: 30


                                                           

Epoch: 31


                                                           

Epoch: 32


                                                           

Epoch: 33


                                                           

Epoch: 34


                                                           

Epoch: 35


                                                           

Epoch: 36


                                                           

Epoch: 37


                                                           

Epoch: 38


                                                           

Epoch: 39


                                                           

Epoch: 40


                                                           

Epoch: 41


                                                           

Epoch: 42


                                                           

Epoch: 43


                                                           

Epoch: 44


                                                           

Epoch: 45


                                                           

Epoch: 46


                                                           

Epoch: 47


                                                           

Epoch: 48


                                                           

Epoch: 49


                                                           

Epoch: 50


                                                           

Epoch: 51


                                                           

Epoch: 52


                                                           

Epoch: 53


                                                           

Epoch: 54


                                                           

Epoch: 55


                                                           

Epoch: 56


                                                           

Epoch: 57


                                                           

Epoch: 58


Train Step:   0%|          | 0/22 [00:00<?, ?it/s]

In [20]:
class Predictor:
    def __init__(self, model: nn.Module, pth_path: str) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.model.to(self.device)
        self.model.load_state_dict(torch.load(pth_path))

    def predict(self, prompt: str):
        output = self.model.generate(prompt, 1024)
        print(decode_character_sequence(output[0].tolist()))

    def launch_interactive(self):
        while True:
            prompt = input("Enter a prompt: ")
            print(self.predict(prompt))


model = GPTDecoderTransformer(n_heads=4, vocab_size=len(VOCAB), emb_dim=256, n_blocks=6)
predictor = Predictor(model, "model.pt")
predictor.launch_interactive()


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0