In [None]:
from urllib import request
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from torch.nn import (
    Linear,
    Embedding,
    ModuleList,
    Module,
    LayerNorm,
    GELU,
    ModuleDict,
    CrossEntropyLoss,
)
import torch
import math
import re
import pandas
import tarfile
import tiktoken
import html
import itertools

In [None]:
class CausalSelfAttention(Module):
    """Multi-head self-attention that only looks at earlier, non-padded positions.

    ``config`` must expose ``n_embd``, ``n_head`` and ``block_size``; ``n_embd``
    has to be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One fused projection yields queries, keys and values side by side.
        self.c_attn = Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular matrix of ones; zeros mark the "future" positions.
        limit = config.block_size
        causal = torch.tril(torch.ones(limit, limit))
        self.register_buffer("bias", causal.view(1, 1, limit, limit))

    def forward(self, x, mask):
        """Attend over ``x``.

        Args:
            x: tensor unpacked as (batch, length, width).
            mask: per-sequence tensor reshaped to (batch, 1, 1, length); entries
                equal to 0 mark key positions that must not be attended to.

        Returns:
            Tensor with the same (batch, length, width) layout as ``x``.
        """
        batch, length, width = x.size()
        head_width = width // self.n_head

        def to_heads(tensor):
            # (batch, length, width) -> (batch, n_head, length, head_width)
            return tensor.view(batch, length, self.n_head, head_width).transpose(1, 2)

        query, key, value = map(to_heads, self.c_attn(x).split(self.n_embd, dim=2))

        scores = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(head_width))

        # A key is blocked when it lies in the future or is flagged 0 in ``mask``.
        future = self.bias[:, :, :length, :length] == 0
        padded = mask.view(mask.size(0), 1, 1, mask.size(1)) == 0
        scores = scores.masked_fill(future | padded, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        mixed = (weights @ value).transpose(1, 2).contiguous().view(batch, length, width)
        return self.c_proj(mixed)


class MLP(Module):
    """Feed-forward sub-layer: widen to 4x ``n_embd``, tanh-GELU, project back."""

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = Linear(config.n_embd, hidden_width)
        self.gelu = GELU(approximate="tanh")
        self.c_proj = Linear(hidden_width, config.n_embd)

    def forward(self, x):
        """Apply expand -> GELU -> project to the last dimension of ``x``."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(Module):
    """One transformer layer: attention and MLP, each normalised first and added back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x, mask):
        """Run the layer on ``x``; ``mask`` is forwarded unchanged to the attention."""
        after_attention = x + self.attn(self.ln_1(x), mask)
        return after_attention + self.mlp(self.ln_2(after_attention))


class GPT(Module):
    """Decoder-only transformer mapping token ids to next-token logits.

    The instance doubles as the ``config`` object handed to every ``Block``, so
    the hyper-parameters are stored as plain attributes before the layers are built.
    """

    def __init__(
        self,
        block_size=1024,
        vocab_size=50257,
        n_layer=12,
        n_head=12,
        n_embd=768,
    ):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd

        self.transformer = ModuleDict(
            {
                # Token and learned position embeddings.
                "wte": Embedding(self.vocab_size, self.n_embd),
                "wpe": Embedding(self.block_size, self.n_embd),
                "h": ModuleList([Block(self) for _ in range(self.n_layer)]),
                "ln_f": LayerNorm(self.n_embd),
            }
        )
        self.lm_head = Linear(self.n_embd, self.vocab_size, bias=False)

    def forward(self, idx, mask):
        """Return logits for every position of ``idx``.

        Args:
            idx: 2-D tensor of token ids; its second dimension may not exceed
                ``block_size``.
            mask: passed through to every block's attention.
        """
        _, length = idx.size()
        assert length <= self.block_size

        stack = self.transformer
        positions = torch.arange(0, length, dtype=torch.long, device=idx.device)
        hidden = stack.wte(idx) + stack.wpe(positions)
        for layer in stack.h:
            hidden = layer(hidden, mask)
        return self.lm_head(stack.ln_f(hidden))

In [None]:
def get_parameters(model):
    """Split the model's parameters into two optimizer groups.

    A parameter whose name contains any entry of ``NO_WEIGHT_DECAY`` goes into the
    second group (weight decay 0.0); every other parameter goes into the first
    group, which uses ``WEIGHT_DECAY``.
    """
    decayed, exempt = [], []

    for name, parameter in model.named_parameters():
        skip_decay = any(marker in name for marker in NO_WEIGHT_DECAY)
        target = exempt if skip_decay else decayed
        target.append(parameter)

    return [
        {"params": decayed, "weight_decay": WEIGHT_DECAY},
        {"params": exempt, "weight_decay": 0.0},
    ]

In [None]:
def predict(text, model, tokenizer):
    """Extend ``text`` by sampling one token at a time from ``model``.

    Sampling stops when the end-of-text token is drawn or when ``tokenize`` needs
    more than one chunk for the text, i.e. it no longer fits a single context.

    Args:
        text: prompt to continue.
        model: callable taking ``(tokens, mask)`` and returning logits indexed as
            ``[batch, position, vocabulary]``.
        tokenizer: object providing ``eot_token`` and ``decode`` (plus whatever
            ``tokenize`` needs).

    Returns:
        The prompt followed by the sampled continuation.
    """
    # Build inputs on the model's own device instead of assuming CUDA, so the
    # function also works for a model that lives on the CPU or another device.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else "cuda"

    while True:
        tokens_and_mask = torch.tensor(tokenize(text, tokenizer), device=device)
        # More than one chunk: the text plus its end marker outgrew the context.
        if tokens_and_mask.size(0) != 1:
            break

        # tokenize() appends the end-of-text id; drop it (and the padding) so the
        # model is asked to continue the text rather than to follow its end.
        length = tokens_and_mask[0, 1].count_nonzero() - 1
        tokens = tokens_and_mask[0, 0, :length].unsqueeze(0)
        mask = tokens_and_mask[0, 1, :length].unsqueeze(0)

        with torch.no_grad():
            logits = model(tokens, mask)[0, -1, :]

        probabilities = torch.softmax(logits, dim=0)
        # Top-k sampling; k is capped so vocabularies smaller than 50 still work.
        top_k = min(50, probabilities.size(0))
        probabilities, indices = torch.topk(probabilities, top_k)
        index = torch.multinomial(probabilities, 1)
        token = indices[index]
        if token == tokenizer.eot_token:
            break

        text += tokenizer.decode(token.tolist())

    return text

In [None]:
def get_loss(batch, loss_function, calculate_gradient, model=None):
    """Compute the next-token loss for one batch.

    Args:
        batch: mapping with ``"tokens"`` and ``"mask"`` tensors.
        loss_function: callable taking flattened logits and flattened target ids.
        calculate_gradient: when false the forward pass runs under
            ``torch.no_grad()``.
        model: network to run. When omitted, the notebook-level ``model`` is used,
            which is what every call did before this parameter existed.

    Returns:
        Whatever ``loss_function`` returns for the shifted predictions and targets.
    """
    if model is None:
        # Backward compatibility for three-argument callers.
        model = globals()["model"]

    tokens, mask = batch["tokens"], batch["mask"]

    if calculate_gradient:
        predictions = model(tokens, mask)
    else:
        with torch.no_grad():
            predictions = model(tokens, mask)

    # Position t predicts token t + 1: drop the last prediction and the first token.
    predictions = predictions[:, :-1, :]
    predictions = predictions.contiguous().view(-1, predictions.size(2))
    references = tokens[:, 1:].contiguous().view(-1)

    return loss_function(predictions, references)


def evaluate_model(model, data_loader, loss_function):
    """Return the mean loss of ``model`` over the first ``EVALUATE_STEPS`` batches.

    The model is switched to eval mode for the measurement and back to train mode
    afterwards, even if a batch raises.
    """
    model.eval()
    try:
        batches = itertools.islice(data_loader, EVALUATE_STEPS)
        # Evaluate the model that was passed in, not a notebook global.
        losses = [get_loss(batch, loss_function, False, model) for batch in batches]
        mean_loss = sum(losses) / len(losses)
    finally:
        model.train()

    return mean_loss.item()


def train_model(model, train, valid, loss_function, optimizer, scheduler):
    """Run one pass over ``train`` with gradient accumulation.

    Gradients of ``ACCUMULATION_STEPS`` consecutive batches are summed (each loss
    is pre-divided by that count) before the optimizer and scheduler are stepped.
    Every 100 batches the train and validation losses are printed.
    """

    def print_information(step):
        steps = len(train)
        train_loss = evaluate_model(model, train, loss_function)
        valid_loss = evaluate_model(model, valid, loss_function)

        a = f"Progress:   {step}/{steps}"
        b = f"Train loss: {train_loss}"
        c = f"Valid loss: {valid_loss}"

        print(a, b, c, sep="\n")

    def apply_update():
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    pending = 0  # micro-batches accumulated since the last optimizer update
    for step, batch in enumerate(train):
        loss = get_loss(batch, loss_function, True, model) / ACCUMULATION_STEPS
        loss.backward()
        pending += 1

        if step % 100 == 0:
            print_information(step)

        # Update only after a full window; the previous `step % N == 0` test
        # fired at step 0, after a single micro-batch.
        if pending == ACCUMULATION_STEPS:
            apply_update()
            pending = 0

    # Apply whatever is left so the final partial window is not discarded.
    if pending:
        apply_update()

In [None]:
def download_files():
    """Download the labeled /pol/ archive from Zenodo and unpack it into the
    current working directory.

    NOTE(review): the archive is zstd-compressed. As far as I know ``tarfile``
    only reads zstd from Python 3.14 on; on older interpreters the open call is
    expected to fail and the archive must be unpacked externally - confirm.
    """
    archive_name = "pol_0616-1119_labeled.tar.zst"
    request.urlretrieve(
        "https://zenodo.org/records/3606810/files/pol_0616-1119_labeled.tar.zst",
        archive_name,
    )

    # The context manager closes the archive even when extraction raises.
    with tarfile.open(archive_name) as archive:
        archive.extractall()


def parse_post(post):
    """Convert the HTML of a post into plain text.

    Line-break tags become newlines, every other tag is removed, and HTML
    entities are decoded last. Decoding last matters: text the author typed as
    ``<`` or ``>`` arrives escaped (``&lt;`` / ``&gt;``) and must not be mistaken
    for markup and stripped.

    Args:
        post: HTML string.

    Returns:
        The plain-text content of the post.
    """
    post = re.sub(r"<br\s*/?>", "\n", post, flags=re.IGNORECASE)

    # Repeat until stable: removing an inner tag can expose an outer one
    # (e.g. "<<b>i>").
    while True:
        post, removed = re.subn(r"<[^<>]*>", "", post)
        if not removed:
            break

    return html.unescape(post)


def collate(batch, device=None):
    """Stack ``(tokens, mask)`` pairs into two batch tensors.

    Args:
        batch: sequence of items whose first element is a list of token ids and
            whose second element is the matching mask list.
        device: where to create the tensors. Defaults to ``"cuda"`` when CUDA is
            available (the previous hard-coded behaviour) and ``"cpu"`` otherwise.

    Returns:
        ``{"tokens": tensor, "mask": tensor}``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokens = torch.tensor([item[0] for item in batch], device=device)
    mask = torch.tensor([item[1] for item in batch], device=device)

    return {"tokens": tokens, "mask": mask}


def get_data_loaders(tokenizer):
    """Build the training and validation loaders from the labeled ndjson dump.

    The first ``DATASET_ITEMS`` records are read; every post carrying a ``"com"``
    field is cleaned with ``parse_post``, empty results are dropped, and the rest
    is cut into padded chunks by ``tokenize``. The first ``TRAIN_VALID_SPLIT``
    share of the chunks is used for training (shuffled), the remainder for
    validation (in order).
    """
    frame = pandas.read_json(
        "pol_062016-112019_labeled.ndjson", lines=True, nrows=DATASET_ITEMS
    )

    samples = []
    for thread in frame["posts"]:
        for post in thread:
            if "com" not in post:
                continue
            text = parse_post(post["com"])
            if text:
                samples.extend(tokenize(text, tokenizer))

    boundary = int(len(samples) * TRAIN_VALID_SPLIT)
    train_loader = DataLoader(
        samples[:boundary], batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True
    )
    valid_loader = DataLoader(
        samples[boundary:], batch_size=BATCH_SIZE, collate_fn=collate, shuffle=False
    )

    return train_loader, valid_loader

In [None]:
def get_scheduler(data_loader, optimizer):
    """Create the learning-rate schedule: linear warm-up, then cosine decay.

    Args:
        data_loader: training loader; only its length is used.
        optimizer: optimizer whose learning rate is scheduled.

    Returns:
        A ``SequentialLR`` that switches from warm-up to cosine decay after
        ``WARMUP_STEPS`` scheduler steps.
    """
    # train_model steps the scheduler once per optimizer update, i.e. once every
    # ACCUMULATION_STEPS batches, so the horizon is counted in updates, not batches.
    updates = math.ceil(len(data_loader) / ACCUMULATION_STEPS)
    # Keep T_max positive even when the warm-up is longer than the whole run.
    decay_steps = max(1, updates - WARMUP_STEPS)

    # NOTE(review): start_factor is a multiplier on the optimizer's base lr, so the
    # warm-up begins at lr * LEARNING_RATE rather than at LEARNING_RATE - confirm
    # that this is intended.
    linear = LinearLR(optimizer, start_factor=LEARNING_RATE, total_iters=WARMUP_STEPS)
    cosine = CosineAnnealingLR(optimizer, T_max=decay_steps)

    return SequentialLR(
        optimizer=optimizer,
        schedulers=[linear, cosine],
        milestones=[WARMUP_STEPS],
    )

In [None]:
def tokenize(text, tokenizer):
    """Encode ``text`` and cut it into fixed-size chunks with attention masks.

    The end-of-text id is appended to the encoded text, which is then split into
    consecutive windows of ``CONTEXT_LENGTH`` ids. A short final window is
    right-padded with ``PADDING_TOKEN``.

    Returns:
        A list of ``(tokens, mask)`` tuples; both are lists of length
        ``CONTEXT_LENGTH`` and ``mask`` holds 1 for real ids and 0 for padding.
    """
    ids = tokenizer.encode(text) + [tokenizer.eot_token]

    chunks = []
    for start in range(0, len(ids), CONTEXT_LENGTH):
        window = ids[start : start + CONTEXT_LENGTH]
        missing = CONTEXT_LENGTH - len(window)
        padded = window + [PADDING_TOKEN] * missing
        mask = [1] * len(window) + [0] * missing
        chunks.append((padded, mask))

    return chunks

In [None]:
# Run configuration. These names are read as globals by the functions above.

# Batches whose gradients are accumulated per optimizer update (train_model).
ACCUMULATION_STEPS = 32
# Samples per batch for both data loaders.
BATCH_SIZE = 8
# Chunk length produced by tokenize(); also passed to GPT as block_size.
CONTEXT_LENGTH = 128
# Number of ndjson records read by get_data_loaders (nrows).
DATASET_ITEMS = 10
# Maximum number of batches evaluate_model averages over.
EVALUATE_STEPS = 10
# AdamW learning rate; get_scheduler also passes it as LinearLR's start_factor.
LEARNING_RATE = 5e-4
# Substrings of parameter names that are excluded from weight decay.
NO_WEIGHT_DECAY = ["wpe.weight", "bias", "ln"]
# Id used to right-pad short chunks; also the loss function's ignore_index.
# NOTE(review): this looks like an ordinary id inside the tokenizer's vocabulary,
# in which case real occurrences of it are ignored by the loss too - confirm.
PADDING_TOKEN = 43000
# Share of the chunks used for training; the rest is used for validation.
TRAIN_VALID_SPLIT = 0.95
# Scheduler steps of linear warm-up before the cosine schedule takes over.
WARMUP_STEPS = 1000
# Weight decay applied to the parameters not matched by NO_WEIGHT_DECAY.
WEIGHT_DECAY = 0.01

In [None]:
# Assemble everything the training and sampling cells below rely on.
tokenizer = tiktoken.encoding_for_model("gpt2")
train, valid = get_data_loaders(tokenizer)
# The model's context size matches the chunk length produced by tokenize().
model = GPT(block_size=CONTEXT_LENGTH, vocab_size=tokenizer.n_vocab).to("cuda")
# Two parameter groups: with and without weight decay (see get_parameters).
optimizer = AdamW(params=get_parameters(model), lr=LEARNING_RATE)
scheduler = get_scheduler(train, optimizer)
# Targets equal to PADDING_TOKEN do not contribute to the loss.
loss_function = CrossEntropyLoss(reduction="mean", ignore_index=PADDING_TOKEN)

In [None]:
%%time
# One pass over `train`; train and validation losses are printed every 100 batches.
train_model(model, train, valid, loss_function, optimizer, scheduler)

In [None]:
# Sample a continuation of the prompt "There" from the model.
predict("There", model, tokenizer)