In [None]:
from accelerate import Accelerator
from urllib import request
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from torch.nn import (
    Linear,
    Embedding,
    ModuleList,
    Module,
    LayerNorm,
    GELU,
    ModuleDict,
    CrossEntropyLoss,
)
import torch
import math
import re
import pandas
import tarfile
import tiktoken
import html

In [None]:
class CausalSelfAttention(Module):
    """Multi-head self-attention with a causal mask and a per-key padding mask.

    `config` must expose `n_embd`, `n_head` and `block_size`; `n_embd` has to
    be divisible by `n_head`.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One projection produces queries, keys and values side by side.
        self.c_attn = Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular ones: entry (i, j) is 1 when position i may see j.
        causal = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer(
            "bias", causal.view(1, 1, config.block_size, config.block_size)
        )

    def forward(self, x, mask):
        """Attend over `x` (3-D: batch, time, channels).

        `mask` is 2-D (batch, time); key positions where it equals 0 are
        hidden from every query. Returns a tensor with the shape of `x`.
        """
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def split_heads(tensor):
            # (batch, time, width) -> (batch, heads, time, head_dim)
            return tensor.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        query, key, value = (
            split_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2)
        )

        scores = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))

        # A score is blocked when it looks ahead or when its key is masked out.
        looks_ahead = self.bias[:, :, :seq_len, :seq_len] == 0
        key_hidden = mask.view(mask.size(0), 1, 1, mask.size(1)) == 0
        scores = scores.masked_fill(looks_ahead | key_hidden, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        merged = (weights @ value).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.c_proj(merged)


class MLP(Module):
    """Position-wise feed-forward network: expand 4x, tanh-GELU, project back.

    `config` must expose `n_embd`.
    """

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = Linear(config.n_embd, hidden_width)
        self.gelu = GELU(approximate="tanh")
        self.c_proj = Linear(hidden_width, config.n_embd)

    def forward(self, x):
        """Apply the three layers in sequence; the last dimension is preserved."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(Module):
    """Pre-norm transformer block: attention and MLP, each behind a residual.

    `config` is forwarded unchanged to `CausalSelfAttention` and `MLP`.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x, mask):
        """Return `x` after the attention and feed-forward residual updates."""
        after_attention = x + self.attn(self.ln_1(x), mask)
        return after_attention + self.mlp(self.ln_2(after_attention))


class GPT(Module):
    """Decoder-only transformer language model.

    The instance doubles as the configuration object handed to every `Block`,
    which is why the hyper-parameters are stored as attributes before the
    sub-modules are built.
    """

    def __init__(
        self,
        block_size=1024,
        vocab_size=50257,
        n_layer=12,
        n_head=12,
        n_embd=768,
    ):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd

        token_embedding = Embedding(self.vocab_size, self.n_embd)
        position_embedding = Embedding(self.block_size, self.n_embd)
        layers = ModuleList([Block(self) for _ in range(self.n_layer)])
        final_norm = LayerNorm(self.n_embd)
        self.transformer = ModuleDict(
            {
                "wte": token_embedding,
                "wpe": position_embedding,
                "h": layers,
                "ln_f": final_norm,
            }
        )
        self.lm_head = Linear(self.n_embd, self.vocab_size, bias=False)

    def forward(self, idx, mask):
        """Return next-token logits for every position of `idx`.

        `idx` is a 2-D tensor of token ids (batch, time) with at most
        `block_size` columns; `mask` is passed through to every block.
        """
        _, seq_len = idx.size()
        assert seq_len <= self.block_size

        positions = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        # Positional embeddings have no batch dimension and broadcast over it.
        hidden = self.transformer.wte(idx) + self.transformer.wpe(positions)
        for layer in self.transformer.h:
            hidden = layer(hidden, mask)
        return self.lm_head(self.transformer.ln_f(hidden))

In [None]:
def get_parameters(model):
    """Split the model's parameters into two optimizer groups.

    A parameter whose name contains any fragment listed in `NO_WEIGHT_DECAY`
    goes into the second group (weight decay 0.0); every other parameter goes
    into the first group (weight decay `WEIGHT_DECAY`).
    """
    decayed, exempt = [], []

    for name, parameter in model.named_parameters():
        is_exempt = any(fragment in name for fragment in NO_WEIGHT_DECAY)
        target = exempt if is_exempt else decayed
        target.append(parameter)

    return [
        {"params": decayed, "weight_decay": WEIGHT_DECAY},
        {"params": exempt, "weight_decay": 0.0},
    ]

In [None]:
def predict(text, model, tokenizer, top_k=50):
    """Extend `text` by sampling one token at a time from `model`.

    Each round re-encodes the whole text with `tokenize`, feeds it to the
    model, and samples the next token from the `top_k` most likely candidates.

    Args:
        text: Prompt to continue; must encode to at least one token.
        model: Callable as `model(tokens, masks)` returning logits indexed as
            (batch, time, vocab).
        tokenizer: Provides `eot_token` and `decode(list_of_ids)`.
        top_k: Number of candidates to sample from; clamped to the vocabulary
            size.

    Returns:
        The prompt followed by the generated text. Generation stops when the
        end-of-text token is sampled or when the text no longer fits in a
        single chunk produced by `tokenize`.

    Raises:
        ValueError: If the prompt encodes to zero tokens.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    while True:
        encoded = torch.tensor(tokenize(text), device=device)
        if encoded.shape[0] != 1:
            # The text spilled into a second chunk: the context is full.
            break

        ids_row, mask_row = encoded[0, 0], encoded[0, 1]
        # tokenize() appends an end-of-text token; drop it from the context.
        length = int(mask_row.count_nonzero()) - 1
        if length < 1:
            raise ValueError("predict() needs a prompt with at least one token")

        tokens = ids_row[:length].unsqueeze(0)
        masks = mask_row[:length].unsqueeze(0)

        with torch.no_grad():
            logits = model(tokens, masks)[0, -1, :]

        probabilities = torch.softmax(logits.float(), dim=0)
        candidates = min(top_k, probabilities.size(0))
        probabilities, indices = torch.topk(probabilities, candidates)
        # multinomial accepts unnormalised weights, so no renormalisation.
        index = torch.multinomial(probabilities, 1)
        token = indices[index]
        if token == tokenizer.eot_token:
            break

        text += tokenizer.decode(token.tolist())

    return text

In [None]:
def evaluate(model, eval_dataloader, loss_fn, accelerator):
    """Compute the mean next-token loss over `eval_dataloader`.

    The model is switched to eval mode for the pass and back to train mode
    before returning.

    Args:
        model: Callable as `model(batch, mask)` returning logits indexed as
            (batch, time, vocab).
        eval_dataloader: Iterable of `(batch, mask)` pairs.
        loss_fn: Called as `loss_fn(flat_logits, flat_targets)`.
        accelerator: Provides `gather(tensor)`.

    Returns:
        The average of the per-batch losses as a Python float, or NaN when the
        dataloader yields no batches.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch, mask in eval_dataloader:
            # Position t predicts token t + 1: drop the last logit and the
            # first target so the two line up.
            logits = model(batch, mask)[..., :-1, :].contiguous()
            logits = logits.view(-1, logits.size(-1))
            targets = batch[..., 1:].contiguous().view(-1)

            loss = loss_fn(logits, targets)
            # gather() may hand back one value per process; collapse them so
            # every entry of batch_losses is a scalar.
            gathered = accelerator.gather(loss)
            batch_losses.append(gathered.float().mean())

    model.train()

    if not batch_losses:
        return float("nan")
    return torch.stack(batch_losses).mean().item()


def eval(step, model, eval_dataloader, train_dataloader, loss_fn, accelerator):
    """Print a three-line progress report for training step `step`.

    Note that this definition shadows Python's builtin `eval` for the rest of
    the notebook.
    """
    # Evaluating on the training set is switched off; 0 is reported instead.
    training_loss = 0
    validation_loss = evaluate(model, eval_dataloader, loss_fn, accelerator)
    total_steps = len(train_dataloader)

    report = "\n".join(
        (
            f"Progress:        {step}/{total_steps}",
            f"Training loss:   {training_loss}",
            f"Validation loss: {validation_loss}",
        )
    )
    print(report)


def train(
    train_dataloader,
    eval_dataloader,
    model,
    loss_fn,
    accelerator,
    optimizer,
    lr_scheduler,
):
    """Run one pass over `train_dataloader` with gradient accumulation.

    Gradients are accumulated over `ACC` micro-batches; on every `ACC`-th
    micro-batch they are clipped to norm 1.0 and the optimizer and learning
    rate scheduler take a step. A report is printed before the first batch,
    after every `INFO_N` micro-batches, and once more at the end.
    """

    def report(at_step):
        eval(at_step, model, eval_dataloader, train_dataloader, loss_fn, accelerator)

    report(0)

    for micro_step, (batch, mask) in enumerate(train_dataloader, start=1):
        # Position t predicts token t + 1, so targets are the inputs shifted
        # left by one and the final logit has nothing to be compared with.
        targets = batch[..., 1:].contiguous().view(-1)
        logits = model(batch, mask)[..., :-1, :].contiguous()
        logits = logits.view(-1, logits.size(-1))

        # Scale so the accumulated gradient is an average over ACC batches.
        scaled_loss = loss_fn(logits, targets) / ACC
        accelerator.backward(scaled_loss)

        accumulation_complete = micro_step % ACC == 0
        if accumulation_complete:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        report_due = micro_step % INFO_N == 0
        if report_due:
            report(micro_step)

    report(len(train_dataloader))

In [None]:
class PandasDataset(Dataset):
    """Dataset over an indexable collection of `[token_ids, mask]` records."""

    def __init__(self, df):
        # Despite the name, anything supporting len() and integer indexing works.
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return record `idx` as a `(tokens, mask)` pair of int64 tensors."""
        record = self.df[idx]
        return (
            torch.tensor(record[0], dtype=torch.long),
            torch.tensor(record[1], dtype=torch.long),
        )


def parse(string):
    """Convert an HTML comment body to plain text.

    Line-break tags become newlines, every other tag is removed, and HTML
    entities are decoded last. Decoding last matters: text the author typed
    literally (stored as `&lt;...&gt;`) must not be mistaken for markup and
    deleted together with the real tags.

    Args:
        string: HTML fragment.

    Returns:
        The fragment as plain text.
    """
    # Accept <br>, <br/>, <br /> and upper-case spellings.
    string = re.sub(r"<br\s*/?>", "\n", string, flags=re.IGNORECASE)

    # Removing a tag can expose a new one (e.g. "<<b>i>"), so repeat until a
    # pass removes nothing.
    tag = re.compile(r"<[^<>]*>")
    while True:
        string, removed = tag.subn("", string)
        if removed == 0:
            break

    return html.unescape(string)


def download_files():
    """Download the labelled archive from Zenodo and unpack it into the CWD.

    NOTE(review): the archive is zstd-compressed. `tarfile` can only open it
    on interpreters that ship zstd support (presumably Python 3.14+); on older
    versions `tarfile.open` is expected to raise `tarfile.ReadError` — confirm
    against the target runtime.
    """
    archive_name = "pol_0616-1119_labeled.tar.zst"
    request.urlretrieve(
        "https://zenodo.org/records/3606810/files/pol_0616-1119_labeled.tar.zst",
        archive_name,
    )
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(archive_name) as archive:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape the destination directory.
            archive.extractall(filter="data")
        else:
            archive.extractall()


def get_dataloader():
    """Build the training and validation dataloaders from the ndjson dump.

    Reads the first `ROWS` lines of the file, parses every post comment to
    plain text, drops empty ones, and tokenizes the rest into fixed-length
    chunks. The first `SPLIT` fraction of the chunks is used for training
    (shuffled), the remainder for validation (in order).

    Returns:
        A `(train_loader, valid_loader)` pair with batch size `BS`.
    """
    frame = pandas.read_json("pol_062016-112019_labeled.ndjson", lines=True, nrows=ROWS)

    comments = []
    for thread in frame["posts"]:
        for post in thread:
            if "com" not in post:
                continue
            text = parse(post["com"])
            if len(text) != 0:
                comments.append(text)

    chunks = []
    for text in comments:
        chunks.extend(tokenize(text))

    boundary = int(len(chunks) * SPLIT)
    train_set = PandasDataset(chunks[:boundary])
    valid_set = PandasDataset(chunks[boundary:])

    train_loader = DataLoader(train_set, batch_size=BS, shuffle=True)
    valid_loader = DataLoader(valid_set, batch_size=BS, shuffle=False)
    return train_loader, valid_loader

In [None]:
def tokenize(text, pad_token=43000, context_length=None, encoding=None):
    """Encode `text` into fixed-length chunks with matching padding masks.

    The text is encoded, an end-of-text token is appended, and the ids are cut
    into consecutive windows of `context_length`. A short final window is
    padded on the right with `pad_token`.

    Args:
        text: String to encode.
        pad_token: Id used for padding. The default matches the
            `ignore_index` the notebook passes to its loss function.
        context_length: Window size; defaults to the module-level
            `CONTEXT_LENGTH`.
        encoding: Object with `encode(text)` and `eot_token`; defaults to the
            module-level `tokenizer`.

    Returns:
        A list of `[token_ids, mask]` pairs, each list `context_length` long.
        The mask holds 1 for real tokens and 0 for padding.
    """
    if context_length is None:
        context_length = CONTEXT_LENGTH
    if encoding is None:
        encoding = tokenizer

    ids = encoding.encode(text) + [encoding.eot_token]

    chunks = []
    for start in range(0, len(ids), context_length):
        window = ids[start : start + context_length]
        padding = context_length - len(window)
        chunks.append(
            [
                window + [pad_token] * padding,
                [1] * len(window) + [0] * padding,
            ]
        )
    return chunks

In [None]:
# --- Data ---
ROWS = 100            # lines read from the ndjson dump
SPLIT = 0.95          # fraction of chunks used for training
CONTEXT_LENGTH = 128  # tokens per chunk; also the model's block size

# --- Optimisation ---
BS = 8                # micro-batch size
ACC = 32              # micro-batches accumulated per optimizer step
LR = 5e-4
WARMUP_STEPS = 1000
WEIGHT_DECAY = 0.01
# Parameters whose name contains any of these fragments get no weight decay.
NO_WEIGHT_DECAY = ["wpe.weight", "bias", "ln"]

# --- Reporting ---
INFO_N = 400          # micro-batches between progress reports

In [None]:
# Build everything train() needs: data, model, optimizer, schedule and loss.
tokenizer = tiktoken.encoding_for_model("gpt2")

train_dl, valid_dl = get_dataloader()

model = GPT(block_size=CONTEXT_LENGTH, vocab_size=tokenizer.n_vocab)

params = get_parameters(model)

optimizer = AdamW(params=params, lr=LR)

accelerator = Accelerator(mixed_precision="fp16")

model, optimizer, train_dl, valid_dl = accelerator.prepare(
    model, optimizer, train_dl, valid_dl
)

# train() steps the scheduler once per optimizer update, i.e. once every ACC
# micro-batches, so the schedule length is counted in updates, not batches.
total_updates = len(train_dl) // ACC

# NOTE(review): start_factor is a multiplier on the base learning rate, so
# warm-up begins at LR * LR — confirm that this is the intended start value.
warmup = LinearLR(optimizer, start_factor=LR, total_iters=WARMUP_STEPS)
# Clamp to 1 so a short run never yields a zero or negative cosine period.
scheduler = CosineAnnealingLR(optimizer, T_max=max(1, total_updates - WARMUP_STEPS))
lr_scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup, scheduler],
    milestones=[WARMUP_STEPS],
)

# 43000 is the id tokenize() pads with, so padded targets add nothing to the
# loss. NOTE(review): 43000 looks like a valid vocabulary id as well, in which
# case genuine occurrences of that token are ignored too — confirm.
loss_fn = CrossEntropyLoss(reduction="mean", ignore_index=43000)

In [None]:
%%time
# Single training pass; arguments are spelled out because train() takes seven.
train(
    train_dataloader=train_dl,
    eval_dataloader=valid_dl,
    model=model,
    loss_fn=loss_fn,
    accelerator=accelerator,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
)

In [None]:
# Sample a continuation of a short prompt; the cell displays the result.
predict(text="There", model=model, tokenizer=tokenizer)