In [46]:
import os

import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from lightning.pytorch.callbacks import ModelCheckpoint
from rich import print
from sklearn.datasets import fetch_20newsgroups
from torch.utils.data import DataLoader
from transformers import GPT2Config, GPT2Model, GPT2Tokenizer

%load_ext rich


The rich extension is already loaded. To reload it, use:
  %reload_ext rich


## Prepare dataset

### Load dataset

In [47]:

# Cache the 20 Newsgroups corpus in a "data" folder under the current working directory.
# (When running on Kaggle, "/kaggle/input/20newsgroups/" can be used as `path` instead.)
path = os.path.join(os.getcwd(), "data")
# Fetch the official train and test subsets, in that order, from the same cache location.
train, test = (
    fetch_20newsgroups(data_home=path, subset=subset_name)
    for subset_name in ("train", "test")
)


In [48]:
# Hyperparameters shared by the data pipeline and the model.
config = {
    "BATCH_SIZE": 16,       # samples per DataLoader batch
    "MAX_LENGTH": 512,      # tokens per document after padding/truncation
    "LEARNING_RATE": 1e-5,  # AdamW step size
    "N_EMBED": 768,         # hidden width used by the custom layers and the classifier
    "N_HEADS": 2,           # attention heads in the custom transformer block
    "N_BLOCKS": 12,         # passed to GPT2Config as n_layer
    "DROPOUT": 0.2,         # dropout probability in the custom layers
    "NUM_LABELS": 20        # number of target classes
}

# Pick the best backend that is actually present instead of toggling a
# hard-coded string by hand: CUDA first (e.g. Kaggle), then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


### Create Tokenizer and Data Transformation Method

In [49]:
# Load the tokenizer that belongs to the pretrained "gpt2" checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token, so padding="max_length"
# in the Dataset below has a token to pad with
tokenizer.pad_token = tokenizer.eos_token
# Register the same token as the "pad_token" special token
# NOTE(review): this looks redundant with the assignment above — confirm before removing
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})


# Define the Dataset class for the transformer model
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one raw document per lookup.

    Args:
        data: Sequence of raw text documents.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., max_length=...,
            truncation=..., return_tensors=...)``; its result is returned unchanged.
        max_length: Length every document is padded/truncated to.
        target: Labels, indexable with the same positions as ``data``.
    """

    def __init__(self, data, tokenizer, max_length, target):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Materialise the documents once so they can be indexed by position.
        self.texts = list(data)
        self.labels = target

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def classes(self):
        """Return the labels exactly as they were passed in."""
        return self.labels

    def __getitem__(self, index):
        """Return ``(encoded_text, label)`` for the document at ``index``.

        The text is encoded with the tokenizer and max_length given to the
        constructor (not with notebook-level globals), so differently
        configured datasets can coexist.
        """
        text_batch = self.tokenizer(
            self.texts[index],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # Wrap the label in a NumPy array so the default collate function can batch it.
        target_batch = np.array(self.labels[index])
        return text_batch, target_batch


### Process train test data

In [50]:
# Hold out 20% of the training documents for validation.
# The permutation comes from a seeded generator so that the split is identical on
# every run; an unseeded split silently moves documents between the two sets
# whenever this cell is re-executed.
SPLIT_SEED = 42
num_train_docs = len(train.data)
shuffled_idx = np.random.default_rng(SPLIT_SEED).permutation(num_train_docs)
train_idx, val_idx = np.split(shuffled_idx, [int(0.8 * num_train_docs)])

# Wrap each subset in the Dataset class defined earlier in the notebook.
train_dataset = Dataset([train.data[i] for i in train_idx], tokenizer, config['MAX_LENGTH'], train.target[train_idx])
val_dataset = Dataset([train.data[i] for i in val_idx], tokenizer, config['MAX_LENGTH'], train.target[val_idx])
test_dataset = Dataset(test.data, tokenizer, config['MAX_LENGTH'], test.target)

# Batch the datasets; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=config['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['BATCH_SIZE'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=config['BATCH_SIZE'], shuffle=False)


## Creating components of transformer model

In [67]:
# This class defines the attention layer in the transformer model.
class AttentionLayer(nn.Module):
    """One unmasked self-attention head.

    Args:
        head_size: Output width of the query/key/value projections.
        config: Mapping providing 'N_EMBED' (input width) and 'DROPOUT'.
    """

    def __init__(self, head_size, config):
        super().__init__()
        embed_dim = config['N_EMBED']
        # Bias-free linear maps from the embedding space into this head's subspace.
        self.query_layer = nn.Linear(embed_dim, head_size, bias=False)
        self.key_layer = nn.Linear(embed_dim, head_size, bias=False)
        self.value_layer = nn.Linear(embed_dim, head_size, bias=False)
        # Dropout is applied to the attention probabilities.
        self.dropout = nn.Dropout(config['DROPOUT'])

    def forward(self, x):
        """Attend over the second-to-last axis of a 3-D input and return the mixed values."""
        # Unpacking enforces a 3-D input; only the last (embedding) size is used below.
        _, _, embed_dim = x.shape

        queries = self.query_layer(x)
        keys = self.key_layer(x)
        values = self.value_layer(x)

        # Pairwise query/key similarities, scaled by the input embedding width.
        # NOTE(review): the usual scale is head_size ** -0.5; kept as-is because
        # changing it would alter the behaviour of already-trained weights.
        scores = queries @ keys.transpose(-2, -1) * (embed_dim ** -0.5)

        # Normalise over the key axis, then regularise the probabilities.
        attn = self.dropout(F.softmax(scores, dim=-1))

        return attn @ values

# This class defines the multi-head attention layer in the transformer model.


class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and mixes their concatenated outputs.

    Args:
        n_heads: Number of AttentionLayer heads to create.
        head_size: Output width of each head.
        config: Mapping providing 'N_EMBED' (model width) and 'DROPOUT'.
    """

    def __init__(self, n_heads, head_size, config):
        super().__init__()
        # Independent heads, each projecting into its own head_size-wide subspace.
        self.heads = nn.ModuleList(
            [AttentionLayer(head_size, config) for _ in range(n_heads)])
        # The concatenated head outputs are n_heads * head_size wide. That equals
        # N_EMBED only when the head count divides it evenly, so size the projection
        # from the real width; the shape is unchanged in the evenly divisible case.
        self.proj = nn.Linear(n_heads * head_size, config['N_EMBED'])
        # Dropout on the projected output.
        self.dropout = nn.Dropout(config['DROPOUT'])

    def forward(self, x):
        """Concatenate all head outputs on the last axis, project to N_EMBED, apply dropout."""
        concatenated = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.dropout(self.proj(concatenated))

# This class defines the feed forward layer in the transformer model.


class FeedForward(nn.Module):
    """Two-layer MLP: expand, GELU, project back to the model width, dropout.

    Args:
        config: Mapping providing 'N_EMBED' (model width) and 'DROPOUT'.
        scale_factor: Multiplier for the hidden width (hidden = scale_factor * N_EMBED).
    """

    def __init__(self, config, scale_factor=1):
        super().__init__()
        embed_dim = config['N_EMBED']
        hidden_dim = scale_factor * embed_dim
        # Layer order fixes the Sequential indices (net.0, net.2 hold the weights).
        layers = [
            nn.Linear(embed_dim, hidden_dim),   # expansion
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),   # projection back to model width
            nn.Dropout(config['DROPOUT']),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last axis of ``x``."""
        return self.net(x)

# This class defines the transformer block in the transformer model.


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention then feed-forward, each with a residual.

    Args:
        n_embed: Model width used by the layer norms and both sublayers.
        n_heads: Number of attention heads; each head is ``n_embed // n_heads`` wide.
        block_config: Optional mapping providing at least 'DROPOUT'. When omitted,
            the notebook-level ``config`` dict is used, as before.
    """

    def __init__(self, n_embed, n_heads, block_config=None):
        super().__init__()
        base_config = config if block_config is None else block_config
        # The sublayers read their width from a mapping, so hand them one whose
        # 'N_EMBED' is this block's own n_embed. Passing the bare integer to
        # FeedForward would fail when it subscripts its config argument.
        layer_config = {**base_config, 'N_EMBED': n_embed}

        # Self-attention sublayer and its layer norm.
        self.sa_heads = MultiHeadAttention(n_heads, n_embed // n_heads, layer_config)
        self.ln1 = nn.LayerNorm(n_embed)

        # Feed-forward sublayer (4x hidden expansion) and its layer norm.
        self.ffwd = FeedForward(layer_config, 4)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        """Apply both sublayers to ``x``, normalising before each and adding a residual after."""
        x = x + self.sa_heads(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [55]:
# This class defines the transformer classifier model.
class TransformerClassifier(L.LightningModule):
    """GPT-2 backbone followed by one custom transformer block and a linear classifier.

    Args:
        hidden_size: Width of the GPT-2 hidden states and of the custom block.
        num_classes: Number of output classes.
        max_seq_len: Sequence length of every input; the classifier consumes the
            flattened ``hidden_size * max_seq_len`` features, so inputs must be
            padded/truncated to exactly this length.
        n_heads: Number of heads in the custom transformer block.
        n_layers: Passed to ``GPT2Config`` as ``n_layer``.
        lr: Learning rate for AdamW.
    """

    def __init__(self, hidden_size, num_classes, max_seq_len, n_heads, n_layers, lr=1e-5):
        super().__init__()

        # Pretrained "gpt2" weights, with the number of layers set by n_layers.
        self.gpt2config = GPT2Config.from_pretrained("gpt2", n_layer=n_layers)
        self.gpt2 = GPT2Model.from_pretrained("gpt2", config=self.gpt2config)

        # Extra transformer block stacked on top of the GPT-2 hidden states.
        self.tf_block = TransformerBlock(hidden_size, n_heads)
        # Linear head over all positions' features concatenated together.
        self.classifier = nn.Linear(hidden_size * max_seq_len, num_classes)
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        """Return class logits with one row per input sequence."""
        gpt_out = self.gpt2(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=True
        ).last_hidden_state

        gpt_out = self.tf_block(gpt_out)

        # flatten() gives the same result as view(batch, -1) but does not require
        # the tensor to be contiguous.
        logits = self.classifier(gpt_out.flatten(start_dim=1))
        return logits

    def _shared_step(self, batch, stage):
        """Run one forward pass, log ``{stage}_loss`` / ``{stage}_acc``, return the pieces.

        Returns:
            Tuple ``(loss, logits, input_ids, y)`` so callers can inspect predictions.
        """
        x, y = batch
        # The tokenizer emits a leading dimension of size 1 per sample; drop it
        # from BOTH tensors so ids and mask keep matching shapes.
        # Tensors follow the module's own device instead of a notebook-level
        # global, which breaks as soon as the Trainer runs on a different device.
        input_ids = x["input_ids"].squeeze(1).to(self.device)
        attention_mask = x["attention_mask"].squeeze(1).to(self.device)
        y = y.to(self.device).long()

        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).float().mean()

        self.log(f"{stage}_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log(f"{stage}_acc", acc, on_step=True, on_epoch=True, prog_bar=True)

        return loss, logits, input_ids, y

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss/accuracy for one batch; return the loss."""
        loss, _, _, _ = self._shared_step(batch, "train")
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation metrics; print one random sample from the first batch."""
        loss, logits, input_ids, y = self._shared_step(batch, "val")

        # Qualitative spot check once per validation run.
        # Decoding relies on the notebook-level `tokenizer`.
        if batch_idx == 0:
            r_idx = np.random.randint(0, len(y))  # Random index

            print(
                f"Input: {tokenizer.decode(input_ids[r_idx], skip_special_tokens=True)}"
            )
            print(f"Label: {y[r_idx]}")
            print(f"Prediction: {logits.argmax(1)[r_idx]}")

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss/accuracy for one batch; return the loss."""
        loss, _, _, _ = self._shared_step(batch, "test")
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at the configured learning rate."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer



## Create the transformer model

The model is initialized below. A PyTorch Lightning `Trainer` is also created to train the model.

In [68]:
# Build the classifier from the shared hyperparameters; keyword arguments make
# the mapping from config keys to constructor parameters explicit.
model = TransformerClassifier(
    hidden_size=config['N_EMBED'],
    num_classes=config['NUM_LABELS'],
    max_seq_len=config['MAX_LENGTH'],
    n_heads=config['N_HEADS'],
    n_layers=config['N_BLOCKS'],
    lr=config['LEARNING_RATE'],
)

In [14]:
# Let Lightning choose the accelerator and device count for the current machine
# (all CUDA GPUs on Kaggle, the single MPS device on a Mac, CPU otherwise).
# A hard-coded accelerator='gpu', devices=2 only works on a two-GPU host.
trainer = L.Trainer(
    accelerator="auto",
    devices="auto",
    max_epochs=5,
    # Track the checkpoint with the highest validation accuracy.
    callbacks=[ModelCheckpoint(monitor="val_acc", mode="max")],
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# Train on the training loader and validate on the held-out loader.
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)


  | Name       | Type             | Params
------------------------------------------------
0 | gpt2       | GPT2Model        | 124 M 
1 | tf_block   | TransformerBlock | 7.1 M 
2 | classifier | Linear           | 7.9 M 
------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.559   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

The model was trained on Kaggle using a GPU, with the same code shown in this notebook. The model weights were saved after training and are loaded below to make predictions on the test data.

In [84]:
# Rebuild the classifier with the same hyperparameters and place it on `device`.
model = TransformerClassifier(
    hidden_size=config['N_EMBED'],
    num_classes=config['NUM_LABELS'],
    max_seq_len=config['MAX_LENGTH'],
    n_heads=config['N_HEADS'],
    n_layers=config['N_BLOCKS'],
    lr=config['LEARNING_RATE'],
).to(device)

# Restore the trained weights from the saved checkpoint file.
# (On Kaggle the checkpoint was read from "/kaggle/working/model/.ckpt" instead.)
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
ckpt_path = "./epoch=4-step=1415.ckpt"
state_dict = torch.load(ckpt_path, map_location=device)
model.load_state_dict(state_dict["state_dict"])

[1m<[0m[1;95mAll[0m[39m keys matched successfully[0m[1m>[0m

In [91]:

# Evaluate the restored model on the held-out test split.
trainer.test(model, dataloaders=test_loader)

/Users/yyy/Library/Python/3.9/lib/python/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

/Users/yyy/Library/Python/3.9/lib/python/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:

# Pull one batch from the test loader.
X, y = next(iter(test_loader))

# Position inside the batch of the sample to inspect (1 = second sample).
sample_idx = 1

# Inference only: eval() switches dropout off so the prediction is deterministic,
# and no_grad() avoids building an autograd graph.
model.eval()
with torch.no_grad():
    logits = model(
        X["input_ids"][sample_idx].to(device),
        X["attention_mask"][sample_idx].to(device),
    )

# Show the decoded input text next to its actual and predicted labels.
print(tokenizer.decode(X["input_ids"][sample_idx][0], skip_special_tokens=True))
print(f"Actual Label: {y[sample_idx]}")
print(f"Predicted Label: {logits.argmax(1).item()}")

In [72]:
# Display the model's repr to inspect its layer hierarchy.
model


[1;35mTransformerClassifier[0m[1m([0m
  [1m([0mgpt2[1m)[0m: [1;35mGPT2Model[0m[1m([0m
    [1m([0mwte[1m)[0m: [1;35mEmbedding[0m[1m([0m[1;36m50257[0m, [1;36m768[0m[1m)[0m
    [1m([0mwpe[1m)[0m: [1;35mEmbedding[0m[1m([0m[1;36m1024[0m, [1;36m768[0m[1m)[0m
    [1m([0mdrop[1m)[0m: [1;35mDropout[0m[1m([0m[33mp[0m=[1;36m0[0m[1;36m.1[0m, [33minplace[0m=[3;91mFalse[0m[1m)[0m
    [1m([0mh[1m)[0m: [1;35mModuleList[0m[1m([0m
      [1m([0m[1;36m0[0m-[1;36m11[0m[1m)[0m: [1;36m12[0m x [1;35mGPT2Block[0m[1m([0m
        [1m([0mln_1[1m)[0m: [1;35mLayerNorm[0m[1m([0m[1m([0m[1;36m768[0m,[1m)[0m, [33meps[0m=[1;36m1e[0m[1;36m-05[0m, [33melementwise_affine[0m=[3;92mTrue[0m[1m)[0m
        [1m([0mattn[1m)[0m: [1;35mGPT2Attention[0m[1m([0m
          [1m([0mc_attn[1m)[0m: [1;35mConv1D[0m[1m([0m[1m)[0m
          [1m([0mc_proj[1m)[0m: [1;35mConv1D[0m[1m([0m[1m)[0m
          [1m(

In [82]:
# Display the model's repr to inspect its layer hierarchy.
# NOTE(review): an identical display cell appears just before this one — confirm whether both are needed.
model


[1;35mTransformerClassifier[0m[1m([0m
  [1m([0mgpt2[1m)[0m: [1;35mGPT2Model[0m[1m([0m
    [1m([0mwte[1m)[0m: [1;35mEmbedding[0m[1m([0m[1;36m50257[0m, [1;36m768[0m[1m)[0m
    [1m([0mwpe[1m)[0m: [1;35mEmbedding[0m[1m([0m[1;36m1024[0m, [1;36m768[0m[1m)[0m
    [1m([0mdrop[1m)[0m: [1;35mDropout[0m[1m([0m[33mp[0m=[1;36m0[0m[1;36m.1[0m, [33minplace[0m=[3;91mFalse[0m[1m)[0m
    [1m([0mh[1m)[0m: [1;35mModuleList[0m[1m([0m
      [1m([0m[1;36m0[0m-[1;36m11[0m[1m)[0m: [1;36m12[0m x [1;35mGPT2Block[0m[1m([0m
        [1m([0mln_1[1m)[0m: [1;35mLayerNorm[0m[1m([0m[1m([0m[1;36m768[0m,[1m)[0m, [33meps[0m=[1;36m1e[0m[1;36m-05[0m, [33melementwise_affine[0m=[3;92mTrue[0m[1m)[0m
        [1m([0mattn[1m)[0m: [1;35mGPT2Attention[0m[1m([0m
          [1m([0mc_attn[1m)[0m: [1;35mConv1D[0m[1m([0m[1m)[0m
          [1m([0mc_proj[1m)[0m: [1;35mConv1D[0m[1m([0m[1m)[0m
          [1m(