In [1]:
import os

!pip install lightning
import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from lightning.pytorch.callbacks import ModelCheckpoint
from rich import print
from sklearn.datasets import fetch_20newsgroups
from torch.utils.data import DataLoader
from transformers import GPT2Config, GPT2Model, GPT2Tokenizer

%load_ext rich

Collecting lightning
  Downloading lightning-2.2.5-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading lightning-2.2.5-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.2.5


## Prepare dataset

### Load dataset

In [2]:

# path = os.path.join(os.getcwd(),"data")
path = "/kaggle/input/20newsgroups/"
train = fetch_20newsgroups(data_home=path, subset="train")
test = fetch_20newsgroups(data_home=path, subset="test")


In [3]:
# Configuration parameters for model training
config = {
    "BATCH_SIZE": 16,
    "MAX_LENGTH": 512,
    "LEARNING_RATE": 1e-5,
    "N_EMBED": 768,
    "N_HEADS": 2,
    "N_BLOCKS": 12,
    "DROPOUT": 0.2,
    "NUM_LABELS": 20
}

device = "cuda"
# device = "mps" if torch.backends.mps.is_available() else "cpu"


### Create Tokenizer and Data Transformation Method

In [4]:
# Initialize the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Set the pad token to the end of sentence token
tokenizer.pad_token = tokenizer.eos_token
# Add the pad token to the special tokens
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})


# Define the Dataset class for the transformer model
class Dataset(torch.utils.data.Dataset):
    # Initialize the Dataset class
    def __init__(self, data, tokenizer, max_length, target):
        # Set the data, tokenizer, max_length, texts, and labels attributes
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = [text for text in data]  # Corrected attribute name from self.text to self.texts
        self.labels = target

    # Define the length method for the Dataset class
    def __len__(self):
        return len(self.data)

    # Define the classes method for the Dataset class
    def classes(self):
        return self.labels

    # Define the getitem method for the Dataset class
    def __getitem__(self, index):
        # Tokenize the text at the given index
        text_batch = tokenizer(
            self.texts[index],
            padding="max_length",
            max_length=config["MAX_LENGTH"],
            truncation=True,
            return_tensors="pt",
        )
        # Get the target label at the given index
        target_batch = np.array(self.labels[index])
        # Return the tokenized text and the target label
        return text_batch, target_batch


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Process train test data

In [5]:
# Split the data into training and validation sets
train_idx, val_idx = np.split(
    np.random.permutation(len(train.data)), [int(0.8 * len(train.data))]
)

# Create datasets for training, validation, and testing
# The Dataset class is defined earlier in the code
train_dataset = Dataset([train.data[i] for i in train_idx], tokenizer, config['MAX_LENGTH'], train.target[train_idx])
val_dataset = Dataset([train.data[i] for i in val_idx], tokenizer, config['MAX_LENGTH'], train.target[val_idx])
test_dataset = Dataset(test.data, tokenizer, config['MAX_LENGTH'], test.target)

# Create data loaders for training, validation, and testing
# The DataLoader class is from PyTorch's torch.utils.data module
train_loader = DataLoader(train_dataset, batch_size=config['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['BATCH_SIZE'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=config['BATCH_SIZE'], shuffle=False)


## Creating components of transformer model

In [6]:
# This class defines the attention layer in the transformer model.
class AttentionLayer(nn.Module):
    def __init__(self, head_size, config):
        # Initialize the attention layer
        super().__init__()
        # Define the query, key, and value layers
        self.query_layer = nn.Linear(config["N_EMBED"], head_size, bias=False)
        self.key_layer = nn.Linear(config["N_EMBED"], head_size, bias=False)
        self.value_layer = nn.Linear(config["N_EMBED"], head_size, bias=False)
        # Define the dropout layer
        self.dropout = nn.Dropout(config["DROPOUT"])

    def forward(self, x):
        # Get the batch size, sequence length, and embedding size
        B, T, C = x.shape

        # Compute the query, key, and value matrices
        q = self.query_layer(x)
        k = self.key_layer(x)
        v = self.value_layer(x)

        # Compute the attention weights
        weights = q @ k.transpose(-2, -1) * (C**-0.5)

        # Apply softmax and dropout to the attention weights
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        # Compute the output
        out = weights @ v
        return out
    
# This class defines the multi-head attention layer in the transformer model.
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size, config):
        # Initialize the multi-head attention layer
        super().__init__()
        # Define the attention heads
        self.heads = nn.ModuleList([AttentionLayer(head_size, config) for _ in range(n_heads)])
        # Define the projection layer
        self.proj = nn.Linear(config["N_EMBED"], config["N_EMBED"])
        # Define the dropout layer
        self.dropout = nn.Dropout(config["DROPOUT"])

    def forward(self, x):
        # Compute the output of each attention head
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        # Apply the projection layer
        out = self.proj(out)
        # Apply dropout
        out = self.dropout(out)
        return out

# This class defines the feed forward layer in the transformer model.
class FeedForward(nn.Module):
    def __init__(self, config, scale_factor=1):
        # Initialize the feed forward layer
        super().__init__()
        # Define the feed forward network
        self.net = nn.Sequential(
            nn.Linear(config["N_EMBED"], scale_factor * config["N_EMBED"]),
            nn.GELU(),
            nn.Linear(scale_factor * config["N_EMBED"], config["N_EMBED"]),  # Projection layer
            nn.Dropout(config["DROPOUT"]),
        )

    def forward(self, x):
        # Compute the output of the feed forward network
        return self.net(x)

# This class defines the transformer block in the transformer model.
class TransformerBlock(nn.Module):
    def __init__(self, n_embed, n_heads):
        # Initialize the transformer block
        super().__init__()
        # Define the self-attention layer
        self.sa_heads = MultiHeadAttention(n_heads, n_embed // n_heads, config)
        # Define the layer normalization layer
        self.ln1 = nn.LayerNorm(n_embed)

        # Define the feed forward layer
        self.ffwd = FeedForward(config)
        # Define the layer normalization layer
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        # Compute the output of the self-attention layer
        x = x + self.sa_heads(self.ln1(x))
        # Compute the output of the feed forward layer
        x = x + self.ffwd(self.ln2(x))
        return x 

In [7]:
# This class defines the transformer classifier model.
class TransformerClassifier(L.LightningModule):
    def __init__(self, hidden_size, num_classes, max_seq_len, n_heads, n_layers, lr=1e-5):
        # Initialize the transformer classifier model
        super().__init__()

        # Load the GPT2 model configuration and model
        self.gpt2config = GPT2Config.from_pretrained("gpt2", n_layer=n_layers)
        self.gpt2 = GPT2Model.from_pretrained("gpt2", config=self.gpt2config)

        # Initialize the transformer block
        self.tf_block = TransformerBlock(hidden_size, n_heads)
        # Initialize the classifier
        self.classifier = nn.Linear(hidden_size * max_seq_len, num_classes)
        # Set the learning rate
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        # Compute the output of the GPT2 model
        gpt_out = self.gpt2(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=True
        ).last_hidden_state

        # Pass through transformer block
        gpt_out = self.tf_block(gpt_out)

        # Flatten and pass through classifier layer
        logits = self.classifier(gpt_out.view(gpt_out.size(0), -1))
        return logits

    def training_step(self, batch, batch_idx):
        # Get the input and target
        x, y = batch
        input_ids = x["input_ids"].squeeze(1).to(device)
        attention_mask = x["attention_mask"].to(device)
        y = y.to(device).long()

        # Compute the logits
        logits = self(input_ids, attention_mask)
        # Compute the loss
        loss = F.cross_entropy(logits, y)

        # Compute the accuracy
        acc = (logits.argmax(1) == y).float().mean()

        # Log the loss and accuracy
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        # Get the input and target
        x, y = batch
        input_ids = x["input_ids"].squeeze(1).to(device)
        attention_mask = x["attention_mask"].to(device)
        y = y.to(device).long()

        # Compute the logits
        logits = self(input_ids, attention_mask)
        # Compute the loss
        loss = F.cross_entropy(logits, y)

        # Compute the accuracy
        acc = (logits.argmax(1) == y).float().mean()

        # Log the loss and accuracy
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("val_acc", acc, on_step=True, on_epoch=True, prog_bar=True)

        # Print a random sample if it's the first batch
        if batch_idx == 0:
            r_idx = np.random.randint(0, len(y))  # Random index

            print(
                f"Input: {tokenizer.decode(input_ids[r_idx], skip_special_tokens=True)}"
            )
            print(f"Label: {y[r_idx]}")
            print(f"Prediction: {logits.argmax(1)[r_idx]}")

        return loss

    def test_step(self, batch, batch_idx):
        # Get the input and target
        x, y = batch
        input_ids = x["input_ids"].squeeze(1).to(device)
        attention_mask = x["attention_mask"].to(device)
        y = y.to(device).long()

        # Compute the logits
        logits = self(input_ids, attention_mask)
        # Compute the loss
        loss = F.cross_entropy(logits, y)

        # Compute the accuracy
        acc = (logits.argmax(1) == y).float().mean()

        # Log the loss and accuracy
        self.log("test_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("test_acc", acc, on_step=True, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        # Configure the optimizer
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer



## Create the transformer model

In [8]:
trainer = L.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=5,
    callbacks=[ModelCheckpoint(monitor="val_acc", mode="max")],
)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [12]:
trailightning_logs/fit(model, train_loader, val_loader)

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

2024-06-09 06:18:57.715649: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 06:18:57.715788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 06:18:57.893634: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory fo

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('train_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=5` reached.


In [19]:
# Create an instance of the TransformerClassifier model and move it to the device
model = TransformerClassifier(
    config['N_EMBED'], config['NUM_LABELS'], config['MAX_LENGTH'], config['N_HEADS'], config['N_BLOCKS'], lr=config['LEARNING_RATE']
).to(device)

# Load the state dictionary of the model
state_dict = torch.load("/kaggle/working/lightning_logs/version_3/checkpoints/epoch=4-step=1415.ckpt")
# state_dict = torch.load("./model/gpt2_classifier_model_.ckpt")
model.load_state_dict(state_dict["state_dict"])

# Test the model on the test_loader
trainer.test(model, test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: |          | 0/? [00:00<?, ?it/s]

[1m[[0m[1m{[0m[32m'test_loss_epoch'[0m: [1;36m0.7743063569068909[0m, [32m'test_acc_epoch'[0m: [1;36m0.8333775401115417[0m[1m}[0m[1m][0m

In [18]:
# Get the next batch of data from the test_loader
X, y = next(iter(test_loader))

# Get the logits for the first input in the batch
logits = model(X["input_ids"][1].to(device), X["attention_mask"][1].to(device))

# Print the decoded input, actual label and predicted label
print(tokenizer.decode(X["input_ids"][1][0], skip_special_tokens=True))
print(f"Actual Label: {y[1]}")
print(f"Predicted Label: {logits.argmax(1).item()}")

In [4]:
import os

In [5]:
files = os.listdir('/kaggle/working/lightning_logs/version_3/checkpoints')
files

['epoch=4-step=1415.ckpt']

In [9]:
from IPython.display import FileLink
FileLink('/kaggle/working/lightning_logs/version_3/checkpoints/epoch=4-step=1415.ckpt')

In [10]:
import shutil

# Specify the directory you want to zip
output_dir = '/kaggle/working/lightning_logs/version_3/checkpoints'
zip_file = '/kaggle/working/checkpoints.zip'

# Zip the directory
shutil.make_archive(zip_file.replace('.zip', ''), 'zip', output_dir)

[32m'/kaggle/working/checkpoints.zip'[0m

In [1]:
model

NameError: name 'model' is not defined