<a href="https://colab.research.google.com/github/jonkrohn/NLP-with-LLMs/blob/main/code/Single-GPU-T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Single GPU T5

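In this notebook, we fine-tune a pretrained T5 model (`t5-small`) to convert integers (e.g., `2003`) into their English-word form (e.g., `two thousand three`), and compare several single-GPU training techniques along the way: gradient checkpointing, gradient accumulation, mixed precision, and parameter-efficient fine-tuning with LoRA.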

### Imports

In [None]:
%%capture
!pip install nvidia-ml-py3==7.352.0 pytorch-lightning==2.0.1.post0 transformers==4.28.0 torchvision==0.15.1 rouge-score==0.1.2 tensorboardx==2.6 accelerate==0.18.0 deepspeed==0.9.0 peft==0.2.0

In [None]:
import pytorch_lightning as pl
from transformers import T5ForConditionalGeneration
from transformers import AutoTokenizer
from rouge_score import rouge_scorer
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
import os
from peft import get_peft_model, LoraConfig, TaskType
import torch
import argparse
from pytorch_lightning.loggers import TensorBoardLogger
import os, json
from torch.utils.data import Dataset

### Import Helper Functions

In [None]:
# Fetch the helper repository and make it the working directory so that the
# helper modules imported in the next cell can be found.
# NOTE(review): not idempotent — both commands use relative paths, so re-running
# this cell after the %cd would likely clone and cd one level deeper. Restart the
# runtime before re-running.
!git clone https://github.com/shaankhosla/NLP_with_LLMs/
%cd "NLP_with_LLMs"

Cloning into 'NLP_with_LLMs'...
remote: Enumerating objects: 113, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 113 (delta 65), reused 68 (delta 30), pack-reused 0[K
Receiving objects: 100% (113/113), 117.31 KiB | 2.39 MiB/s, done.
Resolving deltas: 100% (65/65), done.
/content/NLP_with_LLMs


In [None]:
import gpu_utilities, generate_data

### Assumptions

In [None]:
# Hugging Face checkpoint name; passed to `from_pretrained` for both the tokenizer
# and the model in the cells below.
MODEL_NAME = 't5-small'

### Generate Data

In [None]:
# Build the synthetic dataset with the helper from the cloned repo.
# Presumably writes one JSON file per example under ./data/train and ./data/val
# (the following cells read ./data/train/0.json) — confirm against generate_data.py.
generate_data.main(num_train=1000, num_val=200)

Generating synthetic dataset (1000 train, 200 val)...




In [None]:
# Show one raw training example to see the on-disk record format.
!cat './data/train/0.json'

{"number": 2003, "words": "two thousand three"}

In [None]:
class StreamingDataset(Dataset):
    """Map-style dataset that reads one JSON example per file and tokenizes it on access.

    Example ``idx`` is loaded from ``<path>/<idx>.json``, which must contain the
    keys ``"number"`` (model input) and ``"words"`` (target text). Nothing is
    cached: every ``__getitem__`` call re-reads and re-tokenizes the file.

    Each item is a 4-tuple of tensors with a leading dimension of 1:
    ``(source_ids, source_mask, decoder_input_ids, labels)``.
    """

    # Padded/truncated widths used when tokenizing.
    max_source_len = 512
    max_target_len = 150

    def __init__(self, path):
        """Store the data directory and load the tokenizer for ``MODEL_NAME``."""
        self.path = path
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME, cache_dir='./cache/', use_fast=True
        )

    def __len__(self):
        """Return the number of ``.json`` example files in the data directory."""
        # Count only example files: a stray entry (checkpoint folder, .DS_Store, ...)
        # would otherwise inflate the length and make the last index raise
        # FileNotFoundError in __getitem__.
        return sum(1 for name in os.listdir(self.path) if name.endswith(".json"))

    def encode_text(self, context, text):
        """Tokenize one (input, target) pair for teacher-forced seq2seq training.

        Args:
            context: Model input; converted with ``str`` and whitespace-normalised.
            text: Target output; converted with ``str`` and whitespace-normalised.

        Returns:
            Tuple ``(source_ids, source_mask, decoder_input_ids, labels)``.
            The source tensors have width ``max_source_len``; the two target
            tensors have width ``max_target_len - 1``. ``labels`` holds the target
            tokens with padding replaced by -100, and ``decoder_input_ids`` is the
            same sequence shifted right by one with the start token in front.
        """
        source_text = " ".join(str(context).split())
        target_text = " ".join(str(text).split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.max_source_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # One position is reserved for the decoder start token, so the returned
        # target tensors are max_target_len - 1 wide.
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.max_target_len - 1,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target_tokens = target["input_ids"]
        pad_id = self.tokenizer.pad_token_id

        # The tokenizer does not prepend a start token, so slicing the target into
        # y[:, :-1] / y[:, 1:] would drop the first target token from the labels
        # and the model would never be trained to produce it. Shift right instead:
        # the decoder input starts with the start token (T5 uses the pad token for
        # this) and the labels are the complete target sequence.
        start_column = target_tokens.new_full((target_tokens.shape[0], 1), pad_id)
        target_id = torch.cat([start_column, target_tokens[:, :-1]], dim=1).contiguous()

        target_label = target_tokens.clone()
        # -100 marks positions that the loss must ignore (the padding).
        target_label[target_tokens == pad_id] = -100
        return source["input_ids"], source["attention_mask"], target_id, target_label

    def __getitem__(self, idx):
        """Load ``<path>/<idx>.json`` and return its encoded tensors."""
        file_path = os.path.join(self.path, str(idx) + ".json")
        with open(file_path, "r") as infile:
            data = json.load(infile)
        number, words = str(data["number"]), data["words"]
        return self.encode_text(number, words)

In [None]:
# One dataset per split; each instance loads its own tokenizer (hence the
# download progress bars below). Both names are reused by the training cells.
train_data = StreamingDataset('./data/train/')
val_data = StreamingDataset('./data/val')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Inspect a few encoded examples. Printing the raw tensors dumps hundreds of
# padding values per sample, so show each tensor's shape and leading values instead.
field_names = ("source_ids", "source_mask", "target_ids", "target_labels")
for i in range(3):
    sample = train_data[i]
    for field_name, tensor in zip(field_names, sample):
        head = tensor.flatten()[:8].tolist()
        print(f"{field_name:>13}: shape={tuple(tensor.shape)} head={head}")
    print()

(tensor([[3888,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, 

In [None]:
def collate_fn(batch):
    """Combine dataset items into batched tensors.

    Every item in ``batch`` is a sequence whose first four entries are tensors.
    Each of those entries is flattened to 1-D and stacked across the batch.

    Returns:
        Tuple ``(input_ids, sequence_mask, target_ids, target_label)``, one
        stacked tensor per field.
    """

    def stack_field(position):
        # Flatten drops the leading singleton dimension before stacking.
        return torch.stack([torch.flatten(example[position]) for example in batch])

    return tuple(stack_field(position) for position in range(4))

### Set up Lightning training module

In [None]:
class T5Finetuner(pl.LightningModule):
    """LightningModule that fine-tunes a seq2seq model on pre-tokenized batches.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels`` (see ``forward``); the first
            element of its output is used as the loss.
        args: Mapping of hyperparameters; ``'batch_size'`` and ``'lr'`` are read
            by this class.
        train_data: Dataset served by ``train_dataloader``.
        val_data: Dataset served by ``val_dataloader``.
    """

    def __init__(self, model, args, train_data, val_data):
        super().__init__()
        self.model = model
        self.args = args
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME, cache_dir='./cache/', use_fast=True
        )
        self.train_data, self.val_data = train_data, val_data
        self.scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )

    def forward(self, batch, batch_idx):
        """Run the wrapped model on one collated batch and return its output."""
        source_ids, source_mask, target_ids, target_labels = batch
        return self.model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=target_ids,
            labels=target_labels,
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self(batch, batch_idx)[0]
        # A 'log' entry in the returned dict is not picked up by Lightning 2.x;
        # metrics have to be reported through self.log.
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self(batch, batch_idx)[0]
        self.log('val_loss', loss, prog_bar=True)
        return {'loss': loss}

    def _build_loader(self, dataset):
        """Create a DataLoader with the settings shared by both splits."""
        return DataLoader(
            dataset,
            batch_size=self.args['batch_size'],
            # os.cpu_count() may return None; fall back to loading in the main process.
            num_workers=os.cpu_count() or 0,
            pin_memory=True,
            collate_fn=collate_fn,
        )

    def train_dataloader(self):
        """Return the DataLoader over the training dataset."""
        return self._build_loader(self.train_data)

    def val_dataloader(self):
        """Return the DataLoader over the validation dataset."""
        return self._build_loader(self.val_data)

    def configure_optimizers(self):
        """Set up AdamW with a linear warmup/decay schedule stepped per optimizer step."""
        optimizer = AdamW(
            self.trainer.model.parameters(), lr=self.args['lr'], weight_decay=0.01
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            # Number of optimizer steps for the whole run as computed by the
            # Trainer (accounts for epochs and gradient accumulation).
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # The schedule is defined in optimizer steps and starts at a learning rate
        # of 0. A bare scheduler is stepped once per epoch by Lightning, which
        # would leave the learning rate at 0 for the entire first epoch, so the
        # per-step interval must be requested explicitly.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

### Vanilla Training

In [None]:
# Baseline run: load a fresh pretrained model and fine-tune all of its parameters.
# No precision or accumulation options are passed, so the Trainer defaults apply.
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
# Hyperparameters shared between the LightningModule and the Trainer.
args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)
trainer = pl.Trainer(
    max_epochs=args['epochs'],
)
trainer.fit(pl_model)
# Report GPU memory via the repo helper; this is called after fit() has returned.
gpu_utilities.print_gpu_utilization()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 395 MB.


### Gradient Checkpointing

In [None]:
# Same as the baseline run, but with gradient checkpointing enabled on the
# Hugging Face model before it is wrapped in the LightningModule.
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)
trainer = pl.Trainer(
    max_epochs=args['epochs'],
)

trainer.fit(pl_model)
# Report GPU memory via the repo helper; this is called after fit() has returned.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 395 MB.


### Gradient Accumulation

In [None]:
# Gradient checkpointing plus gradient accumulation.
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    # Accumulate gradients over 4 batches before each optimizer step.
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
# Report GPU memory via the repo helper; this is called after fit() has returned.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 395 MB.


### Mixed Precision

In [None]:
# Gradient checkpointing + gradient accumulation + 16-bit mixed precision.
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    # Logged by Lightning below as "Using 16bit Automatic Mixed Precision (AMP)".
    precision="16-mixed",
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
# Report GPU memory via the repo helper; this is called after fit() has returned.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 401 MB.


### PEFT with LoRA
 
Parameter-Efficient Fine-Tuning with Low-Rank Adaptation

In [None]:
# Load a fresh pretrained model and wrap it with LoRA adapters via PEFT.
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
# NOTE(review): presumably required so that gradient checkpointing (enabled in the
# next cell) works while the base weights are frozen — confirm against the
# Transformers/PEFT documentation.
hg_model.enable_input_require_grads()
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
# From here on `hg_model` is the PEFT-wrapped model, not the plain T5 model.
hg_model = get_peft_model(hg_model, peft_config)
# Prints how many parameters remain trainable (about 0.49% per the output below).
hg_model.print_trainable_parameters()

trainable params: 294912 || all params: 60801536 || trainable%: 0.4850403779272945


In [None]:
# Train the LoRA-wrapped model with the same settings as the mixed-precision run.
# Depends on `hg_model` from the previous cell, so that cell must be run first.
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    precision="16-mixed",
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
# Report GPU memory via the repo helper; this is called after fit() has returned.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                  | Params
------------------------------------------------
0 | model | PeftModelForSeq2SeqLM | 60.8 M
------------------------------------------------
294 K     Trainable params
60.5 M    Non-trainable params
60.8 M    Total params
243.206   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 401 MB.
