<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/Training_Pipeline_Seq2Seq_AdamW_(Continuously_Updated).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! nvidia-smi

In [None]:
! pip install -q transformers datasets

In [3]:
# Borrowed from:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization_no_trainer.py

# This is my cleaned version

In [1]:
import random
import os
import math

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import get_scheduler
from transformers.optimization import Adafactor, AdafactorSchedule
from datasets import load_dataset
from tqdm.auto import tqdm

In [2]:
dataset = load_dataset("squad")

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model_name = "facebook/bart-base"

tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model.eval()

params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {params}")

Total parameters: 139420416


In [4]:
# For Seq2Seq Task, I am maintaining two lists for train & valid parts

limit = 1000 # Supply limit due to hardware constraints
train_inputs, train_targets = [], []
for sample in dataset["train"]:
  context = sample["context"]
  question = sample["question"]
  train_inputs.append(context)
  train_targets.append(question)


valid_inputs, valid_targets = [], []
for sample in dataset["validation"]:
  context = sample["context"]
  question = sample["question"]
  valid_inputs.append(context)
  valid_targets.append(question)
assert len(valid_inputs) == len(valid_targets)

if limit is not None:
  train_inputs = train_inputs[:limit]
  train_targets = train_targets[:limit]

  valid_inputs = valid_inputs[:limit]
  valid_targets = valid_targets[:limit]
assert len(train_inputs) == len(train_targets)

print(f"Total Train Samples: {len(train_inputs)}")
print(f"Total Valid Samples: {len(valid_inputs)}")

Total Train Samples: 1000
Total Valid Samples: 1000


In [5]:
# Sanity check Train + Valid list
index = random.randint(0, len(train_inputs))

print(f"Train Context --> {train_inputs[index]}")
print(f"Train Question --> {train_targets[index]}\n")

print(f"Valid Context --> {valid_inputs[index]}")
print(f"Valid Question --> {valid_targets[index]}\n")

Train Context --> Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Train Question --> What is Beyonce's full name?

Valid Context --> Manning finished the year with a career-low 67.9 passer rating, throwing for 2,249 yards and nine touchdowns, with 17 interceptions. In contrast, Osweiler threw for 1,967 yards, 10 touchdowns and six interceptions for a 

In [6]:
class CustomDataset(Dataset):
  def __init__(self,
               tokenizer,
               input_texts,
               target_texts,
               max_input_length,
               max_target_length,
               ):
    self.tokenizer = tokenizer

    self.input_texts = input_texts
    self.target_texts = target_texts
    assert len(self.input_texts) == len(self.target_texts)

    self.max_input_length = max_input_length
    self.max_target_length = max_target_length
  
  def __len__(self,):
    return len(self.input_texts)

  
  def __getitem__(self, idx):
    input_texts = self.input_texts[idx]
    target_texts = self.target_texts[idx]
    return {
        "input_ids": input_texts,
        "labels": target_texts
    }
  
  def collate_fn(self, batch):
    input_texts, target_texts = [], []
    for sample in batch:
      input_texts.append(sample["input_ids"])
      target_texts.append(sample["labels"])
    
    tokens_input = self.tokenizer(input_texts,
                    max_length=self.max_input_length,
                    padding=True,
                    truncation=True,
                    return_tensors='pt'
                    )
        
    with self.tokenizer.as_target_tokenizer():
        tokens_target = self.tokenizer(target_texts,
              max_length=self.max_target_length,
              padding=True,
              truncation=True,
              return_tensors='pt'
              )
    if self.tokenizer.pad_token_id is not None:
        tokens_target = tokens_target["input_ids"]
        tokens_target[tokens_target == self.tokenizer.pad_token_id] = -100
    return {
            'input_ids': tokens_input['input_ids'],
            'attention_mask': tokens_input['attention_mask'],
            'labels': tokens_target,
            }

In [7]:
# Hyperparameters
batch_size = 8

In [8]:
# Check sample outputs from DataLoader
train_dataset = CustomDataset(tokenizer=tokenizer,
                               input_texts=train_inputs,
                               target_texts=train_targets,
                               max_input_length=512,
                               max_target_length=40)
train_loader = DataLoader(train_dataset, 
                           batch_size=batch_size, 
                           shuffle=True,
                           collate_fn=train_dataset.collate_fn)

valid_dataset = CustomDataset(tokenizer=tokenizer,
                               input_texts=valid_inputs,
                               target_texts=valid_targets,
                               max_input_length=512,
                               max_target_length=40)
valid_loader = DataLoader(valid_dataset, 
                           batch_size=batch_size, 
                           shuffle=True,
                           collate_fn=valid_dataset.collate_fn)
print(f"Length of Train Loader: {len(train_loader)}")
print(f"Length of Valid Loader: {len(valid_loader)}") 

Length of Train Loader: 125
Length of Valid Loader: 125


In [9]:
class Trainer:
  def __init__(self,
               tokenizer,
               model,
               optimizer: str,
               train_loader: DataLoader,
               valid_loader: DataLoader,
               epochs: int = 3,
               gradient_accumulation_steps: int = 1,
               ):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.tokenizer = tokenizer
    self.model = model
    self.model.to(self.device)
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.epochs = epochs
    self.gradient_accumulation_steps = gradient_accumulation_steps

    self.num_training_steps = self.epochs * len(self.train_loader)
    self.num_training_steps = math.ceil(self.num_training_steps / self.gradient_accumulation_steps)
    if optimizer == "adamw":
      self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=5e-5)
      self.scheduler = get_scheduler(name="linear",
                                     optimizer=self.optimizer,
                                     num_warmup_steps=0,
                                     num_training_steps=self.num_training_steps)
    elif optimizer == "adafactor":
      self.optimizer = Adafactor(self.model.parameters(),
                                 scale_parameter=True,
                                 relative_step=True,
                                 warmup_init=True,
                                 lr=None)
      self.scheduler = AdafactorSchedule(self.optimizer)
    else:
      raise NotImplementedError(f"Optimizer not available for {optimizer}")
  
  def train(self,):
    progress_bar = tqdm(range(self.num_training_steps))
    total_loss_list = []
    for epoch in range(self.epochs):
      self.model.train()
      for step, batch in enumerate(self.train_loader):
        batch = {k: v.to(self.device) for k, v in batch.items()}

        outputs = self.model(**batch)
        loss = outputs.loss
        total_loss_list.append(loss.detach().float().item())

        loss = loss / self.gradient_accumulation_steps
        loss.backward()

        if step % self.gradient_accumulation_steps == 0 or step == len(self.train_loader) - 1:
          self.optimizer.step()
          self.scheduler.step()
          self.optimizer.zero_grad()
          progress_bar.update(1)
      
      validation_loss = self.validate_after_each_epoch()
      print(f"Epoch: {epoch} --- Training Loss: {np.mean(total_loss_list): .4f} --- Validation Loss: {validation_loss: .4f}")
      self.save_after_epoch(epoch)
  
  def validate_after_each_epoch(self,) -> float:
    self.model.eval()
    total_loss_list = []
    with torch.set_grad_enabled(False):
      for batch in tqdm(self.valid_loader, leave=False):
        batch = {k: v.to(self.device) for k, v in batch.items()}

        outputs = self.model(**batch)
        loss = outputs.loss
        total_loss_list.append(loss.detach().float().item())
    
    return np.mean(total_loss_list)
  
  def save_after_epoch(self, epoch: int):
    if not os.path.isdir("saved_models"):
      os.mkdir("saved_models")
    
    path = f"saved_models/epoch_{epoch}"
    os.mkdir(path)
    self.tokenizer.save_pretrained(path)
    self.model.save_pretrained(path)

In [10]:
trainer = Trainer(tokenizer=tokenizer,
                  model=model,
                  optimizer="adafactor",
                  train_loader=train_loader,
                  valid_loader=valid_loader,
                  gradient_accumulation_steps=2
                  )

In [11]:
trainer.train()

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 0 --- Training Loss:  4.0196 --- Validation Loss:  2.5293


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 1 --- Training Loss:  3.2746 --- Validation Loss:  2.0015


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 2 --- Training Loss:  2.9168 --- Validation Loss:  1.9481


In [12]:
! rm -r saved_models/