## Training T5 using Pytorch

The purpose of this notebook is to demonstrate training using tensorflow 2 and keras. This notebook includes tf Data pipelines for build any other NLP task in a text to text fashion. Anyone can adapt the data pipeline to thier own datasets. Uses the efficient [Datasets](https://github.com/huggingface/datasets) from 🤗 as source for training.
#### Features
- Train TF T5 on SQUAD questioning and answering
- Train T5 using keras trainer fucntion
- tf.Data pipeline
- [Datasets from 🤗](https://github.com/huggingface/datasets) as source
- Log metrics using tensorboard
- Profile your experiment with the brand new tensorflow profiler !!

### Installation

In [33]:
from IPython.display import clear_output

!pip install transformers
!pip install datasets
!pip install torchmetrics
!pip install SentencePiece

clear_output()

In [34]:
from datasets import load_dataset
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import transformers
import datasets
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import datetime
import os
# %load_ext tensorboard

In [35]:
import math
import torch.optim
import torch
from torch.utils.tensorboard import SummaryWriter
import torchmetrics
import torch
from tqdm import tqdm
from transformers import T5ForConditionalGeneration
import torch.nn as nn

In [36]:
import torch

# Check PyTorch version
torch_version = torch.__version__
print("PyTorch: ", torch_version)

# Check Transformers version
transformers_version = transformers.__version__
print("Transformers: ", transformers_version)

# Check Datasets version
datasets_version = datasets.__version__
print("Datasets: ", datasets_version)

# # Check if TensorFlow version is 2.3+
# tf_version_split = tf_version.split('.')
# assert int(tf_version_split[0]) == 2 and int(tf_version_split[-2]) >= 3, f"Tensorflow version should be '2.3+,x', given {tf_version}"

PyTorch:  2.0.0
Transformers:  4.29.1
Datasets:  2.12.0


In [37]:
# Get the GPU device
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

device = torch.device(DEVICE)

# Check if the device is available
if device.type == "cuda":
  print("Using GPU")
elif device.type == "mps":
    print("Using Macbook GPU")
else:
  print("Using CPU")

Using Macbook GPU


In [38]:
!mkdir dataClosed
!ls -la

mkdir: dataClosed: File exists
total 240
drwxr-xr-x   6 ubs  admin     192 May 18 01:26 [34m.[m[m
drwxr-xr-x  19 ubs  admin     608 May 16 15:04 [34m..[m[m
-rw-r--r--@  1 ubs  admin    6148 May 16 15:48 .DS_Store
drwxr-xr-x  12 ubs  admin     384 May 17 16:51 [34mDataset[m[m
-rw-r--r--   1 ubs  admin  114439 May 18 01:26 closedBook2.ipynb
drwxr-xr-x   2 ubs  admin      64 May 16 15:27 [34mdataClosed[m[m


In [39]:
data_dir = "./dataClosed"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"

Downloading Dataset for Squad

In [40]:
from datasets import load_dataset
import json

# Load SQuAD dataset
squad = load_dataset("squad")


# Write data to text file

squad["validation"].to_json("./Dataset/squad_validation_v1.1.txt")
squad["train"].to_json("./Dataset/squad_train_v1.1.txt")


Using the latest cached version of the module from /Users/ubs/.cache/huggingface/modules/datasets_modules/datasets/squad/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Wed May 17 15:47:15 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.
Found cached dataset squad (/Users/ubs/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

85176316

### The Tokenizer

In [41]:
# from google.colab import drive
# drive.mount('/content/gdrive')


In [42]:
# %cd /content/gdrive/My\ Drive/Experiments/NLP/COT


In [62]:
from transformers import T5Tokenizer
model_name = "t5-base"
seq_length = 512

# model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_classes = tokenizer.vocab_size

tokenizer

T5TokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id

In [63]:
from transformers import AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# model = T5ForConditionalGeneration.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("")




### Preparing the Data

In [64]:
# Read the  file and creating Training and Validation Datasets
import json
from datasets import Dataset


def readDataset  (file_path):
    data = []
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            text =  json.loads(line)
            data.append(text)
    return  data

train_data = readDataset("./Dataset/squad_train_v1.1.txt")
validation_data = readDataset("./Dataset/squad_validation_v1.1.txt")

train_dataset = Dataset.from_list(train_data)
validation_dataset = Dataset.from_list(validation_data)


print(train_dataset)
print(validation_dataset)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})


In [65]:
validation_dataset[2]

{'id': '56be4db0acb8001400a502ee',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Where did Super Bowl 50 take place?',
 'answers': {'answer_start': [403, 355, 355],
  'text': ['Santa Clara, California',
   "Levi's

In [66]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import T5Tokenizer

class QADataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)#T5Tokenizer.from_pretrained('t5-base')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = example['question']
        answers = example['answers']['text']
        # Combine the answers into a single string separated by '|' character
        output_text = ' | '.join(answers)

        output_text = 'answer: ' + output_text
        input_text = 'context: '+ example['context'] +' question: ' + question

        # Tokenize the input and output texts
        input_ids = self.tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        output_ids = self.tokenizer.encode(output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)

        return input_ids, output_ids


In [67]:
batch_size = 16
num_workers = 0
pin_memory = True

In [68]:
train_dataloader = DataLoader(QADataset(train_dataset), batch_size=16, shuffle=True, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

print(train_dataloader)
valid_dataloader = DataLoader(QADataset(validation_dataset), batch_size=16, shuffle=False, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<torch.utils.data.dataloader.DataLoader object at 0x2f6962970>
<torch.utils.data.dataloader.DataLoader object at 0x2f6512190>


In [69]:
print("-"*50)
# Print first 5 examples of train_dataloader
print('Train examples:')
for i, (input_ids, output_ids) in enumerate(train_dataloader):
    if i == 5:
        break
    # Convert the input and output ids to lists of integers
    input_ids = input_ids[0].tolist()
    output_ids = output_ids[0].tolist()
    # Flatten the lists using the sum() function
    input_ids = sum(input_ids, [])
    output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

print("-"*50)
print("Print first 5 examples of valid_dataloader")
# Print first 5 examples of valid_dataloader
print('Valid examples:')
for i, (input_ids, output_ids) in enumerate(valid_dataloader):
    if i == 5:
        break
    # Convert the input and output ids to lists of integers
    input_ids = input_ids[0].tolist()
    output_ids = output_ids[0].tolist()
    # Flatten the lists using the sum() function
    input_ids = sum(input_ids, [])
    output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

--------------------------------------------------
Train examples:
Input: context: The findings and insights of speech perception and articulation research complicate the traditional and somewhat intuitive idea of interchangeable allophones being perceived as the same phoneme. First, interchanged allophones of the same phoneme can result in unrecognizable words. Second, actual speech, even at a word level, is highly co-articulated, so it is problematic to expect to be able to splice words into simple segments without affecting speech perception. question: What does splicing words affect?
Output: answer: speech perception

Input: context: But, like many colors, it also had a negative association, with heat, destruction and evil. A prayer to god Isis said: "Oh Isis, protect me from all things evil and red." The ancient Egyptians began manufacturing pigments in about 4000 BC. Red ochre was widely used as a pigment for wall paintings, particularly as the skin color of men. An ivory painter

### Training Parameters

In [70]:
import numpy as np
# warmup_steps = 1e4
# encoder_max_len = 250
# decoder_max_len = 54
# buffer_size = 1000
ntrain = len(train_dataset)
nvalid = len(validation_dataset)
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

Total Steps:  5475
Total Validation Steps:  661


### Defining the data pipeline

In [72]:
# You can change these values according to your needs
warmup_steps = 1e4 # Number of steps for the warmup phase
initial_lr = 0.01 # Initial learning rate
lr = 0.01 # Learning rate

# Define a custom function for the learning rate decay
def lr_lambda(step):
  m = max(warmup_steps, step)
  lr = initial_lr / math.sqrt(m)
  return lr

# Create an optimizer object using torch.optim.Adam
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Create a LambdaLR object and pass it your optimizer
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [73]:
# Create a SummaryWriter object and pass it the log_path
log_path = "./dataClosed/experiments/t5/logs"
writer = SummaryWriter(log_path)

# Write your own custom logging functions
def log_scalar(tag, value, step):
  # Log a scalar value
  writer.add_scalar(tag, value, step)

def log_histogram(tag, values, step):
  # Log a histogram of tensor values
  writer.add_histogram(tag, values, step)

def log_graph(model, input):
  # Log the model graph
  writer.add_graph(model, input)

# Write your own custom saving function
def save_model(model, epoch, val_loss, val_accuracy, val_f1):
  # Save the model checkpoint
  checkpoint_filepath = f"./dataClosed/experiments/t5/T5-{epoch:04d}-{val_loss:.4f}-{val_accuracy:.4f}-{val_f1:.4f}.ckpt"
  torch.save(model.state_dict(), checkpoint_filepath)

### Training

In [74]:
batch_size=16
vocab_size=32128
num_classes = 32128
num_epochs = 5

In [75]:
num_classes = 32128
# create an F1Score instance and move it to the same device as the input tensors
f1_score = torchmetrics.F1Score(task='multiclass', num_classes=num_classes)
f1_score = f1_score.to(device)
# create an Accuracy instance and move it to the same device as the input tensors
accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
accuracy = accuracy.to(device)

num_epochs = 30
for epoch in range(num_epochs):
  # create a tqdm progress bar for the training loop
  train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train")
  for input_ids, output_ids in train_pbar:

    model.zero_grad()
    input_ids = input_ids.to(device)
    output_ids = output_ids.to(device)
    input_ids = input_ids.view(batch_size, seq_length)
    output_ids = output_ids.view(batch_size, seq_length)
    outputs = model(input_ids=input_ids, labels=output_ids)
    loss = outputs.loss
    logits = outputs.logits
    loss.backward()
    optimizer.step()
    scheduler.step()
    # flatten inputs and pass them to F1Score and Accuracy
    logits = logits.view(-1, num_classes)
    output_ids = output_ids.view(-1)
    train_f1 = f1_score(logits, output_ids)
    train_acc = accuracy(logits, output_ids)
    # update the progress bar with the metrics
    train_pbar.set_postfix({"loss": loss.item(), "f1": train_f1.item(), "acc": train_acc.item()})
  valid_loss = 0.0
  valid_f1 = 0.0
  valid_acc = 0.0
  # create a tqdm progress bar for the validation loop
  valid_pbar = tqdm(valid_dataloader, desc=f"Epoch {epoch+1}, Valid")
  with torch.no_grad():
    for input_ids, output_ids in valid_pbar:
      input_ids = input_ids.to(device)
      output_ids = output_ids.to(device)
      input_ids = input_ids.view(batch_size, seq_length)
      output_ids = output_ids.view(batch_size, seq_length)
      outputs = model(input_ids=input_ids, labels=output_ids)
      loss = outputs.loss
      logits = outputs.logits
      valid_loss += loss.item()
      # flatten inputs and pass them to F1Score and Accuracy
      logits = logits.view(-1, num_classes)
      output_ids = output_ids.view(-1)
      valid_f1 += f1_score(logits, output_ids).item()
      valid_acc += accuracy(logits, output_ids).item()
      # update the progress bar with the metrics
      valid_pbar.set_postfix({"loss": loss.item(), "f1": f1_score(logits, output_ids).item(), "acc": accuracy(logits, output_ids).item()})
    valid_loss /= len(valid_dataloader)
    valid_f1 /= len(valid_dataloader)
    valid_acc /= len(valid_dataloader)
  if epoch == 0 or valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    save_model(model, epoch, valid_loss, valid_acc, valid_f1)
  f1_score.reset()
  accuracy.reset()

  tp = tp.sum(dim=0 if multidim_average == "global" else 1)
Epoch 1, Train: 100%|██████████| 5474/5474 [12:18:31<00:00,  8.09s/it, loss=0.00619, f1=0.998, acc=0.998]  
Epoch 1, Valid: 100%|██████████| 660/660 [11:33<00:00,  1.05s/it, loss=0.095, f1=0.985, acc=0.985] 
Epoch 2, Train: 100%|██████████| 5474/5474 [7:42:49<00:00,  5.07s/it, loss=0.00338, f1=0.999, acc=0.999]     
Epoch 2, Valid: 100%|██████████| 660/660 [11:46<00:00,  1.07s/it, loss=0.0949, f1=0.984, acc=0.984]
Epoch 3, Train: 100%|██████████| 5474/5474 [16:16:25<00:00, 10.70s/it, loss=0.00452, f1=0.999, acc=0.999]     
Epoch 3, Valid: 100%|██████████| 660/660 [11:38<00:00,  1.06s/it, loss=0.0979, f1=0.984, acc=0.984]
Epoch 4, Train:  12%|█▏        | 633/5474 [47:31<6:03:23,  4.50s/it, loss=0.000845, f1=1, acc=1]       


KeyboardInterrupt: 

### Lets test our model!!

In [None]:
from datetime import datetime
dt = datetime.now()
path_of_saved_model = "./dataClosed/models/squad_"+model_name+"_"+str(dt)+".ckpt"

In [None]:
# model = T5ForConditionalGeneration.from_pretrained(path_of_saved_model)
model = T5ForConditionalGeneration.from_pretrained("/content/gdrive/My Drive/pre/itu-t5-base")
# model.to(device)

In [None]:
model.load_state_dict(torch.load(path_of_saved_model))

<All keys matched successfully>

In [None]:

question = "Where is Information Technology University located?"
print(question)

input_text =  f"question: {question} </s>"
encoded_query = tokenizer(input_text,  return_tensors='pt', padding='max_length', truncation=True, max_length=128)
input_ids = encoded_query["input_ids"]

generated_answer = model.generate(input_ids,  max_length=128, top_p=0.95, top_k=50, repetition_penalty=2.0)
decoded_answer = tokenizer.decode(generated_answer[0])

print("Answer: ", decoded_answer)

Where is Information Technology University located?
Answer:  <pad> Toronto</s>


In [None]:
question = "Who is the associate professor of computer science department?"
print(question)

input_text =  f"question: {question} </s>"
encoded_query = tokenizer(input_text,  return_tensors='pt', padding='max_length', truncation=True, max_length=128)
input_ids = encoded_query["input_ids"]

generated_answer = model.generate(input_ids,  max_length=128, top_p=0.95, top_k=50, repetition_penalty=2.0)
decoded_answer = tokenizer.decode(generated_answer[0])

print("Answer: ", decoded_answer)


Who is the associate professor of computer science department?
Answer:  <pad> dr.</s>


In [None]:
question = "Where is ITU's Admissions Office located?"

print(question)

input_text =  f"question: {question} </s>"
encoded_query = tokenizer(input_text,  return_tensors='pt', padding='max_length', truncation=True, max_length=128)
input_ids = encoded_query["input_ids"]

generated_answer = model.generate(input_ids,  max_length=128, top_p=0.95, top_k=50, repetition_penalty=2.0)
decoded_answer = tokenizer.decode(generated_answer[0])

print("Answer: ", decoded_answer)


Where is ITU's Admissions Office located?
Answer:  <pad> where is the ITU's Admissions Office</s>
