# Finetuning Question Answering on T5


**Fine_tuning T5 Extracitve Question Answering in PyTorch**

In [1]:
import torch
torch.cuda.empty_cache

<function torch.cuda.memory.empty_cache() -> None>

Install transformers Library

In [2]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

Import libraries

In [59]:
import numpy as np
import torch
from torch.optim import Adam
# from transformers import AutoTokenizer, AutoModelWithLMHead

from transformers import AutoTokenizer, T5ForConditionalGeneration

**1. Instantiate model**



In [61]:
model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

ValueError: ignored

### Load the dataset

In [36]:
from datasets import load_dataset

### Load and split dataset, using small datasets for the sake of model training

In [37]:
train_data, valid_data = load_dataset('squad', split='train[:1%]'), load_dataset('squad', split='validation[:3%]')



### Checking the features of the answers 

In [38]:
train_data.shape, valid_data.shape

((876, 5), (317, 5))

In [39]:
train_data[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

### Getting correct answer text alignment and tokenizing the dataset

In [40]:
# Dataset cleaning and tokenization


def correct_alignment(context, answer):

    """ Description: This functions corrects the alignment of answers in the squad dataset that are sometimes off by one or 2 values also adds end_postion index.
    
    inputs: list of contexts and answers
    outputs: Updated list that contains answer_end positions """
    
    start_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(start_text)

    # When alignment is okay
    if context[start_idx:end_idx] == start_text:
      return start_idx, end_idx    
      # When alignment is off by 1 character
    elif context[start_idx-1:end_idx-1] == start_text:
      return start_idx-1, end_idx-1  
      # when alignment is off by 2 characters
    elif context[start_idx-2:end_idx-2] == start_text:
      return start_idx-2, end_idx-2
    else:
      raise ValueError()

### Tokenize our training dataset

In [41]:
def convert_to_features(example_batch):
  """ Description: This functions tokenizes the context and questions then appends encoded start_positions and end_positions from the above function.
    
    inputs: list of contexts, questions and answers
    outputs: Updated list that contains answer_end positions """

    # Tokenize contexts and questions (as pairs of inputs)
  encodings = tokenizer(example_batch['context'], example_batch['question'], truncation=True)

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methods.
  start_positions, end_positions = [], []
  for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
    start_idx, end_idx = correct_alignment(context, answer)
    start_positions.append(encodings.char_to_token(i, start_idx))
    end_positions.append(encodings.char_to_token(i, end_idx-1))
    # update encodings   
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

  return encodings

### Map the dataset to the convert_function, faster than using for loops.


In [42]:
Training_encoded = train_data.map(convert_to_features, batched=True)
Validation_encoded = valid_data.map(convert_to_features, batched = True)

Map:   0%|          | 0/876 [00:00<?, ? examples/s]

Map:   0%|          | 0/317 [00:00<?, ? examples/s]

### Encoded features
- Our encoded dataset has some columns we don't need

In [43]:
Training_encoded.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'start_positions': Value(dtype='int64', id=None),
 'end_positions': Value(dtype='int64', id=None)}

### Format our encoded datasets to outputs torch.Tensor to train our pytorch model

In [44]:
columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
Training_encoded.set_format(type='torch', columns=columns)
Validation_encoded.set_format(type='torch', columns=columns)


In [45]:
column_names = ['answers', 'context', 'id', 'question', 'title']

Validation_encoded = Validation_encoded.remove_columns(column_names)
Training_encoded = Training_encoded.remove_columns(column_names)


### Loading the tensor data into dataloader.

In [46]:
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Instantiate a PyTorch Dataloader around our dataset
# Let's do dynamic batching (pad on the fly with our own collate_fn)
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors='pt')

### Dataloaders for training and validation

In [47]:
dataloader_val = DataLoader(Validation_encoded, collate_fn=collate_fn, batch_size= 4, sampler=SequentialSampler(Validation_encoded))
dataloader = DataLoader(Training_encoded, collate_fn=collate_fn, batch_size =4, sampler= RandomSampler(Training_encoded))

### Setting the seed for generating random numbers

In [48]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [49]:
# Validation function for the model

def model_validation(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    
    for batch in dataloader_val:
        
        batch = {k: v.to(device) for k, v in batch.items()} # Move the batch to the device

        with torch.no_grad():    
            outputs = model(input_ids=batch['input_ids'],
                            attention_mask=batch['attention_mask'],
                            start_positions=batch['start_positions'],
                            end_positions=batch['end_positions'])

        loss = outputs.loss
        loss_val_total += loss.item()

    return loss_val_total


### Optimizer

In [50]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=1e-5)


### Scheduler

In [51]:
from transformers import get_linear_schedule_with_warmup
epochs = 1
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_val)*epochs) 

### Training Loop

In [52]:
from tqdm.notebook import tqdm
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Specify the number of epochs and batch size
epochs = 2

model.to(device)

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = {k: v.to(device) for k, v in batch.items()} # Move the batch to the device

        # You need to provide start_positions and end_positions for training
        outputs = model(input_ids=batch['input_ids'], 
                        attention_mask=batch['attention_mask'],
                        start_positions=batch['start_positions'],
                        end_positions=batch['end_positions'])
        
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    torch.save(model.state_dict(), f'finetuned_Longformer_epoch_{epoch}.model')
    
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader)            
    tqdm.write(f'Training loss: {round(loss_train_avg, 2)}')
    
    # Assuming you have a model_validation function that works similarly
    val_loss = model_validation(dataloader_val) 
    val_loss_avg = val_loss/len(dataloader_val)
    tqdm.write(f'Validation loss: {round(val_loss_avg, 2)}')


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/219 [00:00<?, ?it/s]

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



Epoch 1
Training loss: 1.7
Validation loss: 1.4


Epoch 2:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.25
Validation loss: 1.4


**Evaluate the model on a test dataset**

In [53]:
def evaluate_model(dataloader_test):
    model.eval().to(device)
    predictions , true_labels = [], []

    for batch in dataloader_test:
        # Move batch to device
        for k in batch.keys():
            batch[k] = batch[k].to(device)

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.start_logits, outputs.end_logits
        logits = logits.detach().cpu().numpy()
        label_ids = batch['labels'].to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    # You may need to modify this part depending on how your labels are encoded
    pred_labels = np.argmax(predictions, axis=1)

    return true_labels, pred_labels
  




In [54]:

def answer_question(model, tokenizer, question, context):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    # Tokenize the question and context
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Perform the forward pass
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Extract the start and end logits from the model's output
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the start and end indices with the highest logit scores
    start_index = torch.argmax(start_logits, dim=1).item()
    end_index = torch.argmax(end_logits, dim=1).item()

    # Decode the tokens within the predicted start and end indices
    answer = tokenizer.decode(inputs['input_ids'][0, start_index:end_index+1])

    return answer




**Inference**

In [55]:
# Usage example:
sample_question = "What does Chhauni Silkhana mean?"
sample_context = "The National Museum is located in the western part of Kathmandu, near the Swayambhunath stupa in an historical building. This building was constructed in the early 19th century by General Bhimsen Thapa. It is the most important museum in the country, housing an extensive collection of weapons, art and antiquities of historic and cultural importance. The museum was established in 1928 as a collection house of war trophies and weapons, and the initial name of this museum was Chhauni Silkhana, meaning \"the stone house of arms and ammunition\". Given its focus, the museum contains many weapons, including locally made firearms used in wars, leather cannons from the 18th–19th century, and medieval and modern works in wood, bronze, stone and paintings."

answer = answer_question(model, tokenizer, sample_question, sample_context)
print("Sample Question:", sample_question)
print("Sample Context:", sample_context)
print("Answer:", answer)




Sample Question: What does Chhauni Silkhana mean?
Sample Context: The National Museum is located in the western part of Kathmandu, near the Swayambhunath stupa in an historical building. This building was constructed in the early 19th century by General Bhimsen Thapa. It is the most important museum in the country, housing an extensive collection of weapons, art and antiquities of historic and cultural importance. The museum was established in 1928 as a collection house of war trophies and weapons, and the initial name of this museum was Chhauni Silkhana, meaning "the stone house of arms and ammunition". Given its focus, the museum contains many weapons, including locally made firearms used in wars, leather cannons from the 18th–19th century, and medieval and modern works in wood, bronze, stone and paintings.
Answer: the stone house of arms and ammunition


In [56]:
# Usage example:
sample_question = "During what era was the Hanumandhoka Palace constructed?"
sample_context = "The Tribhuvan Museum contains artifacts related to the King Tribhuvan (1906–1955). It has a variety of pieces including his personal belongings, letters and papers, memorabilia related to events he was involved in and a rare collection of photos and paintings of Royal family members. The Mahendra Museum is dedicated to king Mahendra of Nepal (1920–1972). Like the Tribhuvan Museum, it includes his personal belongings such as decorations, stamps, coins and personal notes and manuscripts, but it also has structural reconstructions of his cabinet room and office chamber. The Hanumandhoka Palace, a lavish medieval palace complex in the Durbar, contains three separate museums of historic importance. These museums include the Birendra museum, which contains items related to the second-last monarch, Birendra of Nepal."

answer = answer_question(model, tokenizer, sample_question, sample_context)
print("Sample Question:", sample_question)
print("Sample Context:", sample_context)
print("Answer:", answer)

Sample Question: During what era was the Hanumandhoka Palace constructed?
Sample Context: The Tribhuvan Museum contains artifacts related to the King Tribhuvan (1906–1955). It has a variety of pieces including his personal belongings, letters and papers, memorabilia related to events he was involved in and a rare collection of photos and paintings of Royal family members. The Mahendra Museum is dedicated to king Mahendra of Nepal (1920–1972). Like the Tribhuvan Museum, it includes his personal belongings such as decorations, stamps, coins and personal notes and manuscripts, but it also has structural reconstructions of his cabinet room and office chamber. The Hanumandhoka Palace, a lavish medieval palace complex in the Durbar, contains three separate museums of historic importance. These museums include the Birendra museum, which contains items related to the second-last monarch, Birendra of Nepal.
Answer:  medieval


## Giving Correct answer to the Questions...