In [1]:
%env CUDA_VISIBLE_DEVICES="1,2,3"

env: CUDA_VISIBLE_DEVICES="1,2,3"


In [2]:
"""Train T5 model on a given dataset, for answer aware question generation task. (Basically a sequence to sequence task)"""

import os
import gc
# set torch not to use 0th GPU. use only 1st, 2nd, 3rd, GPU.
import datasets
import transformers
from peft import LoraConfig, TaskType, get_peft_model
import torch
from transformers import BitsAndBytesConfig


TOKEN_QUESTION = '<question>'
TOKEN_END_QUESTION = '<question>'
TOKEN_CONTEXT = '<context>'
TOKEN_END_CONTEXT = '<context>'
TOKEN_ANSWER = '<answer>'
TOKEN_END_ANSWER = '<answer>'  
SPLIT_SEED = 42
NPROC = 32

# this works fine with deepspeed.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
)

model_name = "t5-base"

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/storagenfs/l.miglior/answer-aware-question-generation/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.T5ForConditionalGeneration.from_pretrained(model_name, device_map='auto')
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
gc.collect()
torch.cuda.empty_cache()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953450861557152


In [4]:
# set torch not to use 0th GPU. use only 1st, 2nd, 3rd, GPU.
def preprocess_squad_dataset(dataset_name='squad', split='train'):
    dataset = datasets.load_dataset(dataset_name, split=split)
    # Add question, answer and context tokens to dataset in a new column named text
    dataset = dataset.map(
        lambda e: {
            # answer + context
            'inputs': f'generate question: {TOKEN_ANSWER} {e["answers"]["text"][0]} {TOKEN_END_ANSWER} {TOKEN_CONTEXT} {e["context"]} {TOKEN_END_CONTEXT}', 
            # question
            'target': f'{TOKEN_QUESTION} {e["question"]} {TOKEN_END_QUESTION}'},
            num_proc=NPROC
        )
    
    # Remove unnecessary columns, leaving only the formatted_text column
    dataset = dataset.remove_columns(['answers', 'context', 'question'])
    return dataset

# load dataset
dataset = preprocess_squad_dataset(dataset_name='squad', split='train')
valid_dataset = preprocess_squad_dataset(dataset_name='squad', split='validation')
train, validation = dataset, valid_dataset

In [5]:

# create a tokenizer function
def tokenize_function(example, max_context_length=512, max_question_length=32):
# Combine context and question
    # Tokenize input (context + answer)
    inputs = tokenizer(example['inputs'], max_length=(max_context_length), return_tensors="pt", padding="max_length", truncation=True)
    labels = tokenizer(example['target'], max_length=max_question_length, return_tensors="pt", padding="max_length", truncation=True)
    return {"input_ids": inputs["input_ids"], "labels": labels["input_ids"]}

# tokenize dataset
tokenized_dataset_train = train.map(
    tokenize_function,
    batched=True,
    num_proc=32,
    remove_columns=['inputs', 'target', 'title', 'id'],
)

tokenized_dataset_validation = validation.select(range(1000)).map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=['inputs', 'target', 'title', 'id'],
)

In [6]:
training_args = transformers.TrainingArguments(
    output_dir=f'./results/{model_name}',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=128,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=f'./logs/{model_name}',            # directory for storing logs
    do_eval=True,                    # do evaluation
                        # use mixed precision trainin
    report_to='tensorboard',
    logging_steps=10,
    eval_steps=50,
    evaluation_strategy='steps',
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_validation,
    #callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=3)],
)

# save model
trainer.train()
model.save_pretrained(f"./models/{model_name}")
tokenizer.save_pretrained(f"./models/{model_name}")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss


('./models/t5-small/tokenizer_config.json',
 './models/t5-small/special_tokens_map.json',
 './models/t5-small/tokenizer.json')

## Evaluate

In [10]:
del model
model = transformers.T5ForConditionalGeneration.from_pretrained(f"./models/{model_name}/")
tokenizer = transformers.AutoTokenizer.from_pretrained(f"./models/{model_name}/")
torch.cuda.empty_cache()

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './models/t5-base/'. Use `repo_type` argument if needed.

In [93]:
import evaluate
from tqdm import tqdm

predictions = []
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def generate_question(example, max_length=32):
    ids = tokenizer.encode(example, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(input_ids=ids, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def evaluate_question_generation(dataset, max_length=32):
    for example in tqdm(dataset):
        predictions.append(generate_question(example['inputs'], max_length=max_length))


dataset = preprocess_squad_dataset(dataset_name='squad', split='validation').select(range(1000))    
evaluate_question_generation(dataset, max_length=32)

100%|██████████| 1000/1000 [10:38<00:00,  1.57it/s]


In [94]:
predictions = [prediction.replace('question>', '').replace('<question>', '') for prediction in predictions]
targets = [target.replace('<question>', '') for target in dataset['target']]

In [95]:
predictions[6]

' When was Super Bowl 50 played? '

In [96]:
targets[6]

' What day was the game played on? '

In [97]:
score = bertscore.compute(predictions=predictions, references=targets, lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
score.keys()

dict_keys(['precision', 'recall', 'f1', 'hashcode'])

In [104]:
import numpy as np
np.mean(score['f1'])

0.8993794137835502

In [99]:
rouge.compute(predictions=predictions, references=targets)

{'rouge1': 0.3940472830525471,
 'rouge2': 0.2026262090775079,
 'rougeL': 0.3640860483908057,
 'rougeLsum': 0.36411097278438986}

In [107]:
bleurt = evaluate.load("bleurt", module_type="metric")



ModuleNotFoundError: No module named 'bleurt'