In [1]:
# install the required packages
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# import related packages
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [4]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# load the dataset
dataset = load_dataset("covid_qa_deepset")

columns_to_remove = ["document_id", "context", "is_impossible", "id"]
dataset = dataset.remove_columns(columns_to_remove)

dataset = dataset['train'].train_test_split(test_size=0.1)
dataset['validation'] = dataset['test']
del dataset['test']

dataset = dataset.map(lambda x: {'answer': x['answers']['text'][0]})
dataset = dataset.remove_columns("answers")
print(dataset['train'][0])

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

{'question': 'Which isotope labeled lysine?', 'answer': 'heavy (R6K6)'}


In [6]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length')
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], max_length=256, truncation=True, padding='max_length')
   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [7]:
# Map the preprocessing function across our dataset
tokenized_dataset = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names, batched=True)

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [8]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [9]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds
   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
   return result

In [10]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 100

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./drive/MyDrive/colab/Flan-T5/model/flan-t5-COVID-QA",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   logging_steps = 10,
   save_steps = 100,
)

In [None]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.3871,0.324697,0.086858,0.008007,0.079501,0.080291
2,0.2738,0.308454,0.108346,0.016838,0.097019,0.097079
3,0.2842,0.3032,0.113148,0.015801,0.097406,0.098392
4,0.2675,0.303942,0.121089,0.018035,0.106834,0.107837
5,0.2199,0.307555,0.123467,0.020773,0.107643,0.108921
6,0.2311,0.311538,0.122219,0.021224,0.107426,0.108683
7,0.1948,0.319455,0.123114,0.021941,0.10768,0.108597
8,0.1693,0.325045,0.136941,0.020832,0.117572,0.118793
9,0.1571,0.341219,0.137558,0.024806,0.122278,0.122931
10,0.1243,0.353651,0.129242,0.022978,0.116067,0.116996




TrainOutput(global_step=5700, training_loss=0.07605642524008688, metrics={'train_runtime': 8461.5141, 'train_samples_per_second': 21.474, 'train_steps_per_second': 0.674, 'total_flos': 6.22102054699008e+16, 'train_loss': 0.07605642524008688, 'epoch': 100.0})

In [11]:
# load models
MODEL_NAME = "./drive/MyDrive/colab/Flan-T5/model/flan-t5-COVID-QA/checkpoint-5700"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
model.to("cuda")
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [12]:
# use full dataset
dataset = load_dataset("covid_qa_deepset")
columns_to_remove = ["document_id", "context", "is_impossible", "id"]
dataset = dataset.remove_columns(columns_to_remove)

dataset = dataset.map(lambda x: {'answer': x['answers']['text'][0]})
dataset = dataset.remove_columns("answers")
print(dataset)
# Map the preprocessing function across our dataset
tokenized_dataset = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names, batched=True)

Map:   0%|          | 0/2019 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2019
    })
})


Map:   0%|          | 0/2019 [00:00<?, ? examples/s]

In [13]:
# evaluate - entire dataset
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   eval_dataset=tokenized_dataset["train"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)
trainer.evaluate()



{'eval_loss': 0.08509615808725357,
 'eval_rouge1': 0.7820091172393914,
 'eval_rouge2': 0.6933126996229277,
 'eval_rougeL': 0.7801189602524832,
 'eval_rougeLsum': 0.7798700108974759,
 'eval_runtime': 317.0904,
 'eval_samples_per_second': 6.367,
 'eval_steps_per_second': 1.593}

In [14]:
prefix = "Please answer this question: "
input_text = prefix + 'How to avoid covid-19?'
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(input_ids=input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Reduce or eliminate covid-19 transmission
