In [1]:
import nltk
import pandas as pd
from transformers import AutoTokenizer
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_metric

In [2]:
# Widen pandas' display limits so the long passage / answer strings can be read in full.
# Every option is addressed by its full canonical name: abbreviated names are resolved
# by pattern matching and stop working as soon as a second option matches the pattern.
pd.set_option('display.max_columns', None)         # no cap on displayed columns
pd.set_option('display.max_rows', None)            # no cap on displayed rows
pd.set_option('display.max_seq_items', None)       # do not elide long sequences
pd.set_option('display.max_colwidth', 500)         # show up to 500 characters per cell
pd.set_option('display.expand_frame_repr', True)   # let wide frames wrap over several lines

In [3]:
# Identifier of the pretrained checkpoint that the tokenizer and model are loaded from.
model_checkpoint = "castorini/monot5-base-msmarco"

In [4]:
# Read the train / dev / test splits from their CSV files, in that order.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/dev.csv', 'data/test.csv')
)

In [5]:
# Preview three random training rows; the fixed seed keeps the preview identical
# across kernel restarts so the displayed output stays reproducible.
train.sample(n=3, random_state=0)

Unnamed: 0,qid,text,output
19045,C_50898b4862fb42f5b989ea579db6fb46_0_q#5,"Question Answering: What were the results from the incident when a female fan claimed that Oliver Sykes of Bring Me the Horizon had urinated on her? <extra_id_0> With a plan to re-record some of the older, classic Visage tracks as well as produce some new material, the project never seemed to fully get off the ground despite some television appearances. The first Visage Mk II song was called ""Diary of A Madman"", which was made available for download in 2007 in return for a donation to the Ch...",false unanswerable
19454,C_52412285718442718f3b3dfb204d9377_0_q#5,"Question Answering: when was the first football bowl game? <extra_id_0> During Bobby Dodd's tenure, Georgia Tech played against several integrated football teams while the South was resisting integration. Georgia Tech played against Notre Dame in 1953 with Wayne Edmonds starting at offensive tackle and defensive end for the Irish. Edmonds was the first black player to win a monogram at Notre Dame. Georgia Tech lost to Notre Dame 27-14. Georgia Tech also participated in the first integrated b...",true CANNOTANSWER
24232,C_6408f479d3b64aff99516b76dc39a478_1_q#2,"Question Answering: Did any of Mick Taylor's bands record any singles or albums? <extra_id_0> Taylor was born to a working-class family in Welwyn Garden City, but was raised in Hatfield, Hertfordshire, England, where his father worked as a fitter (machinist) for the De Havilland aircraft company. He began playing guitar at age nine, learning to play from his mother's younger brother. As a teenager, he formed bands with schoolmates and started performing concerts under names such as The Junio...",true They also appeared on television and put out a single.


In [6]:
# Load the tokenizer and the seq2seq model for `model_checkpoint`.
# Both objects are module-level globals used by the preprocessing, collator,
# metrics and trainer cells further down.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [7]:
# Wrap each pandas split in a `Dataset` and gather them under their split names.
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train),
    'dev': Dataset.from_pandas(dev),
    'test': Dataset.from_pandas(test),
})

In [8]:
# Truncation limits (in tokens) for the encoder input and the decoder target.
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    """Tokenize a batch of rows into encoder inputs and decoder labels.

    Args:
        examples: batch mapping that provides a "text" column (model input)
            and an "output" column (target string).

    Returns:
        The tokenizer encoding of "text" with an extra "labels" entry holding
        the token ids of "output". Sequences are truncated to
        ``max_input_length`` / ``max_target_length`` and are left unpadded.
    """
    # No padding here on purpose: padding inside `map` pads to the longest row
    # of the whole map batch and fills the labels with the pad token id, which
    # the loss then treats as real targets. The seq2seq data collator pads each
    # training batch instead and marks label padding with -100 (the value the
    # metrics code already expects to find in the labels).
    model_inputs = tokenizer(list(examples["text"]), max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Tokenize every split; with batched=True the function receives a batch of rows per call.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
)

  0%|          | 0/64 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [10]:
batch_size = 16
# Derive the run directory from the checkpoint name (the part after the last "/").
model_name = model_checkpoint.split("/")[-1]
output_path = f"{model_name}-finetuned-monoQA"
# Output directories used for other experiment variants:
# output_path = f"{model_name}-finetuned-monoQA-rep-HDN-CONVDR-Question-Answering-100sample"
# output_path = f"{model_name}-finetuned-monoQA-rep-HDN-CONVDR-Question-Passage-100sample"

# Evaluation, checkpointing and logging all happen once per epoch.
args = Seq2SeqTrainingArguments(
    output_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=10,
    num_train_epochs=10,
    predict_with_generate=True,
    # Let evaluation generate as many tokens as the targets may contain.
    # Without this, generation falls back to the model's default maximum length,
    # which can cut answers short before ROUGE is computed.
    generation_max_length=max_target_length,
    fp16=False,
    push_to_hub=False,
    save_strategy="epoch",
    logging_dir=output_path + '/logs',
    logging_strategy="epoch",
    # No `metric_for_best_model` is given, so the "best" checkpoint is chosen by
    # the trainer's default criterion (presumably evaluation loss; confirm).
    load_best_model_at_end=True,
)

In [11]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ROUGE scorer used by the metrics callback.
metric = load_metric("rouge")

In [12]:
def split_tasks(text):
    """Split a decoded sequence into its relevance label and its answer text.

    Sequences are expected to look like ``"true <answer>"`` or
    ``"false <answer>"``.

    Args:
        text: decoded prediction or reference string.

    Returns:
        A ``(relevance, answer)`` tuple. When the first whitespace-separated
        token is ``"true"`` or ``"false"`` it becomes the relevance label and
        the remaining tokens, joined by single spaces, become the answer
        (an empty string when nothing follows the label). Otherwise the
        relevance defaults to ``"false"`` and ``text`` is returned unchanged
        as the answer.
    """
    tokens = text.split()
    # Compare the first token rather than using startswith('true '), so that a
    # bare "true" / "false" with no answer text still counts as a relevance label.
    if tokens and tokens[0] in ('true', 'false'):
        return tokens[0], ' '.join(tokens[1:])
    return 'false', text

In [13]:
def compute_metrics_multitask(eval_pred):
    """Score generated sequences on both tasks: answer ROUGE and relevance accuracy.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays, as passed
            to the trainer's ``compute_metrics`` callback. ``predictions`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        Dict of metrics rounded to two decimals: one entry per ROUGE variant
        (mid F-measure, scaled to 0-100) computed on the answer parts, plus
        ``'Rel Acc'``, the accuracy (0-100) of the relevance labels.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Each decoded sequence becomes a (relevance, answer) pair.
    pred_pairs = [split_tasks(pred) for pred in decoded_preds]
    label_pairs = [split_tasks(label) for label in decoded_labels]

    rel_preds = [rel for rel, _ in pred_pairs]
    rel_labels = [rel for rel, _ in label_pairs]
    ans_preds = [ans for _, ans in pred_pairs]
    ans_labels = [ans for _, ans in label_pairs]

    # ROUGE is computed on the answer text only, relevance label excluded.
    result = metric.compute(predictions=ans_preds, references=ans_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    result['Rel Acc'] = accuracy_score(rel_labels, rel_preds) * 100

    return {k: round(v, 2) for k, v in result.items()}

In [14]:
# Assemble the trainer: train on the "train" split, evaluate on the "dev" split,
# and score evaluation output with the multitask metrics callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_multitask,
)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: output, qid, text. If output, qid, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 63052
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 39410


Epoch,Training Loss,Validation Loss
