# Installations

In [1]:
!pip install torchmetrics
!pip install transformers[sentencepiece]
!pip install datasets
!pip install evaluate
!pip install accelerate -U
!pip install transformers[torch]
!pip install seqeval
!pip install rouge_score
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.

# Imports and Google Drive mount

In [2]:
import pandas as pd
from google.colab import drive
import os
from ast import literal_eval
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorWithPadding, \
TrainingArguments, Trainer, DataCollatorForTokenClassification, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, \
Seq2SeqTrainer
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Value, Sequence, concatenate_datasets
import numpy as np
import evaluate
from nltk.tokenize import sent_tokenize
import nltk
from torchmetrics.text.rouge import ROUGEScore

In [3]:
# Mount Google Drive
drive.mount('/content/drive')
# Work from the project's T5 directory so the relative paths used later in the
# notebook ("../data/...", "./saved-models/...") resolve inside the mounted Drive.
# NOTE(review): this absolute path is specific to one Drive layout.
os.chdir('/content/drive/MyDrive/23_spring/CS-263-NLP/final-project/NLP-final-project/implementation/T5')

Mounted at /content/drive


# Data preprocessing

## Utility functions

In [4]:
# takes a list of claims and converts it to a single string
def claim_list_to_string(claim_list):
  """Concatenate claims into one string, appending "." to every claim.

  No separator is inserted between claims, so ["a", "b"] becomes "a.b.".
  An empty list yields an empty string.
  """
  return "".join(claim + "." for claim in claim_list)

# reformat claim column from a claim list to a string
def reformat_claims(dataset):
  """Add a "labels" column holding each row's claim list as a single string.

  Args:
    dataset: object exposing a "claims" column via dataset['claims'] and an
      add_column(name, values) method (a datasets.Dataset in this notebook).

  Returns:
    The result of dataset.add_column("labels", ...), where each value is
    claim_list_to_string applied to that row's claim list.
  """
  # Read the column a single time; indexing dataset['claims'] inside a per-row
  # loop looks the whole column up again on every iteration.
  claims = [claim_list_to_string(claim_list) for claim_list in dataset['claims']]

  return dataset.add_column("labels", claims)

# add label column to train and dev datasets
def add_labels(datasets):
  """Attach the "labels" column to the "train" and "dev" splits.

  The mapping is updated in place and is also returned.
  """
  for split in ("train", "dev"):
    datasets[split] = reformat_claims(datasets[split])

  return datasets

# prepend the "Extract the claims: " instruction prefix to the tweet
def add_input_formatting(tweet):
  """Return the tweet prefixed with the "Extract the claims: " instruction."""
  return "Extract the claims: " + tweet

# add an "inputs" column to the dataset holding the prompt-formatted tweets
def add_formatted_inputs(dataset):
  """Add an "inputs" column containing the prompt-formatted tweets.

  Args:
    dataset: object exposing a "tweets" column via dataset["tweets"] and an
      add_column(name, values) method (a datasets.Dataset in this notebook).

  Returns:
    The result of dataset.add_column("inputs", ...), where each value is
    add_input_formatting applied to that row's tweet.
  """
  # Read the column a single time; indexing dataset["tweets"] inside a per-row
  # loop looks the whole column up again on every iteration.
  inputs = [add_input_formatting(tweet) for tweet in dataset["tweets"]]

  return dataset.add_column("inputs", inputs)

## Load tokenizer and datasets

In [5]:
# load the tokenizer that matches the pretrained checkpoint
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# these CSV columns are parsed with literal_eval when the files are read
# (presumably they hold stringified Python lists - confirm against the data)
literal_columns = ('tokens', 'span_start_index', 'span_end_index', 'claims')
csv_converters = {column: literal_eval for column in literal_columns}

# load the train and dev splits
df_train = pd.read_csv("../data/train-T5.csv", converters=csv_converters)
df_dev = pd.read_csv("../data/dev-T5.csv", converters=csv_converters)

# wrap the dataframes as datasets
train_dataset = Dataset.from_dict(df_train)
dev_dataset = Dataset.from_dict(df_dev)

print(train_dataset)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Dataset({
    features: ['tokens', 'claim_label', 'span_start_index', 'span_end_index', 'tweets', 'claims'],
    num_rows: 6044
})


In [6]:
# add the string-valued "labels" column (built from "claims") to both datasets
# NOTE(review): the datasets are rebound to the same names, so re-running this
# cell operates on data that already has "labels" - presumably add_column then
# fails; confirm and re-run from the loading cell if needed.
train_dataset = reformat_claims(train_dataset)
dev_dataset = reformat_claims(dev_dataset)

In [7]:
# add the "inputs" column: each tweet prefixed with "Extract the claims: "
# (same rebinding caveat as the labels cell: not meant to be run twice)
train_dataset = add_formatted_inputs(train_dataset)
dev_dataset = add_formatted_inputs(dev_dataset)

In [8]:
# preview a handful of formatted inputs instead of dumping the whole column
print(train_dataset[:5]["inputs"])



## Tokenize and edit dataset

In [46]:
# tokenize datasets
# token budget shared by the model inputs and the targets
max_input_length = max_target_length = 512

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    examples["inputs"] is tokenized as the model input and examples["labels"]
    as the target, each truncated to its max length. The target token ids are
    stored under "labels" in the returned encoding.
    """
    encoded = tokenizer(
        examples["inputs"], truncation=True, max_length=max_input_length
    )
    target_encoding = tokenizer(
        examples["labels"], truncation=True, max_length=max_target_length
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# apply the tokenization to both splits; batched=True hands preprocess_function
# batches of rows rather than one row at a time
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6044 [00:00<?, ? examples/s]

Map:   0%|          | 0/756 [00:00<?, ? examples/s]

In [10]:
# show the columns and row count after tokenization
print(tokenized_train_dataset)

Dataset({
    features: ['tokens', 'claim_label', 'span_start_index', 'span_end_index', 'tweets', 'claims', 'labels', 'inputs', 'input_ids', 'attention_mask'],
    num_rows: 6044
})


# Training

## Evaluation metrics setup

In [11]:
# download the 'punkt' data for nltk; compute_metrics calls sent_tokenize
nltk.download('punkt')
# module-level scorer; compute_metrics below creates its own ROUGEScore
# instance, so this one is not used by it
rouge_scorer = ROUGEScore(use_stemmer=True)

def compute_metrics(eval_pred):
    """Compute ROUGE scores for generated predictions against reference labels.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays as passed
            by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        The dict produced by ``ROUGEScore`` (rouge1 / rouge2 / rougeL /
        rougeLsum, each with fmeasure, precision and recall).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks padded/ignored positions and cannot be decoded, so swap it for
    # the pad token in both the predictions and the labels
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference token ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Use a fresh scorer for every call (as the original did) instead of the
    # module-level one
    scorer = ROUGEScore(use_stemmer=True)
    return scorer(preds=decoded_preds, target=decoded_labels)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# Sanity check: score two toy prediction/target pairs to see which keys the
# scorer returns and that the values look plausible.
rouge_scorer = ROUGEScore(use_stemmer=True)

# single-pair variant, kept for reference:
# scores = rouge_scorer("i hate this",
#                       "i don't hate this")

scores = rouge_scorer(
    preds=["i hate this", "i do too"], target=["i hate this", "i don't too"]
)
scores
# scores["rouge1"]

{'rouge1_fmeasure': tensor(0.7857),
 'rouge1_precision': tensor(0.8333),
 'rouge1_recall': tensor(0.7500),
 'rouge2_fmeasure': tensor(0.5000),
 'rouge2_precision': tensor(0.5000),
 'rouge2_recall': tensor(0.5000),
 'rougeL_fmeasure': tensor(0.7857),
 'rougeL_precision': tensor(0.8333),
 'rougeL_recall': tensor(0.7500),
 'rougeLsum_fmeasure': tensor(0.7857),
 'rougeLsum_precision': tensor(0.8333),
 'rougeLsum_recall': tensor(0.7500)}

In [None]:
# metric = evaluate.load("seqeval")
# label_names = ["O", "B", "I"]

# # compute evaluation metrics from model predictions
# def compute_metrics(eval_preds):
#   logits, labels = eval_preds
#   predictions = np.argmax(logits, axis=-1)

#   true_labels = [[label_names[l] for l in label if l != -100] for label in labels]

#   true_predictions = [
#         [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#   ]

#   all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

#   return {
#       "precision": all_metrics["overall_precision"],
#       "recall": all_metrics["overall_recall"],
#       "f1": all_metrics["overall_f1"],
#       "accuracy": all_metrics["overall_accuracy"],
#   }

## Splitting the dataset

In [None]:
#split up datasets into train/val/test
# test_size = val_size = 0.111
# all_examples = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["test"]])
# train_test_dataset = all_examples.train_test_split(test_size=test_size)
# train_val_dataset = train_test_dataset["train"].train_test_split(test_size=val_size)
# final_datasets = DatasetDict({"train":train_val_dataset["train"], \
#                               "validation":train_val_dataset["test"], \
#                               "test":train_test_dataset["test"]})

# print(final_datasets)

## Training

In [47]:
# Training hyperparameters and trainer configuration.
batch_size = 8
num_train_epochs = 2
# Show the training loss with every epoch
# (number of rows // batch size; assumes one device and no gradient
# accumulation - TODO confirm)
logging_steps = len(train_dataset) // batch_size

args = Seq2SeqTrainingArguments(
    output_dir="./saved-models/t5-finetune-3-no_qa_formatting",
    # no evaluation during training; evaluation is run explicitly afterwards
    evaluation_strategy="no",
    learning_rate=5e-7,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # NOTE(review): with save_strategy="no" below, this checkpoint limit
    # presumably has no effect - confirm
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # generate sequences during evaluate/predict so compute_metrics receives
    # token ids it can decode
    predict_with_generate=True,
    logging_steps=logging_steps,
    # no intermediate checkpoints; the model is saved explicitly later
    save_strategy="no"
)

In [14]:
# load the pretrained seq2seq model for the chosen checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# collator used to batch the tokenized examples for this model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# drop the raw columns that are not needed for training; the guard skips the
# removal when "tokens" is already gone (note: "inputs" is left in place)
columns_to_drop = ["tokens", "claim_label", "span_start_index", "span_end_index", "tweets", "claims"]
if "tokens" in tokenized_train_dataset.column_names:
  tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_drop)
  tokenized_dev_dataset = tokenized_dev_dataset.remove_columns(columns_to_drop)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
# confirm which columns remain after the raw columns were removed
print(tokenized_train_dataset)

Dataset({
    features: ['labels', 'inputs', 'input_ids', 'attention_mask'],
    num_rows: 6044
})


In [49]:
# Build the trainer: train on the tokenized train split, evaluate on the
# tokenized dev split, and score generations with compute_metrics.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# NOTE(review): training is commented out, so on a fresh kernel the save and
# evaluation cells below run on the model exactly as loaded from the
# checkpoint. Uncomment to fine-tune before saving/evaluating.
# trainer.train()

## Save model locally

In [28]:
# NOTE(review): this directory ends in "no_qa-formatting" while output_dir in
# the training arguments ends in "no_qa_formatting" - confirm which is intended.
trainer.save_model("./saved-models/t5-finetune-3-no_qa-formatting")

# Evaluation

In [50]:
# evaluate on the trainer's eval_dataset (the tokenized dev split) and keep the
# resulting metrics
evaluate_results = trainer.evaluate()

In [51]:
# print every evaluation metric as "name; value"
for metric_name, metric_value in evaluate_results.items():
  print(metric_name + "; " + str(metric_value))

eval_loss; 0.25250178575515747
eval_rouge1_fmeasure; 0.7750066518783569
eval_rouge1_precision; 0.8715553283691406
eval_rouge1_recall; 0.7415154576301575
eval_rouge2_fmeasure; 0.7498223185539246
eval_rouge2_precision; 0.8504735827445984
eval_rouge2_recall; 0.7188305854797363
eval_rougeL_fmeasure; 0.7717560529708862
eval_rougeL_precision; 0.8675180077552795
eval_rougeL_recall; 0.7384121417999268
eval_rougeLsum_fmeasure; 0.7719897031784058
eval_rougeLsum_precision; 0.8678272366523743
eval_rougeLsum_recall; 0.7386103868484497
eval_runtime; 47.9105
eval_samples_per_second; 15.779
eval_steps_per_second; 1.983


In [52]:
# make predictions on the tokenized dev split; the result is indexed later as
# [0] predictions, [1] labels and [2] metrics
predict_results = trainer.predict(tokenized_dev_dataset)

In [81]:
# print every prediction metric as "name; value"
for metric_name, metric_value in predict_results[2].items():
  print(metric_name + "; " + str(metric_value))

test_loss; 0.25250178575515747
test_rouge1_fmeasure; 0.7750066518783569
test_rouge1_precision; 0.8715553283691406
test_rouge1_recall; 0.7415154576301575
test_rouge2_fmeasure; 0.7498223185539246
test_rouge2_precision; 0.8504735827445984
test_rouge2_recall; 0.7188305854797363
test_rougeL_fmeasure; 0.7717560529708862
test_rougeL_precision; 0.8675180077552795
test_rougeL_recall; 0.7384121417999268
test_rougeLsum_fmeasure; 0.7719897031784058
test_rougeLsum_precision; 0.8678272366523743
test_rougeLsum_recall; 0.7386103868484497
test_runtime; 48.2317
test_samples_per_second; 15.674
test_steps_per_second; 1.97


## Save incorrect examples

In [82]:
# takes input examples and prediction results --> returns df of incorrectly predicted examples
def return_incorrect_examples(input_dataset, predict_results):
  """Collect the examples whose decoded prediction differs from the decoded label.

  Args:
    input_dataset: dataset with a "tweets" column, expected to be in the same
      row order as the predictions.
    predict_results: sequence whose first item holds the predicted token ids
      and whose second item holds the label token ids.

  Returns:
    pandas.DataFrame with "tweets", "labels" and "predictions" columns, one row
    per example where the decoded strings are not identical.
  """
  predictions, labels = predict_results[0], predict_results[1]
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  # -100 marks padded/ignored positions and cannot be decoded, so swap it for
  # the pad token in both the predictions and the labels
  predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  # Decode predictions and labels into text
  decoded_preds = np.array(tokenizer.batch_decode(predictions, skip_special_tokens=True))
  decoded_labels = np.array(tokenizer.batch_decode(labels, skip_special_tokens=True))

  # positions where the decoded prediction is not exactly the decoded label
  mismatch_indices = np.where(decoded_preds != decoded_labels)

  tweets = np.array(input_dataset["tweets"])
  return pd.DataFrame({"tweets":tweets[mismatch_indices], "labels":decoded_labels[mismatch_indices], "predictions":decoded_preds[mismatch_indices]})

In [85]:

# Use the untokenized dev_dataset here: it still has the "tweets" column,
# which was removed from tokenized_dev_dataset.
result = return_incorrect_examples(dev_dataset, predict_results)

# write the mismatched examples next to the data files (the target directory
# must already exist)
result.to_csv('../data/incorrectly-classified/t5-finetune-3-no_qa_formatting.csv', index=False)
# print(dev_dataset["tweets"][6])
# # print(predict_results)

In [77]:
# show the columns and row count of the untokenized dev split
print(dev_dataset)

Dataset({
    features: ['tokens', 'claim_label', 'span_start_index', 'span_end_index', 'tweets', 'claims', 'labels', 'inputs'],
    num_rows: 756
})
