In [1]:
# Authenticate the Colab runtime with the signed-in Google account.
# NOTE(review): presumably needed for Google Cloud Storage access (the recorded
# download output in this notebook shows a gs:// copy) — confirm it is still
# required now that the data cell uses wget.
from google.colab import auth
auth.authenticate_user()

In [2]:
%%capture
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs
!pip install rouge_score

In [8]:
# Fetch `test.tsv` from the vietnews data folder of the ViT5 GitHub repository.
# NOTE(review): this saves `test.tsv`, but the loading cell further down opens
# `test_dedup.tsv`, and the recorded output below comes from a gs:// copy rather
# than wget — confirm how `test_dedup.tsv` is produced before trusting a fresh run.
!wget https://raw.githubusercontent.com/vietai/ViT5/main/data/vietnews/test.tsv

Copying gs://vie_projects/data/vietnews/test.tsv...
| [1 files][ 66.1 MiB/ 66.1 MiB]                                                
Operation completed over 1 objects/66.1 MiB.                                     


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

# Load the tokenizer and the seq2seq weights from the same Hub checkpoint.
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-large-vietnews-summarization")  
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-large-vietnews-summarization")
# Requires a CUDA device. As the last expression of the cell, this also echoes
# the returned module's repr in the cell output.
model.to('cuda')

In [2]:
from datasets import load_metric

# ROUGE scorer.
# NOTE(review): no visible cell reads `metric`; the evaluation cell builds its
# own `metrics` object. Confirm whether this assignment is still needed (the
# `load_metric` import itself is used by the evaluation cell).
metric = load_metric("rouge")


In [3]:
def preprocess_function(examples):
    """Tokenize a batch of source texts and their reference summaries.

    Args:
        examples: Batch mapping with an ``"inputs"`` list (source texts) and a
            ``"labels"`` list (target summaries), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the source texts, with an added ``"labels"``
        entry holding the token ids of the target summaries.
    """
    # Truncate only. Padding is deliberately left to DataCollatorForSeq2Seq,
    # which pads each DataLoader batch to its own longest sequence; padding here
    # would stretch every example to the longest one in the whole map() chunk
    # and make generation process far more pad tokens than necessary.
    model_inputs = tokenizer(examples["inputs"], max_length=1024, truncation=True)
    labels = tokenizer(examples["labels"], max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
input_lines = []
label_lines = []
# NOTE(review): the download cell saves `test.tsv`; confirm where
# `test_dedup.tsv` comes from before relying on a fresh run.
# Decode explicitly as UTF-8 (presumably the file's encoding — it holds
# Vietnamese text) instead of depending on the platform default.
with open('test_dedup.tsv', encoding='utf-8') as file:
  for line_number, line in enumerate(file, start=1):
    fields = line.strip().split('\t')
    if fields == ['']:
      # Blank line (e.g. a trailing newline at end of file): nothing to score.
      continue
    if len(fields) < 2:
      # Fail with the offending location instead of a bare IndexError.
      raise ValueError(
          f"test_dedup.tsv line {line_number}: expected at least two "
          f"tab-separated fields, got {len(fields)}"
      )
    # First column is the article, second the reference summary; the article
    # gets the "vietnews: " task prefix before tokenization.
    input_lines.append("vietnews: " + fields[0])
    label_lines.append(fields[1])

dict_obj = {'inputs': input_lines, 'labels': label_lines}

dataset = Dataset.from_dict(dict_obj)
# The raw 'inputs' text column is dropped after tokenization; the text 'labels'
# column is overwritten by the token-id 'labels' that preprocess_function returns.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)
# Collates examples into PyTorch tensors, one batch at a time.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

           

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/3 [00:00<?, ?ba/s]

In [5]:
import torch 
import numpy as np
# Fresh ROUGE accumulator for this evaluation run.
metrics = load_metric('rouge')

max_target_length = 256
dataloader = torch.utils.data.DataLoader(tokenized_datasets, collate_fn=data_collator, batch_size=32)
# Send batches to whichever device the model already lives on rather than
# hard-coding 'cuda' a second time.
device = model.device

predictions = []
references = []
for batch in tqdm(dataloader):
  outputs = model.generate(
      input_ids=batch['input_ids'].to(device),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to(device),
  )
  # Decoding does not depend on the tokenizer's source/target mode, so the
  # deprecated as_target_tokenizer() context is not needed here.
  outputs = tokenizer.batch_decode(outputs, clean_up_tokenization_spaces=False, skip_special_tokens=True)

  # -100 marks padded label positions and is not a real token id; swap it for
  # the pad token so that skip_special_tokens drops it during decoding.
  labels = batch['labels'].masked_fill(batch['labels'] == -100, tokenizer.pad_token_id)
  actuals = tokenizer.batch_decode(labels, clean_up_tokenization_spaces=False, skip_special_tokens=True)

  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)


# Last expression of the cell: aggregate ROUGE over every batch added above.
metrics.compute()


  0%|          | 0/704 [00:00<?, ?it/s]

{'rouge1': AggregateScore(low=Score(precision=0.6608337780624861, recall=0.6296166098138688, fmeasure=0.633827313959852), mid=Score(precision=0.6627444766484762, recall=0.6314814876600445, fmeasure=0.6353820985989678), high=Score(precision=0.6645141322533406, recall=0.6335088265551201, fmeasure=0.6371190182439236)),
 'rouge2': AggregateScore(low=Score(precision=0.35423924343247803, recall=0.3396981724884984, fmeasure=0.3412002894085), mid=Score(precision=0.3573770390158726, recall=0.3428770644221921, fmeasure=0.3443803115293873), high=Score(precision=0.3602301060270445, recall=0.3457251750865725, fmeasure=0.34715116190626094)),
 'rougeL': AggregateScore(low=Score(precision=0.45151608626097145, recall=0.4320029643329553, fmeasure=0.43420077604262286), mid=Score(precision=0.4541747495105606, recall=0.43452591527291656, fmeasure=0.4367163464547867), high=Score(precision=0.4567788557435788, recall=0.4372848879130819, fmeasure=0.43930353399696503)),
 'rougeLsum': AggregateScore(low=Score(pr

In [6]:
# Mid-point F-measure per ROUGE variant, scoring the generated summaries
# against the raw reference lines read from the TSV file.
[
    {rouge_name: aggregate.mid.fmeasure}
    for rouge_name, aggregate in metrics.compute(
        predictions=predictions,
        references=label_lines,
    ).items()
]


[{'rouge1': 0.6341253633609143},
 {'rouge2': 0.3426844550733291},
 {'rougeL': 0.43581389091365286},
 {'rougeLsum': 0.4358350606355134}]