In [1]:
import pandas as pd

# Path (relative to this notebook) of the Excel workbook read below.
filename = '../../data/BioPubSum/BioPubSumm_test_fill.xlsx'
df = pd.read_excel(filename)

# Extract the two columns used later as plain Python lists: 'Abstract' is the
# model input text, 'New_Highlight' is the reference passed to the ROUGE scorer.
abstracts, raw_highlights = (
    df[column].tolist() for column in ('Abstract', 'New_Highlight')
)

#### simple inference

In [None]:
import torch
import torch.nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import rouge

# ROUGE scorer reporting rouge-1, rouge-2 (max_n=2) and rouge-l, with no
# length limit and no stemming applied.
r_computer = rouge.Rouge(
    metrics=['rouge-n', 'rouge-l'],
    max_n=2,
    limit_length=False,
    alpha=0.5,
    stemming=False,
)

# Seq2seq checkpoints that can be evaluated with this notebook.
checkpoints = [
    'google/flan-t5-base',
    'google/flan-t5-large',
    'facebook/bart-large',
    't5-base',
    't5-large',
]

# Index 3 selects 't5-base'. The tokenizer and the model are loaded from the
# same entry so that they always match.
selected_checkpoint = checkpoints[3]
tokenizer = AutoTokenizer.from_pretrained(selected_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(selected_checkpoint)

In [3]:
# Candidate instruction prefixes that are prepended to each abstract before
# tokenization. Every entry ends with ': ' so the abstract can be concatenated
# directly. The prompt text is fed to the model verbatim, so spelling matters:
# 'higlight' and 'mainly work' were corrected to 'highlight' and 'main work'.
task_prefixs = [
    'Summarize: ',
    'Summarize the highlight: ',
    'Summarize the main work: ',
    'Extract the highlight: ',
    'Extract the main work: ',
]

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset over an already tokenized batch.

    ``inputs`` must support ``inputs['input_ids']`` and
    ``inputs['attention_mask']``; both entries are indexed by row, and the
    ``'input_ids'`` entry must expose ``.shape``.
    """

    def __init__(self, inputs):
        """Keep a reference to the tokenized inputs (no copy is made)."""
        self.inputs = inputs

    def __len__(self):
        """Return the number of rows, i.e. the first dimension of ``input_ids``."""
        num_examples = self.inputs['input_ids'].shape[0]
        return num_examples

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return tuple(
            self.inputs[field][idx] for field in ('input_ids', 'attention_mask')
        )

In [5]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Move the model's weights to the chosen device.
model = model.to(device)

* Inference based on `model.generate`

In [6]:
# Prompt variant 0 is prepended to every abstract.
task_prefix = task_prefixs[0]
texts = list(map(lambda abstract: task_prefix + abstract, abstracts))

# Tokenize the whole corpus in one call: pad, truncate to at most 512 tokens
# and return PyTorch tensors.
inputs = tokenizer(
    texts,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

In [7]:
import torch

# Wrap the tokenized corpus so it can be served in batches of 32 rows.
infer_data = MyDataset(inputs)
infer_dataloader = DataLoader(dataset=infer_data, batch_size=32)

In [None]:
from tqdm import tqdm

# Number of full generation passes over the data. Decoding is stochastic
# (do_sample=True), so one ROUGE F-score per pass is collected in the lists
# below instead of relying on a single run.
epochs = 4
rouge_1, rouge_2, rouge_l = [], [], []

for epoch in range(epochs):
    # Generated summaries of the current pass, in dataloader order.
    generate_contexts = []
    # The loop variable must not be called `inputs`: that name holds the
    # tokenized corpus wrapped by MyDataset, and overwriting it with a batch
    # tuple breaks re-running the dataset cell after this one.
    for batch in tqdm(infer_dataloader):
        input_ids, attention_mask = batch
        output_ids = model.generate(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            do_sample=True,
            top_k=100,
            top_p=0.92,
            temperature=0.9,
            repetition_penalty=1.5,
            max_new_tokens=120,
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        generate_contexts.extend(outputs)

    # Score this pass against the reference highlights and keep the F-scores.
    scores = r_computer.get_scores(generate_contexts, raw_highlights)
    rouge_1.append(scores['rouge-1']['f'])
    rouge_2.append(scores['rouge-2']['f'])
    rouge_l.append(scores['rouge-l']['f'])

In [140]:
import numpy as np

# Mean F-score over all generation passes, one value per line in the order
# rouge-1, rouge-2, rouge-l.
print(
    *(np.mean(np.array(per_pass)) for per_pass in (rouge_1, rouge_2, rouge_l)),
    sep='\n',
)

0.20162738575411265
0.053923577833063854
0.1784105552528071
