In [1]:
import pandas as pd

filename = '../../data/BioPubSum/BioPubSumm_test_fill.xlsx'
df = pd.read_excel(filename)
abstracts = df['Abstract'].tolist()
raw_highlights = df['New_Highlight'].tolist()

In [2]:
import torch
import torch.nn
from transformers import AutoTokenizer, GPT2LMHeadModel
import rouge

r_computer = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], limit_length=False, max_n=2, alpha=0.5, stemming=False)

checkpoints = [
    'gpt2'
]
tokenizer = AutoTokenizer.from_pretrained(checkpoints[0])
model = GPT2LMHeadModel.from_pretrained(checkpoints[0])

In [3]:
# when inference, padding_side set left
# when training, padding_side set right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [4]:
task_prefixs = [
    'Summarize: ',
    'Summarize the higlight: ',
    'Summarize the mainly work: ',
    'Extract the highlight: ',
    'Extract the mainly work: ',
]

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        return self.inputs['input_ids'].shape[0]

    def __getitem__(self, idx):
        return self.inputs['input_ids'][idx], self.inputs['attention_mask'][idx]

#### generate

In [6]:
task_prefix = task_prefixs[2]
texts = [task_prefix + abstract for abstract in abstracts]

inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

In [7]:
infer_data = MyDataset(inputs)
infer_dataloader = DataLoader(infer_data, batch_size=8)

In [None]:
from tqdm import tqdm

epochs = 4
rouge_1, rouge_2, rouge_l = [], [], []

for epoch in range(epochs):
    generate_contexts = []
    for i, inputs in enumerate(tqdm(infer_dataloader)):
        input_ids, attention_mask = inputs
        output_ids = model.generate(
            input_ids = input_ids.to(device),
            attention_mask = attention_mask.to(device),
            do_sample=True,
            top_k=100,
            top_p=0.92,
            temperature=0.9,
            repetition_penalty=1.5,
            max_new_tokens=100,
            pad_token_id = tokenizer.eos_token_id
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        generate_contexts.extend(outputs)
        
    scores = r_computer.get_scores(generate_contexts, raw_highlights)
    rouge_1.append(scores['rouge-1']['f'])
    rouge_2.append(scores['rouge-2']['f'])
    rouge_l.append(scores['rouge-l']['f'])

In [27]:
import numpy as np

print(np.mean(np.array(rouge_1)))
print(np.mean(np.array(rouge_2)))
print(np.mean(np.array(rouge_l)))

0.21835580584029676
0.09855973700910847
0.18294217874379967


#### prompt learning

In [12]:
new_contents = [content + prompt for content in contents]

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

inputs = tokenizer.batch_encode_plus(new_contents, return_tensors='pt', padding=True)

In [14]:
generate_ids = model.generate(
    input_ids = inputs['input_ids'].to(device),
    attention_mask = inputs['attention_mask'].to(device),
    do_sample=True,
    top_k=100,
    top_p=0.92,
    repetition_penalty=1.5,
    max_new_tokens=100,
    early_stopping=True,
    pad_token_id = tokenizer.eos_token_id
)

In [15]:
generate_ids.shape

torch.Size([65, 658])

In [None]:
torch.where(generate_ids[0]==tokenizer.eos_token_id)

In [22]:
generate_texts = []

for i in range(generate_ids.shape[0]):
    id = generate_ids[i]
    generate_texts.append(tokenizer.decode(id[-100:], skip_special_tokens=True))