In [34]:
# !pip install evaluate rouge_score absl-py

In [35]:
import numpy as np
from datasets import load_dataset


# Load the test split of the news-summary dataset, then keep a fixed
# (seeded) 500-row sample restricted to the article and summary columns.
news = load_dataset(path="argilla/news-summary", split="test")
df = (
    news.to_pandas()
    .sample(n=500, random_state=42)
    [["text", "prediction"]]
)

In [36]:
# Each raw `prediction` entry is indexed as x[0]["text"] to obtain the summary
# string. Values that are already plain strings are passed through untouched
# so that re-running this cell does not fail on the flattened column.
df["prediction"] = df["prediction"].map(
    lambda x: x if isinstance(x, str) else x[0]["text"]
)

In [37]:
### Split the data 60% / 20% / 20% into train / valid / test.
# Positional slicing is used instead of np.split: calling np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [38]:
# Display the held-out test split as the cell output.
test

Unnamed: 0,text,prediction
4373,BUDAPEST (Reuters) - A Hungarian European Parl...,Hungary charges Jobbik MEP with spying on EU f...
19455,WASHINGTON (Reuters) - The U.S. Air Force aske...,U.S. Air Force asks industry for proposals to ...
17237,(Reuters) - A third of Republican voters who s...,Exclusive: Blocking Trump could hurt Republica...
10842,"BAGHDAD/ERBIL, Iraq (Reuters) - Opposition gro...",Opposition groups quit Iraqi Kurdish governmen...
18837,CLEVELAND/NEW YORK - Bracing for a general ele...,Democrats gird for fight with Trump in U.S. Ru...
...,...,...
12489,NEW YORK (Reuters) - Prospects that the presid...,U.S. options market not very 'Trumped up' ahea...
4618,(Reuters) - Highlights for U.S. President Dona...,Highlights: The Trump presidency on April 13 a...
3062,WASHINGTON (Reuters) - U.S. lawmakers this wee...,"A year later, U.S. lawmakers still take aim at..."
1272,"BRIGHTON, England (Reuters) - Britain s opposi...",UK's Labour pledges infrastructure nationaliza...


In [39]:
# Preview the first training pair (article cut to 200 characters, summary
# cut to 50 characters) and report the number of rows in each split.
first_article = train.text.iloc[0]
first_summary = train.prediction.iloc[0]
print(f"Source News : {first_article[:200]}")
print(f"Summarization : {first_summary[:50]}")
print(f"Training Data Size : {train.shape[0]}")
print(f"Validation Data Size : {valid.shape[0]}")
print(f"Testing Data Size : {test.shape[0]}")

Source News : GAZA (Reuters) - A decade on, Rawda al-Zaanoun is at last willing to forgive the  gunmen who killed her son during the civil war that split Palestine. It has been painful, but she says it is time.  He
Summarization : How blood money, diplomacy and desperation are reu
Training Data Size : 300
Validation Data Size : 100
Testing Data Size : 100


In [40]:
import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

In [41]:
def make_dataset(data, tokenizer, device):
    """Tokenize articles and reference summaries into a ``TensorDataset``.

    Args:
        data: Table-like object with ``text`` and ``prediction`` columns; only
            ``data.text.tolist()`` and iteration over ``data.prediction`` are
            used.
        tokenizer: Callable tokenizer that also provides ``encode`` (the
            notebook passes a ``BartTokenizer``).
        device: Device that all three tensors are moved to.

    Returns:
        ``TensorDataset(input_ids, attention_mask, labels)``. ``labels`` is
        padded to the longest summary in ``data`` with ``-100``.
    """
    tokenized = tokenizer(
        text=data.text.tolist(),  # all articles as a list of strings
        padding="longest",        # pad to the longest sequence in this call
        truncation=True,          # cut inputs that exceed max_length
        return_tensors="pt",      # return PyTorch tensors
        # Author's note: transformers 4.37.2 capped the length even without
        # this argument, but 4.40.0 requires max_length to be given.
        max_length=256,
    )
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)

    # Encode each summary to a 1-D tensor of token ids. squeeze(0) removes
    # only the leading batch dimension; a bare squeeze() would also collapse a
    # single-token summary to a 0-d tensor, which pad_sequence rejects.
    labels = [
        tokenizer.encode(target, return_tensors="pt").squeeze(0)
        for target in data.prediction
    ]
    # -100 marks padding positions; calc_rouge maps it back to the pad token
    # id before decoding.
    labels = pad_sequence(labels, batch_first=True, padding_value=-100).to(device)
    return TensorDataset(input_ids, attention_mask, labels)

def get_datalodader(dataset, sampler, batch_size):
    """Wrap ``dataset`` in a ``DataLoader`` driven by the given sampler class.

    Args:
        dataset: Dataset to iterate over.
        sampler: Sampler class (not an instance); it is called with
            ``dataset`` to build the sampler.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, sampler=sampler(dataset), batch_size=batch_size)

In [42]:
### Epoch count, batch size, compute device, and tokenizer.
epochs = 3
batch_size = 8
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [43]:
# Show the concrete class of the loaded tokenizer as the cell output.
type(tokenizer)

transformers.models.bart.tokenization_bart.BartTokenizer

In [44]:
# Tokenize every split once, in train / valid / test order.
train_dataset, valid_dataset, test_dataset = (
    make_dataset(split, tokenizer, device) for split in (train, valid, test)
)

# Training batches are drawn in random order; validation and test batches
# keep the dataset order.
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

# First training example: (input_ids, attention_mask, labels).
print(train_dataset[0])

(tensor([    0,  4164, 22447,    36,  1251,    43,   111,    83,  2202,    15,
            6,  8214,  6106,  1076,    12,  1301, 23081,  7928,    16,    23,
           94,  2882,     7, 20184,     5,  1437, 18282,    54,   848,    69,
          979,   148,     5,  2366,   997,    14,  3462, 16398,     4,    85,
           34,    57,  8661,     6,    53,    79,   161,    24,    16,    86,
            4,  1437,    91,    21,   478,    19,    10,  8894,    11,     5,
          124,     4,    91,    21,    10, 26301,     6,  1437,     5,  4431,
           12,   180,    12,   279,    26,    23,    41,   515,    11,  7914,
          343,     7,  2458,     5,   285, 13229,     9,  1232,     9,    82,
          848,    11,     5,   997,     4,  1437,    20,   568,    21,    45,
         1365,   142,     5,  1925,     9,    84,   979,    16,  9761,     4,
          125,    52,    33,   576, 23306,     4,  1405,   979, 18226,     6,
           10,  2997,  1150,     9,    80,     8,    41,  1036,

In [45]:
# Shapes of the first training example, in the order the dataset stores
# them: input_ids, attention_mask, labels.
x, y, z = train_dataset[0]
print(x.shape, y.shape, z.shape, sep="\n")

torch.Size([256])
torch.Size([256])
torch.Size([27])


In [46]:
# Length of the first training example's input_ids tensor.
train_dataset[0][0].shape[0]

256

In [47]:
# Shapes of the first validation example: input_ids, attention_mask, labels.
x, y, z = valid_dataset[0]
print(x.shape, y.shape, z.shape, sep="\n")

torch.Size([256])
torch.Size([256])
torch.Size([25])


In [48]:
# Shapes of the first test example: input_ids, attention_mask, labels.
x, y, z = test_dataset[0]
print(x.shape, y.shape, z.shape, sep="\n")

torch.Size([256])
torch.Size([256])
torch.Size([26])


In [49]:
from torch import optim
from transformers import BartForConditionalGeneration


# Load the pretrained BART checkpoint, move it to the selected device, and
# create an AdamW optimizer over all of its parameters.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5, eps=1e-8)

In [50]:
# Print the model's module hierarchy: each top-level child by name, then up
# to three nested levels beneath it, with a prefix that shows the depth.
def _print_children(module, prefixes):
    """Print the names of ``module``'s children, one level per prefix left."""
    if not prefixes:
        return
    for child_name, child in module.named_children():
        print(prefixes[0], child_name)
        _print_children(child, prefixes[1:])


for main_name, main_module in model.named_children():
    print(main_name)
    _print_children(main_module, ("└", "│  └", "│  │  └"))

model
└ shared
└ encoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
└ decoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
lm_head


In [51]:
import numpy as np
import evaluate
import rouge_score

In [52]:
def calc_rouge(preds, labels):
    """Return the ROUGE-2 entry of the metric for one batch.

    Relies on two notebook globals: ``tokenizer`` and ``rouge_score``. The
    latter must be the object bound by ``evaluate.load("rouge", ...)`` in the
    training cell, so that cell has to run before this function is called.

    Args:
        preds: Array of per-token scores; the arg-max over the last axis is
            taken as the predicted token id.
        labels: Array of reference token ids in which ``-100`` marks padding.

    Returns:
        The value stored under ``"rouge2"`` in the metric's result.
    """
    predicted_ids = preds.argmax(axis=-1)
    # Swap the -100 padding marker for the real pad token id so the
    # references can be decoded.
    reference_ids = np.where(labels == -100, tokenizer.pad_token_id, labels)

    predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_texts = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge_score.compute(
        predictions=predicted_texts, references=reference_texts
    )
    return scores["rouge2"]

In [53]:
def train(model, optimizer, dataloader):
    """Run one training epoch and return the mean batch loss.

    Note: defining this function rebinds the notebook-level name ``train``,
    which an earlier cell assigns to the training DataFrame.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        optimizer: Optimizer stepped once per batch.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for input_ids, attention_mask, labels in dataloader:
        loss = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        ).loss
        batch_losses.append(loss.item())

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(dataloader)

In [54]:
def evaluation(model, dataloader):
    """Compute the mean loss and mean ROUGE-2 over ``dataloader`` without gradients.

    The ROUGE input is the logits of a forward pass that is given the labels
    (``calc_rouge`` takes their arg-max); no separate generation step is run.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        ``(val_loss, val_rouge)``: per-batch loss and per-batch ``calc_rouge``
        result, each summed and divided by ``len(dataloader)``. The loss is a
        plain Python float, matching what ``train`` returns.
    """
    model.eval()
    val_loss, val_rouge = 0.0, 0.0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            # calc_rouge works on NumPy arrays, so move both to the CPU first.
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()

            # .item() keeps the running total a Python float instead of a
            # tensor living on the model's device.
            val_loss += outputs.loss.item()
            val_rouge += calc_rouge(logits, label_ids)

    num_batches = len(dataloader)
    return val_loss / num_batches, val_rouge / num_batches

In [55]:
from tqdm import tqdm

In [57]:
# Bind the ROUGE metric to the global name that calc_rouge looks up. This
# replaces the `rouge_score` module imported earlier, so this cell must run
# before evaluation() is called.
rouge_score = evaluate.load("rouge", tokenizer=tokenizer)

# Start from +inf so the first epoch always counts as an improvement,
# whatever the scale of the loss.
best_loss = float("inf")
for epoch in tqdm(range(epochs)):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_rouge = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Rouge {val_rouge:.4f}")

    # Checkpoint only when the validation loss improves.
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "BartForConditionalGeneration.pt")
        print("Saved the model weights")

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1: Train Loss: 1.6907 Val Loss: 1.7039 Val Rouge 0.2861


 33%|███▎      | 1/3 [04:49<09:38, 289.41s/it]

Saved the model weights


 67%|██████▋   | 2/3 [10:17<05:12, 312.14s/it]

Epoch 2: Train Loss: 1.2185 Val Loss: 1.8984 Val Rouge 0.2534


 67%|██████▋   | 2/3 [13:54:49<6:57:24, 25044.99s/it]


KeyboardInterrupt: 

In [None]:
# Optional: uncomment to rebuild the model and load saved weights instead of
# evaluating whatever weights `model` currently holds in memory.
# NOTE(review): the training cell saves to "BartForConditionalGeneration.pt"
# in the working directory, while the path below points at "../models/" —
# confirm where the checkpoint actually lives before enabling this.
# model = BartForConditionalGeneration.from_pretrained(
#     pretrained_model_name_or_path="facebook/bart-base"
# ).to(device)
# model.load_state_dict(torch.load("../models/BartForConditionalGeneration.pt"))

# Mean loss and mean ROUGE-2 over the held-out test split.
test_loss, test_rouge_score = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test ROUGE-2 Score : {test_rouge_score:.4f}")

In [None]:
from transformers import pipeline


# Build a summarization pipeline around the current model and tokenizer; it
# is configured to run on the CPU with max_length=54.
summarizer = pipeline(
    "summarization", model=model, tokenizer=tokenizer, max_length=54, device="cpu"
)

# Compare the reference summary with the model output for the first five
# test articles.
for row in range(5):
    reference_summary = test.prediction.iloc[row]
    generated_summary = summarizer(test.text.iloc[row])[0]["summary_text"]
    print(f"정답 요약문 : {reference_summary}")
    print(f"모델 요약문 : {generated_summary}\n")