In [2]:
"""뉴스 요약 데이터세트 불러오기"""

import numpy as np
from datasets import load_dataset

# Load the news-summary corpus and keep a reproducible 5,000-row sample of the
# two columns used below: the article body and its reference summary.
news = load_dataset("argilla/news-summary", split="test")
df = news.to_pandas().sample(5000, random_state=42)[["text", "prediction"]]
# Each `prediction` cell is indexed as x[0]["text"]; keep only that summary string.
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])

# Shuffle once, then cut 60% / 20% / 20% into train / validation / test.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes path and emits a FutureWarning on recent pandas.
shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.6 * len(df)), int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

print(f"Source News : {train.text.iloc[0][:200]}")
print(f"Summarization : {train.prediction.iloc[0][:50]}")
print(f"Training Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")

Found cached dataset parquet (/Users/yeeun/.cache/huggingface/datasets/argilla___parquet/argilla--news-summary-46ccad7a40bceec1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Source News : DANANG, Vietnam (Reuters) - Russian President Vladimir Putin said on Saturday he had a normal dialogue with U.S. leader Donald Trump at a summit in Vietnam, and described Trump as civil, well-educated
Summarization : Putin says had useful interaction with Trump at Vi
Training Data Size : 3000
Validation Data Size : 1000
Tesing Data Size : 1000


In [3]:
# Display the sampled frame (the notebook renders a truncated preview of it).
df

Unnamed: 0,text,prediction
18006,WASHINGTON (Reuters) - President Barack Obama ...,Obama did not indicate preference for Democrat...
16987,BEIJING (Reuters) - Chinese educational servic...,China's RYB Education fires head of Beijing ki...
6586,CARACAS (Reuters) - Venezuelan President Nicol...,Venezuela's Maduro approval rises to 23 percen...
14737,JOHANNESBURG (Reuters) - Zimbabweans living in...,Zimbabweans in South Africa hope for change at...
856,"Karangasem, INDONESIA (Reuters) - Fears that a...",Bali's rumbling volcano spurs travel warnings ...
...,...,...
15547,JOHANNESBURG (Reuters) - South Africa s ruling...,South Africa's ANC needs to put an end to scan...
3855,WASHINGTON (Reuters) - President Donald Trump ...,"Despite recusal, Trump has confidence in Sessi..."
9734,WASHINGTON/UNITED NATIONS (Reuters) - U.S. Pre...,Trump threatens to cut aid to U.N. members ove...
4113,WASHINGTON (Reuters) - U.S. President Donald T...,Trump's call for military buildup hits bump in...


In [4]:
"""BART 입력 텐서 생성"""

import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

def make_dataset(data, tokenizer, device):
    """Turn a news/summary frame into a TensorDataset placed on `device`.

    Args:
        data: object exposing a `text` column (source articles) and a
            `prediction` column (reference summaries).
        tokenizer: callable tokenizer that also provides `encode`.
        device: device the resulting tensors are moved to.

    Returns:
        TensorDataset of (input_ids, attention_mask, labels), one row per example.
    """
    encoded_sources = tokenizer(
        text=data.text.tolist(),
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    source_ids = encoded_sources["input_ids"].to(device)
    source_mask = encoded_sources["attention_mask"].to(device)

    # Summaries are encoded one at a time, so they come back with different lengths.
    target_ids = [
        tokenizer.encode(summary, return_tensors="pt").squeeze()
        for summary in data.prediction
    ]
    # Pad labels with -100 so that loss functions such as cross-entropy ignore
    # the padded positions.
    padded_targets = pad_sequence(target_ids, batch_first=True, padding_value=-100)
    return TensorDataset(source_ids, source_mask, padded_targets.to(device))

def get_dataloader(dataset, sampler, batch_size):
    """Wrap `dataset` in a DataLoader whose order is decided by `sampler`.

    Args:
        dataset: dataset to iterate over.
        sampler: sampler class (not an instance); it is instantiated on `dataset`.
        batch_size: number of examples per batch.

    Returns:
        The configured DataLoader.
    """
    return DataLoader(dataset, sampler=sampler(dataset), batch_size=batch_size)

# Training hyperparameters plus the device and tokenizer shared by every split.
epochs = 3
batch_size = 8
device = "mps" if torch.backends.mps.is_available() else "cpu"
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
)

# Each split is tokenized from its own frame and served by its own loader.
# Previously the validation and test datasets were built from `train` and both
# loaders wrapped `train_dataset`, so every reported validation/test metric was
# actually measured on the training data.
train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)  # sampled in random order

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)  # sampled in a fixed order

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

(tensor([   0,  495, 1889,  ...,    1,    1,    1], device='mps:0'), tensor([1, 1, 1,  ..., 0, 0, 0], device='mps:0'), tensor([    0, 35891,   161,    56,  5616, 10405,    19,   140,    23,  5490,
         3564,     2,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
       device='mps:0'))


In [5]:
"""BART 모델 선언"""

from torch import optim
from transformers import BartForConditionalGeneration #조건부 생성 클래스

# bart-base (6 layers) is used to keep training fast; bart-large has 12 layers.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model = model.to(device)
# The optimizer is bound to the parameters of this specific model instance.
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5, eps=1e-8)

In [6]:
"""BART 모델 구조 출력"""


from transformers import BartForConditionalGeneration

# Inspect the model that was already loaded with from_pretrained (and handed to the
# optimizer) in the previous cell. Loading a second copy and rebinding `model` here
# would leave `optimizer` tracking the parameters of the discarded instance, so
# optimizer.step() would never update the model that is trained and evaluated below.
for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└",sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └",sssub_name)

# The `shared` layer is the embedding layer shared by the encoder and the decoder;
# sharing it strengthens the link between the two.
# The output of the last encoder layer takes part in attention with every decoder layer.
# The output of the last decoder layer goes through a fully connected layer whose
# output size is the vocabulary size, which forms the language model head.

model
└ shared
└ encoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
└ decoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
lm_head


In [7]:
"""BART 모델 학습 및 검증"""

import numpy as np
import evaluate

def calc_rouge(preds, labels):
    """Compute the ROUGE-2 value for one batch.

    Relies on the module-level `tokenizer` and `rouge_score` objects.

    Args:
        preds: array of scores whose last axis is reduced with argmax to get
            predicted token ids.
        labels: array of label ids in which -100 marks positions to ignore.

    Returns:
        The "rouge2" entry of the result returned by `rouge_score.compute`.
    """
    predicted_ids = preds.argmax(axis=-1)
    # -100 is not a real token id, so swap it for the pad id before decoding.
    reference_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)

    scores = rouge_score.compute(
        predictions=tokenizer.batch_decode(predicted_ids, skip_special_tokens=True),
        references=tokenizer.batch_decode(reference_ids, skip_special_tokens=True),
    )
    return scores["rouge2"]


def train(model, optimizer, dataloader):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    NOTE(review): this definition rebinds the name `train`, which earlier cells use
    for the training DataFrame; re-running those cells after this one will not see
    the DataFrame any more.

    Args:
        model: callable accepting `input_ids`, `attention_mask` and `labels`
            keywords and returning an object with a `loss` attribute.
        optimizer: optimizer stepping the model's parameters.
        dataloader: sized iterable of (input_ids, attention_mask, labels) batches.

    Returns:
        Sum of the per-batch loss values divided by `len(dataloader)`.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        input_ids, attention_mask, labels = batch

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        ).loss
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(dataloader)

def evaluation(model, dataloader):
    """Evaluate `model` on `dataloader` without tracking gradients.

    The model is called with the labels supplied, and ROUGE-2 is computed by
    `calc_rouge` from the argmax of the resulting logits against those labels.

    Args:
        model: callable accepting `input_ids`, `attention_mask` and `labels`
            keywords and returning an object with `loss` and `logits`.
        dataloader: sized iterable of (input_ids, attention_mask, labels) batches.

    Returns:
        Tuple `(mean_loss, mean_rouge2)`, each averaged over the number of batches.
        The loss is a plain Python float.
    """
    model.eval()
    val_loss, val_rouge = 0.0, 0.0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            # calc_rouge works on NumPy arrays, so move everything to the CPU first.
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()

            # .item() converts the 0-dim loss tensor to a float, matching train(),
            # so the running total is not kept as a tensor on the device.
            val_loss += outputs.loss.item()
            val_rouge += calc_rouge(logits, label_ids)

    val_loss = val_loss / len(dataloader)
    val_rouge = val_rouge / len(dataloader)
    return val_loss, val_rouge

import os

rouge_score = evaluate.load("rouge", tokenizer=tokenizer)

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first save.
os.makedirs("models", exist_ok=True)

# Track the lowest validation loss seen so far; only those weights are saved.
best_loss = float("inf")
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    # The second value returned by evaluation() is a ROUGE-2 score, not an accuracy.
    val_loss, val_rouge = evaluation(model, valid_dataloader)
    print(f"Epoch:{epoch+1}, Train loss:{train_loss: .4f}, Val loss : {val_loss:.4f}, Val ROUGE-2: {val_rouge:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "models/BartForConditionalGeneration.pt")
        print("Saved the model weights")

In [None]:
"""BART 모델 평가"""

# Rebuild the architecture from the same checkpoint id used for training
# ("facebook/bart-base"; the previous "facebook.bart-base" is not a valid id),
# then restore the weights saved during training.
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
).to(device)
# map_location places the stored tensors on the device in use, whichever device
# they were saved from.
model.load_state_dict(
    torch.load("models/BartForConditionalGeneration.pt", map_location=device)
)

test_loss,test_rouge_score = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss : .4f}")
print(f"Test ROUGE-2 Score : {test_rouge_score : .4f}")

In [None]:
"""문장 요약문 비교"""
from transformers import pipeline

# `pipeline` is the factory function that builds a task-specific pipeline;
# `Pipeline` is the abstract base class and cannot be instantiated directly.
summarizer = pipeline(
    task = "summarization",
    model = model,
    tokenizer = tokenizer,
    max_length = 54,
    device = "cpu"
)

# Compare the reference summary with the generated one for the first five
# test articles.
for index in range(5):
    news_text = test.text.iloc[index]
    summarization = test.prediction.iloc[index]
    prediction_summarization = summarizer(news_text)[0]["summary_text"]
    print(f"정답 요약문 : {summarization}")
    print(f"모델 요약문 : {prediction_summarization}\n")