# Indonesian News Text Summarization Using BART Model

In [1]:
import numpy as np
import pandas as pd
import nltk 
import torch 
import transformers
import datasets
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Release GPU memory cached by PyTorch's allocator before the tokenizer and model are loaded.
torch.cuda.empty_cache()

In [3]:
# Sanity check that a CUDA device is visible; the training arguments below enable fp16.
torch.cuda.is_available()

True

In [4]:
# Token length every article is padded/truncated to by preprocess_data.
max_input = 512
# Token length every reference summary is padded/truncated to by preprocess_data.
max_target = 128
# NOTE(review): not referenced by any other visible cell; the training
# arguments hard-code a per-device batch size of 1. Confirm whether this is still needed.
batch_size = 3

### I. Data Preparation

The dataset used for this project is the IndoSum Dataset.

In [5]:
import json
import os
import glob
from datasets import Dataset

In [6]:
def load_data(file_paths):
    """Read JSONL files into parallel lists of article and summary strings.

    Every non-blank line must be a JSON object with two keys:

    - ``"paragraphs"``: a list of paragraphs, each a list of sentences,
      each a list of token strings.
    - ``"summary"``: a list of sentences, each a list of token strings.

    Tokens are joined with single spaces, so the returned strings keep the
    dataset's tokenization (punctuation stays space-separated).

    Args:
        file_paths: Iterable of paths to JSONL files, read in the given order.

    Returns:
        A tuple ``(articles, summaries)`` of two lists of ``str`` aligned by
        index. Both are empty when ``file_paths`` is empty.

    Raises:
        FileNotFoundError: If any path does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"paragraphs"`` or ``"summary"``.
    """
    articles = []
    summaries = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        with open(file_path, "r", encoding = "utf-8") as f:
            for line in f:
                # Blank or whitespace-only lines (e.g. stray trailing newlines)
                # are not records; json.loads would raise on them.
                if not line.strip():
                    continue

                data = json.loads(line)
                # Flatten paragraphs -> sentences -> tokens into one string.
                article_text = " ".join(
                    " ".join(sentence)
                    for paragraph in data["paragraphs"]
                    for sentence in paragraph
                )
                summary_text = " ".join(" ".join(sentence) for sentence in data["summary"])

                articles.append(article_text)
                summaries.append(summary_text)
    return articles, summaries

In [8]:
base_path = "../data/indosum/indosum"

# Each pattern matches the files numbered 01-05 of one split; sorting keeps the
# load order (and therefore the slices taken below) deterministic.
train_files = sorted(glob.glob(os.path.join(base_path, "train.0[1-5].jsonl")))
dev_files = sorted(glob.glob(os.path.join(base_path, "dev.0[1-5].jsonl")))
test_files = sorted(glob.glob(os.path.join(base_path, "test.0[1-5].jsonl")))

# glob returns an empty list when nothing matches, which would otherwise
# produce silently empty datasets; fail fast and name the missing split.
for split_name, split_files in (("train", train_files), ("dev", dev_files), ("test", test_files)):
    if not split_files:
        raise FileNotFoundError(
            f"No {split_name}.0[1-5].jsonl files found under {base_path!r}"
        )

train_articles, train_summaries = load_data(train_files)
dev_articles, dev_summaries = load_data(dev_files)
test_articles, test_summaries = load_data(test_files)

# Small subsets keep fine-tuning cheap: 400 training, 50 validation and 50 test examples.
# NOTE(review): the validation and test slices start at offsets 400 and 450 of
# their own (separate) lists rather than at 0 — confirm these offsets are intended.
train_dataset = Dataset.from_dict({"document": train_articles[:400], "summary": train_summaries[:400]})
val_dataset = Dataset.from_dict({"document": dev_articles[400:450], "summary": dev_summaries[400:450]})
test_dataset = Dataset.from_dict({"document": test_articles[450:500], "summary": test_summaries[450:500]})

print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['document', 'summary'],
    num_rows: 400
})
Dataset({
    features: ['document', 'summary'],
    num_rows: 50
})
Dataset({
    features: ['document', 'summary'],
    num_rows: 50
})


### II. Data Preprocessing

The preprocessing technique used for this project is the BART tokenizer. The BART tokenizer is a subword tokenizer used with the BART (Bidirectional and Auto-Regressive Transformer) model. It uses byte-level Byte-Pair Encoding (BPE), the same scheme as GPT-2 and RoBERTa, rather than SentencePiece. The tokenizer works as follows:
- Step 1: Preprocessing
    - The input text is not lowercased: the tokenizer is case-sensitive and operates on the UTF-8 bytes of the text, so any Unicode character, punctuation mark, or spacing can be encoded.
    - It can process unseen words using subword tokenization.
- Step 2: Tokenization (Subword Splitting)
    - The tokenizer breaks words into subwords using Byte-Pair Encoding (BPE).
    - Common words remain whole ("hello" → ["hello"]), while rare words split into subwords ("unhappiness" → ["un", "happiness"]).
- Step 3: Convert Tokens to IDs
    - Each token (or subword) is mapped to a unique integer ID from the vocabulary.
    - Example:
        - "Hello world"
        - tensor([[    0,  31414,   232,     2]])
- Step 4: Special Tokens
    - BART uses special tokens for sequence modeling:
        - ```<s>``` (Start of sentence)
        - ```</s>``` (End of sentence)
        - ```<mask>``` (Masked token for denoising pretraining)
        - ```<pad>``` (Padding token for batching)
- Step 5: Decoding (Reverse Tokenization)
    - The model generates output as token IDs, which the tokenizer converts back to human-readable text.
    - Example:
        - tensor([[    0,  31414,   232,     2]])
        - "Hello world"

In [9]:
from transformers import AutoTokenizer

In [10]:
# Load the tokenizer of the facebook/bart-large-cnn checkpoint, the same
# checkpoint the model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [11]:
def preprocess_data(data_to_process):
  """Tokenize a batch of articles and reference summaries for seq2seq training.

  Intended for ``Dataset.map(..., batched = True)``.

  Args:
    data_to_process: Batch mapping with a ``"document"`` list (source
      articles) and a ``"summary"`` list (reference summaries).

  Returns:
    The tokenizer output for the articles (padded/truncated to ``max_input``
    tokens) with an added ``"labels"`` entry holding the summary token ids
    (padded/truncated to ``max_target`` tokens).
  """
  model_inputs = tokenizer(
    list(data_to_process["document"]),
    max_length = max_input,
    padding = "max_length",
    truncation = True,
  )

  # `text_target` replaces the deprecated `as_target_tokenizer()` context
  # manager. It is a separate call because the target length limit differs
  # from the input limit.
  targets = tokenizer(
    text_target = data_to_process["summary"],
    max_length = max_target,
    padding = "max_length",
    truncation = True,
  )

  # NOTE(review): labels are padded with the tokenizer's pad token id rather
  # than -100, so padded positions are presumably included in the loss.
  # Confirm whether that is intended before masking them.
  model_inputs["labels"] = targets["input_ids"]

  return model_inputs

In [12]:
# Tokenize all three splits with the same batched preprocessing function,
# in train -> validation -> test order.
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess_data, batched = True)
    for split in (train_dataset, val_dataset, test_dataset)
]

Map: 100%|██████████| 400/400 [00:00<00:00, 2719.10 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3008.48 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 1806.41 examples/s]


### III. Modeling and Evaluation

The model used for this project is BART, a transformer model introduced by Facebook AI that combines a bidirectional encoder with an autoregressive decoder. This encoder-decoder architecture is well suited to sequence-to-sequence tasks such as summarization. The bidirectional encoder lets the model capture contextual information and represent the input text using both directions, while the autoregressive decoder lets it generate coherent and contextually rich abstractive summaries.

![](https://production-media.paperswithcode.com/methods/Screen_Shot_2020-06-01_at_9.49.47_PM.png)

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [14]:
# ROUGE scorer from the `evaluate` library; used by compute_metrics during evaluation.
metric = evaluate.load("rouge")

In [15]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores (scaled to 0-100) for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as passed
            by the trainer when ``predict_with_generate`` is enabled.

    Returns:
        Dict mapping each ROUGE metric name to its score multiplied by 100.
    """
    predictions, labels = eval_pred

    # Some models return a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used for padded label/prediction positions and
    # is not a valid token id; map it to the pad token so decoding cannot fail.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    # NOTE(review): `use_stemmer` presumably applies an English stemmer —
    # confirm it is appropriate for Indonesian text.
    result = metric.compute(predictions = decoded_preds, references = decoded_labels, use_stemmer = True)

    # Report percentages rather than fractions.
    return {key: value * 100 for key, value in result.items()}

In [16]:
# Load the pretrained seq2seq checkpoint to fine-tune; it is the same
# checkpoint name the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [17]:
# Fine-tuning configuration; checkpoints are written to ../models/bart-v2.
training_args = Seq2SeqTrainingArguments(
    output_dir = "../models/bart-v2",
    # --- schedule and optimisation ---
    num_train_epochs = 3,
    learning_rate = 2e-5,
    warmup_steps = 500,
    weight_decay = 0.01,
    # --- batching (one example per step, no gradient accumulation) ---
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 1,
    eval_accumulation_steps = 1,
    # --- evaluation, logging and checkpointing cadence (in optimizer steps) ---
    evaluation_strategy = "steps",
    eval_steps = 100,
    save_steps = 100,
    logging_steps = 10,
    save_total_limit = 2,
    # --- generation-based evaluation and mixed precision ---
    predict_with_generate = True,
    fp16 = True,
)



In [18]:
# Collator handed to the trainer to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

In [19]:
# Wire the model, arguments, data splits, collator and ROUGE callback together.
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

  trainer = Seq2SeqTrainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [20]:
# Fine-tune the model; validation loss and ROUGE are reported every
# `eval_steps` steps as configured in training_args.
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,1.2538,0.593706,69.50626,62.686629,67.468269,67.220292
200,0.6766,0.539372,69.414499,63.093253,67.636876,67.479265
300,0.55,0.562498,67.590426,61.283295,65.789854,65.56398
400,0.4559,0.527855,68.851669,62.566604,67.281835,66.957766
500,0.39,0.553991,69.488487,61.867581,67.478255,67.27757
600,0.3053,0.526198,72.29663,64.842544,70.355759,70.082403
700,0.4977,0.515091,72.490772,65.555322,70.832354,70.595559
800,0.5858,0.528015,72.856121,65.351684,70.887255,70.695226
900,0.3895,0.515244,73.660304,65.727714,71.743835,71.512206
1000,0.286,0.518887,73.477913,65.942303,71.620595,71.370887




TrainOutput(global_step=1200, training_loss=0.45249420702457427, metrics={'train_runtime': 2220.2933, 'train_samples_per_second': 0.54, 'train_steps_per_second': 0.54, 'total_flos': 1300262761267200.0, 'train_loss': 0.45249420702457427, 'epoch': 3.0})

In [21]:
# Score the fine-tuned model on the held-out test split (loss plus the ROUGE
# values from compute_metrics).
test_results = trainer.evaluate(test_dataset)

In [22]:
# Show the test metrics; ROUGE values are on a 0-100 scale because
# compute_metrics multiplies them by 100.
print(test_results)

{'eval_loss': 0.39869368076324463, 'eval_rouge1': 71.62222408857505, 'eval_rouge2': 64.4430412437017, 'eval_rougeL': 69.19834755752343, 'eval_rougeLsum': 68.7715976250652, 'eval_runtime': 108.6679, 'eval_samples_per_second': 0.46, 'eval_steps_per_second': 0.46, 'epoch': 3.0}


In [23]:
# Persist the fine-tuned model and the tokenizer files into the same directory.
trainer.save_model("../models/bart-v2")
tokenizer.save_pretrained("../models/bart-v2")

('../models/bart-v2\\tokenizer_config.json',
 '../models/bart-v2\\special_tokens_map.json',
 '../models/bart-v2\\vocab.json',
 '../models/bart-v2\\merges.txt',
 '../models/bart-v2\\added_tokens.json',
 '../models/bart-v2\\tokenizer.json')

In [None]:
# Inspect one training example: the raw article text followed by its reference summary.
sample = train_dataset[4]

# Source article (space-joined tokens, as produced by load_data).
print(sample["document"])

# Reference summary for the same article.
print(sample["summary"])

Merdeka.com - Indonesia Corruption Watch ( ICW ) meminta Komisi Pemberantas   Korupsi ( KPK ) ikut memantau perkembangan atas meninggalnya saksi kunci kasus mega korupsi e - KTP , Johannes Marliem . Peneliti ICW Divisi Hukum dan Monitoring Peradila , Aradila Caesar mengatakan momentum meninggalnya saksi kunci tersebut menimbulkan kejanggalan dan tanda tanya besar . " Orang meninggal kita kan tidak bisa prediksi itu bukan kuasa kita . Tapi kalau kita melihat momentum kan ada suatu kejanggalan . Kenapa momentum meninggalnya , saat kasus e - ktp sedang ditangani oleh KPK , " katanya seusai konferensi pers di Kantor Sekeretariatan ICW ,   Jakarta , Minggu(13 / 8 ) . Pihak ICW meminta KPK turut menyelidiki kematian saksi kunci ini dan menjelaskan kepada masyarakat apakah ada keterkaitan dengan permasalahan korupsi e - KTP atau hal-hal lain dibalik kematian Johannes . " Kita minta KPK dan juga bekerja sama dengan pihak otoritas untuk menyelidiki kematian dari saksi kunci tersebut dengan seri