
# 🇮🇩 IndoBERT Extractive Summarization (Simple Fine-Tuning)

Model base: **`indobenchmark/indobert-base-p2`** (BERT encoder).  
Kita akan fine-tune **klasifikasi biner per-kalimat**: `1 = penting`, `0 = tidak penting`, lalu menyusun ringkasan dengan memilih kalimat yang diprediksi penting.

> Cocok bila kamu sudah punya dataset `Benchmark.csv` dengan kolom:
> - **text**: dokumen penuh
> - **summary**: ringkasan referensi (untuk weak/heuristic labeling & evaluasi)


In [None]:

# If running on Colab, uncomment:
# !pip -q install -U transformers datasets evaluate scikit-learn pandas numpy sacrebleu rouge-score nltk
# (Optional) Faster tokenization:
# !pip -q install -U tokenizers

import os, random, math, re, json, collections
import numpy as np
import pandas as pd

import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, Trainer, TrainingArguments, set_seed)

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import evaluate  # for ROUGE & BLEU
# import nltk; nltk.download('punkt')  # if you prefer nltk sentence tokenizer

# Report the installed PyTorch version so runs can be reproduced.
print('torch', torch.__version__)


In [None]:

# Hugging Face model id of the pretrained encoder that gets fine-tuned.
MODEL_NAME = "indobenchmark/indobert-base-p2"
CSV_PATH   = "Benchmark.csv"   # <-- file path; the CSV must contain the columns: text, summary

# Tokenization / batching
# MAX_LEN is the tokenizer's max_length; longer sentences are truncated.
MAX_LEN    = 128
# Per-device batch sizes handed to TrainingArguments.
BATCH_TRAIN = 16
BATCH_EVAL  = 32
# Optimisation hyper-parameters handed to TrainingArguments.
EPOCHS      = 3
LR          = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

# Sentence selection
THRESH_KEEP    = 0.5   # threshold probability for label=1
MAX_SUM_SENT   = 5     # maximum number of sentences kept at inference time

# Seed used for set_seed() and for the train/validation split.
SEED = 42
set_seed(SEED)
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: shows the chosen device as the notebook cell output.
device


In [None]:

# Split on whitespace that directly follows sentence-ending punctuation.
SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

def split_sentences(text: str):
    """Split *text* into sentences with a simple punctuation-based regex.

    The input is coerced with ``str()`` first. Fragments of three words or
    fewer are discarded; the remaining fragments are returned stripped of
    surrounding whitespace, in their original order.
    """
    sentences = []
    for fragment in SENT_SPLIT_RE.split(str(text)):
        # Too short to be a meaningful sentence (also drops empty fragments).
        if len(fragment.split()) <= 3:
            continue
        sentences.append(fragment.strip())
    return sentences

def normalize_for_overlap(s: str):
    """Lower-case *s* and reduce it to space-separated ASCII alphanumeric tokens.

    Every run of characters outside ``0-9a-zA-Z`` becomes a single space, and
    leading/trailing whitespace is removed.
    """
    lowered = s.lower()
    spaced = re.sub(r'[^0-9a-zA-Z]+', ' ', lowered)
    return spaced.strip()

def compute_overlap_ratio(sent: str, summary: str) -> float:
    """Return the fraction of *sent*'s unique tokens that also occur in *summary*.

    Both strings are normalised with ``normalize_for_overlap`` and compared as
    sets of tokens. The result is ``0.0`` when either side has no tokens.
    """
    sent_tokens = set(normalize_for_overlap(sent).split())
    summary_tokens = set(normalize_for_overlap(summary).split())
    if sent_tokens and summary_tokens:
        shared = sent_tokens.intersection(summary_tokens)
        # The denominator is the sentence's own vocabulary size.
        return len(shared) / max(1, len(sent_tokens))
    return 0.0

def weak_labels_from_summary(text: str, summary: str, thr: float = 0.30):
    """Derive heuristic per-sentence labels for *text* from a reference *summary*.

    The text is split with ``split_sentences``; a sentence is labelled ``1``
    when its overlap ratio with the summary is strictly greater than *thr*,
    otherwise ``0``.

    Returns:
        A ``(sentences, labels)`` pair of equal-length lists.
    """
    sents = split_sentences(text)
    labels = [int(compute_overlap_ratio(sentence, summary) > thr) for sentence in sents]
    return sents, labels


In [None]:

# Load the benchmark file and check that it has the columns the rest of the
# notebook relies on.
df = pd.read_csv(CSV_PATH)

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a malformed CSV through silently.
missing_cols = {'text', 'summary'} - set(df.columns)
if missing_cols:
    raise ValueError(
        f"CSV must contain 'text' and 'summary' columns (missing: {sorted(missing_cols)})"
    )
print(df.shape, df.columns.tolist())
# Bare expression: previews the first two rows as the notebook cell output.
df.head(2)


In [None]:

# Expand every document into one training row per sentence, labelled with the
# weak (overlap-based) heuristic.
rows = []
for i, row in df.iterrows():
    doc_text = str(row['text'])
    doc_summary = str(row['summary'])
    doc_sents, doc_labels = weak_labels_from_summary(doc_text, doc_summary, thr=0.30)
    rows.extend(
        {'doc_id': i, 'sentence': sentence, 'label': int(label)}
        for sentence, label in zip(doc_sents, doc_labels)
    )

sent_df = pd.DataFrame(rows)
print("Expanded sentence-level samples:", sent_df.shape)
# Bare expression: shows the class balance as a one-row table.
label_ratio = sent_df['label'].value_counts(normalize=True).rename('ratio')
label_ratio.to_frame().T


In [None]:

# Hold out 20% of the sentence rows for validation, stratified on the label.
# NOTE(review): the split is per sentence row, not per doc_id, so sentences
# from one document can land in both train and validation.
train_df, val_df = train_test_split(
    sent_df,
    test_size=0.2,
    stratify=sent_df['label'],
    random_state=SEED,
)
len(train_df), len(val_df)


In [None]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_batch(batch):
    """Tokenize the ``sentence`` column of a batch, truncating to MAX_LEN.

    No padding is applied here; padding is left to the data collator.
    """
    return tokenizer(batch["sentence"], max_length=MAX_LEN, truncation=True, padding=False)

# Convert each pandas split to a Dataset and tokenize it, dropping the raw
# columns that are not needed after tokenization.
ds_train, ds_val = (
    Dataset.from_pandas(frame, preserve_index=False).map(
        tokenize_batch,
        batched=True,
        remove_columns=['doc_id', 'sentence'],
    )
    for frame in (train_df, val_df)
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
ds_train, ds_val


In [None]:

# Load the pretrained encoder with a 2-class classification head and move it
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
model = model.to(device)


In [None]:

def compute_metrics(eval_pred):
    """Compute accuracy and binary precision/recall/F1 from ``(logits, labels)``.

    Predictions are the argmax over the last axis of the logits. Undefined
    precision/recall values are reported as 0 (``zero_division=0``).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average='binary', zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [None]:

import inspect

# NOTE: run_name is defined here but is not passed to TrainingArguments below.
run_name = "indobert-extractive-sum"

# Recent transformers releases renamed TrainingArguments' `evaluation_strategy`
# to `eval_strategy` and Trainer's `tokenizer` to `processing_class`. Pick the
# keyword the installed version actually accepts so the cell runs on both old
# and new releases.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kwarg = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="./outputs",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=50,
    # Reload the checkpoint with the best validation F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    gradient_accumulation_steps=1,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    **{_eval_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()


In [None]:

# Run evaluation on the Trainer's eval dataset (ds_val) and keep the metrics.
metrics = trainer.evaluate()
# Bare expression: shows the metrics dict as the notebook cell output.
metrics


In [None]:

# Load the summary-level text metrics from the `evaluate` library.
rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")

def predict_sentence_probs(sentences, model, tokenizer, batch_size=32):
    """Return P(label=1) for every sentence as a 1-D float32 NumPy array.

    Args:
        sentences: Iterable of sentence strings to score.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching *model*.
        batch_size: Number of sentences encoded per forward pass. Scoring in
            chunks keeps memory bounded for long documents.

    Returns:
        Array of shape ``(len(sentences),)``; an empty array for empty input.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to score; avoid calling the tokenizer with an empty batch.
        return np.empty(0, dtype=np.float32)

    batch_size = max(1, int(batch_size))
    # Send inputs to wherever the model's weights live rather than relying on
    # a module-level device variable.
    target_device = next(model.parameters()).device

    # Inference must run with dropout disabled; restore the previous mode
    # afterwards so calling this mid-training has no lasting side effect.
    was_training = model.training
    model.eval()
    chunks = []
    try:
        with torch.no_grad():
            for start in range(0, len(sentences), batch_size):
                batch = sentences[start:start + batch_size]
                enc = tokenizer(
                    batch,
                    truncation=True,
                    max_length=MAX_LEN,
                    return_tensors="pt",
                    padding=True,
                ).to(target_device)
                logits = model(**enc).logits
                # Column 1 of the softmax is prob(label=1).
                chunks.append(torch.softmax(logits.float(), dim=-1)[:, 1].cpu())
    finally:
        if was_training:
            model.train()
    return torch.cat(chunks).numpy()

def summarize_extractively(text, model, tokenizer, topk=MAX_SUM_SENT, threshold=THRESH_KEEP):
    """Build an extractive summary of *text* from its highest-scoring sentences.

    Sentences scoring at least *threshold* are candidates. If none qualify,
    the highest-scoring sentences are used instead. At most *topk* candidates
    (best first) are kept and joined with spaces in original document order.
    Returns ``""`` when the text yields no sentences.
    """
    sents = split_sentences(text)
    if len(sents) == 0:
        return ""

    probs = predict_sentence_probs(sents, model, tokenizer)
    ranked = np.argsort(-probs)  # indices from most to least probable
    candidates = ranked[probs[ranked] >= threshold]
    if candidates.size == 0:
        # Nothing cleared the threshold: fall back to the top-ranked sentences.
        candidates = ranked[:min(topk, len(sents))]

    # Restore document order for the selected sentences.
    chosen = sorted(candidates[:topk])
    return " ".join(sents[pos] for pos in chosen)

# Quick ROUGE/BLEU on a small subset (first 20 rows at most) of the documents.
subset_n = min(20, len(df))
# head() selects by position, so this works whatever index labels df carries.
eval_rows = df.head(subset_n)

preds = [
    summarize_extractively(text, model, tokenizer, topk=MAX_SUM_SENT, threshold=THRESH_KEEP)
    for text in eval_rows['text']
]
# Cast references to str, as is done when building the training rows, so a
# missing (NaN) summary does not crash the metric computation.
refs = [str(summary) for summary in eval_rows['summary']]

# NOTE(review): use_stemmer presumably applies an English stemmer; confirm it
# is appropriate for Indonesian text.
rouge_res = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
# sacrebleu expects a list of reference lists, one per prediction.
bleu_res  = bleu.compute(predictions=preds, references=[[r] for r in refs])

print("ROUGE:", {k: round(v, 4) for k, v in rouge_res.items()})
print("BLEU :", round(bleu_res['score'], 4))


In [None]:

# Show one document next to its reference summary and the model's summary.
i_demo = 0
demo_text = df.loc[i_demo, 'text']
demo_reference = df.loc[i_demo, 'summary']
demo_generated = summarize_extractively(demo_text, model, tokenizer)

print("Original text (first 300 chars)\n---")
print(str(demo_text)[:300], "...\n")
print("Reference summary\n---")
print(demo_reference, "\n")
print("Generated (extractive)\n---")
print(demo_generated)


In [None]:

# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory.
SAVE_DIR = "./indobert-extractive-sum-model"
os.makedirs(SAVE_DIR, exist_ok=True)
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Saved to", SAVE_DIR)

# Usage example (reload from SAVE_DIR and summarise a new text):
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# tok = AutoTokenizer.from_pretrained(SAVE_DIR)
# mdl = AutoModelForSequenceClassification.from_pretrained(SAVE_DIR).to(device)
# summarize_extractively(text, mdl, tok)
