<a href="https://colab.research.google.com/github/zahraniayudyaa/finnalterm-dl/blob/main/03_XSum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FINE-TUNING HUGGINGFACE MODELS (XSum)**

## **1. Setup dan Instalasi**

In [None]:
# 1. Setup dan Instalasi
!pip install transformers datasets torch peft bitsandbytes accelerate trl scipy rouge-score -q

import gc
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training
)
from rouge_score import rouge_scorer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    TrainingArguments
)
from trl import SFTTrainer

warnings.filterwarnings('ignore')

## **2. Load Dataset**

In [None]:
# 2. Load the XSum dataset
print("Loading XSum dataset...")
dataset = load_dataset("EdinburghNLP/xsum")

# Show the dataset object itself, then one size line per split.
print("\nDataset structure:")
print(dataset)
for split_title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_title} samples: {len(dataset[split_key])}")

# 3. Inspect the first training record
print("\nSample from training set:")
sample = dataset['train'][0]
sample_document, sample_summary = sample['document'], sample['summary']
print(f"Document (first 200 chars): {sample_document[:200]}...")
print(f"Summary: {sample_summary}")
print(f"ID: {sample['id']}")

# Word counts are whitespace-delimited, not token counts.
print(f"\nDocument length: {len(sample_document.split())} words")
print(f"Summary length: {len(sample_summary.split())} words")

# 4. 4-bit quantization settings, used when loading the model to save memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

## **3. Load Model**

In [None]:
# 5. Load model and tokenizer - Phi-2, with OPT-1.3B as a fallback
MODEL_NAME = "microsoft/phi-2"

print(f"\nLoading model: {MODEL_NAME}")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float16
    )

    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Fall back to a smaller model if Phi-2 cannot be loaded
    MODEL_NAME = "facebook/opt-1.3b"
    print(f"Falling back to: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

# Reuse EOS as the padding token only when the tokenizer has none of its own.
# Both branches now follow the same rule: a tokenizer that already defines a
# pad token keeps it, so padding stays distinguishable from real EOS tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model config in sync with the tokenizer's padding id
model.config.pad_token_id = tokenizer.pad_token_id

# 6. Prepare the quantized model for parameter-efficient fine-tuning (PEFT)
model = prepare_model_for_kbit_training(model)

# 7. LoRA configuration
# NOTE(review): the module names below are matched against the loaded model;
# "dense" presumably refers to Phi-2's attention output projection -- confirm
# the equivalent layer is covered when the OPT fallback is in use.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,             # LoRA rank
    lora_alpha=32,    # LoRA scaling factor
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
)

# Wrap the base model with the LoRA adapters and report how much is trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 8. Preprocessing function for summarization
def preprocess_xsum(examples, max_length=1024):
    """Tokenize a batch of XSum records into causal-LM training examples.

    Every record is rendered with the instruction layout used throughout this
    notebook: a "### Document:" header followed by the article, a blank line,
    then a "### Summary:" header followed by the reference summary.

    The document part and the summary part are tokenized separately so that,
    when the sequence is too long, only the document is shortened. Truncating
    the concatenated prompt from the right would remove the summary -- the very
    text the model is supposed to learn -- for long articles. An EOS token is
    appended after the summary (when the tokenizer defines one) to mark where
    the summary ends.

    Args:
        examples: Batch mapping with parallel 'document' and 'summary' lists.
        max_length: Maximum number of tokens per example, end marker included.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' lists (unpadded).
        Labels are a copy of the input ids, as usual for causal LM training.
    """
    eos_id = tokenizer.eos_token_id

    sources = [f"### Document:\n{doc}" for doc in examples['document']]
    targets = [f"\n\n### Summary:\n{summary}" for summary in examples['summary']]

    # The source side keeps the tokenizer's default special tokens (if any);
    # the target side is a continuation and must not get a second set.
    source_batch = tokenizer(sources, truncation=True, max_length=max_length)["input_ids"]
    target_batch = tokenizer(targets, add_special_tokens=False)["input_ids"]

    input_ids = []
    for source_ids, target_ids in zip(source_batch, target_batch):
        target_ids = list(target_ids)
        if eos_id is not None:
            target_ids.append(eos_id)
        # Guard for a pathological summary that alone exceeds the budget
        target_ids = target_ids[:max_length]

        # Whatever room is left after the target goes to the document
        source_ids = list(source_ids)[:max_length - len(target_ids)]
        input_ids.append(source_ids + target_ids)

    return {
        "input_ids": input_ids,
        "attention_mask": [[1] * len(ids) for ids in input_ids],
        # For causal LM the labels equal the input ids
        "labels": [list(ids) for ids in input_ids],
    }

## **4. Preprocessing Data**

In [None]:
# 9. Process dataset
print("\nPreprocessing dataset...")
# Small subsets keep the demo fast; use the full splits for real training
train_subset = dataset['train'].select(range(2000))
val_subset = dataset['validation'].select(range(200))

# Drop the raw text columns so only the tokenized fields remain
tokenized_train = train_subset.map(preprocess_xsum, batched=True, remove_columns=train_subset.column_names)
tokenized_val = val_subset.map(preprocess_xsum, batched=True, remove_columns=val_subset.column_names)

print(f"Training samples: {len(tokenized_train)}")
print(f"Validation samples: {len(tokenized_val)}")

# 10. Data collator
# preprocess_xsum emits an unpadded, variable-length "labels" column next to
# "input_ids". DataCollatorForLanguageModeling only pads the tokenizer's own
# fields, so a batch with sequences of different lengths cannot be turned into
# a tensor. DataCollatorForSeq2Seq pads the labels as well, using -100 so the
# padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100
)

## **5. Training Model**

In [None]:
# 11. Training arguments
# NOTE(review): some argument names used here and in SFTTrainer below (e.g.
# `evaluation_strategy`, `tokenizer`, `max_seq_length`) are version dependent
# -- confirm them against the installed transformers / trl releases.
training_args = TrainingArguments(
    output_dir="./results_xsum",
    report_to="none",
    # Schedule and optimizer
    num_train_epochs=2,  # reduced for the demo
    learning_rate=2e-4,
    warmup_steps=50,
    optim="paged_adamw_8bit",  # paged 8-bit AdamW
    # Batching: 2 samples per device x 4 accumulation steps = 8 per update
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Memory savers
    fp16=True,
    gradient_checkpointing=True,
    # Logging, evaluation and checkpoint cadence
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Keep every column of the tokenized datasets
    remove_unused_columns=False,
)

# 12. Initialize the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    max_seq_length=1024,
)

# 13. Train the model
print("\nTraining model... (This may take a while)")
trainer.train()

# 14. Save the model first, then the tokenizer, into the same directory
print("\nSaving model...")
for saver in (trainer.save_model, tokenizer.save_pretrained):
    saver("./saved_model_phi2_xsum")

In [None]:
# 15. Summary generation function
def generate_summary(document, model, tokenizer, max_length=150, temperature=0.7,
                     max_input_tokens=512):
    """Generate a summary for ``document`` with the fine-tuned causal LM.

    The prompt uses the same layout as the training data: a "### Document:"
    header with the article, a blank line, then a "### Summary:" header that
    the model is expected to continue.

    Args:
        document: Article text to summarize.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of NEW tokens to generate for the summary.
            It is forwarded as ``max_new_tokens``; a total-length limit would
            be exhausted by the prompt itself, which is usually far longer
            than the requested summary.
        temperature: Sampling temperature.
        max_input_tokens: Token budget for the whole prompt, cue included.

    Returns:
        The generated summary text, stripped of surrounding whitespace.
    """
    header = "### Document:\n"
    cue = "\n\n### Summary:\n"

    # Tokenize the cue separately and truncate only the document part, so a
    # long article can never push the "### Summary:" cue out of the prompt.
    cue_ids = list(tokenizer(cue, add_special_tokens=False)["input_ids"])
    document_budget = max(max_input_tokens - len(cue_ids), 1)
    head_ids = list(
        tokenizer(header + document, truncation=True, max_length=document_budget)["input_ids"]
    )

    input_ids = torch.tensor([head_ids + cue_ids], dtype=torch.long, device=model.device)
    attention_mask = torch.ones_like(input_ids)
    prompt_len = input_ids.shape[1]

    # Generate the continuation.
    # The former min_length=30 is dropped: it counted prompt tokens too, so it
    # never constrained the summary for any realistic prompt.
    # NOTE(review): no_repeat_ngram_size presumably also applies to n-grams
    # from the prompt for decoder-only models -- confirm this is intended.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens; the prompt is not part of the
    # answer and may itself contain text that looks like a section marker.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # If the model keeps going and opens another "###" section, keep only the
    # text that precedes it.
    return completion.split("###", 1)[0].strip()

In [None]:
# 16. Try the model on a few examples
print("\n" + "=" * 80)
print("Testing Summarization")
print("=" * 80)

# Take the first three records of the test split
test_samples = dataset['test'].select(range(3))

model.eval()
for example_no, sample in enumerate(test_samples, start=1):
    document, true_summary = sample['document'], sample['summary']

    print(f"\nExample {example_no}:")
    print("-" * 60)

    # Document preview: at most 300 characters, with an ellipsis when cut
    print("Document (first 300 chars):")
    if len(document) > 300:
        print(document[:300] + "...")
    else:
        print(document)
    print(f"\nDocument length: {len(document.split())} words")

    # Reference summary
    print("\nTrue Summary:")
    print(true_summary)
    print(f"Summary length: {len(true_summary.split())} words")

    # Model output
    generated_summary = generate_summary(document, model, tokenizer, max_length=100)
    print("\nGenerated Summary:")
    print(generated_summary)
    print(f"Generated length: {len(generated_summary.split())} words")

    print("-" * 60)

# 17. ROUGE evaluation function
def evaluate_rouge_summaries(generated_summaries, reference_summaries):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over paired summaries.

    Args:
        generated_summaries: Iterable of model-produced summaries.
        reference_summaries: Iterable of reference summaries, in the same order.

    Returns:
        dict with keys "rouge1", "rouge2" and "rougeL", each holding the mean
        F-measure as a float. All three are 0.0 when no pairs are given.

    Raises:
        ValueError: If the two inputs contain a different number of summaries.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')

    generated = list(generated_summaries)
    references = list(reference_summaries)

    # zip() would silently drop the unmatched tail and skew the average
    if len(generated) != len(references):
        raise ValueError(
            f"Got {len(generated)} generated summaries but {len(references)} references"
        )

    # Averaging an empty list would produce NaN; report zeros instead
    if not generated:
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    collected = {name: [] for name in metric_names}
    for gen, ref in zip(generated, references):
        # The scorer takes the reference first, then the prediction
        scores = scorer.score(ref, gen)
        for name in metric_names:
            collected[name].append(scores[name].fmeasure)

    return {name: float(np.mean(values)) for name, values in collected.items()}


## **6. Evaluasi**

In [None]:
# 18. Evaluate on the validation set
print("\n" + "=" * 80)
print("ROUGE Evaluation on Validation Set")
print("=" * 80)

# Score only the first ten validation records
eval_samples = dataset['validation'].select(range(10))

references = []
generated = []

print("\nEvaluating summaries...")
for sample_no, sample in enumerate(eval_samples, start=1):
    document, true_summary = sample['document'], sample['summary']

    # Model output for this article
    summary = generate_summary(document, model, tokenizer, max_length=100)

    references.append(true_summary)
    generated.append(summary)

    # Print both texts, shortened to 80 characters plus an ellipsis when longer
    reference_preview = f"{true_summary[:80]}..." if len(true_summary) > 80 else true_summary
    generated_preview = f"{summary[:80]}..." if len(summary) > 80 else summary
    print(f"Sample {sample_no}:")
    print(f"  Reference: {reference_preview}")
    print(f"  Generated: {generated_preview}")

# Aggregate ROUGE over all collected pairs
rouge_scores = evaluate_rouge_summaries(generated, references)

print("\nROUGE Scores:")
for metric_title, metric_key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
    print(f"  {metric_title}: {rouge_scores[metric_key]:.4f}")

# 19. Interactive demo
print("\n" + "=" * 80)
print("Interactive Summarization Demo")
print("=" * 80)

def interactive_demo():
    """Read documents from standard input in a loop and print their summaries.

    The loop ends when the user types 'quit' (case-insensitive, surrounding
    whitespace ignored), when input is exhausted (EOF, e.g. when the notebook
    is executed without an interactive front end), or on a keyboard interrupt.
    Inputs shorter than 50 characters after stripping are rejected.

    Uses the notebook-level ``model``, ``tokenizer`` and ``generate_summary``.
    """
    print("\nEnter a document to summarize (type 'quit' to exit):")
    print("-" * 60)

    while True:
        print("\nEnter your document (or 'quit'):")
        try:
            document = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # No stdin available or the user interrupted: leave the demo
            # cleanly instead of crashing the cell.
            print()
            break

        if document.lower() == 'quit':
            break

        if len(document) < 50:
            print("Document too short. Please enter at least 50 characters.")
            continue

        print("\nGenerating summary...")
        summary = generate_summary(document, model, tokenizer, max_length=100)

        print("\nGenerated Summary:")
        print(summary)

        print(f"\nSummary length: {len(summary.split())} words")
        print("-" * 60)