<a href="https://colab.research.google.com/github/zahraniayudyaa/finnalterm-dl/blob/main/02_SQuAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FINE-TUNING HUGGINGFACE MODELS (SQuAD)**

## **1. Setup dan Instalasi**

In [None]:
# 1. Setup
# Install the libraries this notebook imports below, plus rouge-score
# (presumably the backend needed by evaluate.load("rouge") - TODO confirm).
# NOTE(review): versions are unpinned, so a fresh run picks up the latest releases.
!pip install transformers datasets torch evaluate nltk rouge-score

import torch
import numpy as np
import pandas as pd
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
import evaluate
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK 'punkt_tab' resource (downloads on first run).
# NOTE(review): `pd` and `sent_tokenize` are not referenced in the cells visible here.
nltk.download('punkt_tab')

## **2. Load Dataset**

In [None]:
# 2. Load Dataset - SQuAD v1.1 (the "rajpurkar/squad" Hub id)
print("Loading SQuAD dataset...")
dataset = load_dataset("rajpurkar/squad")

# Overview of the available splits and their sizes
print("\nDataset structure:")
print(dataset)
print("Train samples: {}".format(len(dataset['train'])))
print("Validation samples: {}".format(len(dataset['validation'])))

# 3. Examine data structure: first record of the training split
sample = dataset['train'][0]
print("\nSample from training set:")
print("Context: {}...".format(sample['context'][:200]))
print("Question: {}".format(sample['question']))
print("Answer: {}".format(sample['answers']['text'][0]))
print("Answer start: {}".format(sample['answers']['answer_start'][0]))

# ... and first record of the validation split
sample_val = dataset['validation'][0]
print("\nSample from validation set:")
print("Context: {}...".format(sample_val['context'][:200]))
print("Question: {}".format(sample_val['question']))
print("Answer: {}".format(sample_val['answers']['text'][0]))

## **3. Preprocessing Data**

In [None]:
# 4. Tokenizer for the T5 checkpoint used throughout the notebook
MODEL_NAME = "t5-base"  # checkpoint name specified by the task
tokenizer = T5Tokenizer.from_pretrained(
    MODEL_NAME,
)

def preprocess_squad_for_t5(examples):
    """Turn a batch of SQuAD records into tokenized T5 inputs and labels.

    Args:
        examples: Batched mapping with parallel ``context``, ``question`` and
            ``answers`` lists; each ``answers`` entry is a dict holding a
            ``text`` list.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 384 tokens)
        with an extra ``"labels"`` key: target ids padded/truncated to 128
        tokens, where every padding position is replaced by -100 so the loss
        ignores it.
    """
    # Prompt format: "question: {question} context: {context}"
    inputs = [
        f"question: {question} context: {context}"
        for context, question in zip(examples['context'], examples['question'])
    ]

    # Use the first reference answer; empty string when a record has none
    targets = [
        answers['text'][0] if len(answers['text']) > 0 else ""
        for answers in examples['answers']
    ]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=384,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Padding ids left in the labels would be scored by the loss as if they
    # were real answer tokens; -100 is the index the loss skips.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# 5. Process Dataset
print("\nPreprocessing dataset...")

# Work on a subset for a faster demo run (drop the .select(...) calls to use the full splits)
train_dataset = dataset['train'].select(range(5000))
val_dataset = dataset['validation'].select(range(1000))

# Replace the raw text columns with tokenized model features
tokenized_train = train_dataset.map(
    preprocess_squad_for_t5,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val = val_dataset.map(
    preprocess_squad_for_t5,
    batched=True,
    remove_columns=val_dataset.column_names,
)


## **4. Load Model dan Training**

In [None]:
# 6. Load the pretrained seq2seq model
print("\nLoading model: {}".format(MODEL_NAME))
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# 7. Collator that batches the tokenized features for the trainer
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    padding=True,
)

# 8. Evaluation metrics (ROUGE first, then BLEU)
rouge, bleu = (evaluate.load(metric_name) for metric_name in ("rouge", "bleu"))

def compute_metrics(eval_pred):
    """Compute ROUGE, BLEU, exact match and token-level F1 for generated answers.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element is the token-id array.

    Returns:
        Dict with ``rouge1``, ``rouge2``, ``rougeL``, ``bleu``,
        ``exact_match`` and ``f1``.
    """
    from collections import Counter

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # It can show up in predictions as well as labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE scores
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # BLEU score
    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )

    # Exact Match (whitespace-stripped, case-sensitive)
    exact_matches = sum(
        1 for pred, gold in zip(decoded_preds, decoded_labels)
        if pred.strip() == gold.strip()
    )
    exact_match = exact_matches / len(decoded_preds) if decoded_preds else 0.0

    def compute_f1(pred, gold):
        """Token-overlap F1 on lower-cased, whitespace-split tokens."""
        pred_tokens = pred.lower().split()
        gold_tokens = gold.lower().split()

        # If either side is empty, the score is 1 only when both are empty
        if not pred_tokens or not gold_tokens:
            return float(pred_tokens == gold_tokens)

        # Multiset overlap: a set intersection would under-count repeated
        # tokens while precision/recall still divide by the full lengths.
        overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(pred_tokens)
        recall = overlap / len(gold_tokens)
        return 2 * precision * recall / (precision + recall)

    f1_scores = [compute_f1(pred, gold) for pred, gold in zip(decoded_preds, decoded_labels)]
    avg_f1 = float(np.mean(f1_scores)) if f1_scores else 0.0

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"],
        "exact_match": exact_match,
        "f1": avg_f1
    }

# 9. Training Arguments
# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# above is unpinned, so use whichever name the installed version defines.
import dataclasses

_training_arg_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_squad",
    **{_eval_strategy_arg: "steps"},
    eval_steps=500,
    learning_rate=3e-4,
    per_device_train_batch_size=4,  # Reduced for memory
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,  # evaluate on generated text so compute_metrics can decode it
    fp16=torch.cuda.is_available(),
    save_strategy="steps",
    save_steps=500,  # same interval as eval_steps, so each checkpoint has an evaluation
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    logging_steps=100,
    gradient_accumulation_steps=2  # For effective batch size of 8
)

# 10. Build the trainer from the model, data, collator and metric function
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# 11. Run fine-tuning
print("\nTraining model...")
trainer.train()

## **5. Evaluasi**

In [None]:
# 12. Evaluate on the validation subset and report the float-valued metrics
print("\nEvaluating model...")
eval_results = trainer.evaluate()
print("\nEvaluation results:")
for key in eval_results:
    value = eval_results[key]
    if not isinstance(value, float):
        continue
    print("  {}: {:.4f}".format(key, value))

# 13. Persist the model weights and the tokenizer to the same directory
print("\nSaving model...")
trainer.save_model("./saved_model_t5_squad")
tokenizer.save_pretrained("./saved_model_t5_squad")

# 14. Inference Function
def answer_question(context, question, model, tokenizer, max_answer_length=50, max_input_length=384):
    """Generate an answer to ``question`` from ``context`` with a seq2seq model.

    Args:
        context: Passage text the answer should come from.
        question: Question text.
        model: Model exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer whose output provides ``input_ids`` and
            ``attention_mask`` and which exposes ``decode``.
        max_answer_length: Passed to ``generate`` as ``max_length``.
        max_input_length: Truncation length for the encoded prompt; the
            default matches the length used during preprocessing.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.
    """
    # Same prompt format as used for training
    input_text = f"question: {question} context: {context}"

    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    )

    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    with torch.no_grad():
        # Deterministic beam search. `temperature` is omitted on purpose: it
        # only applies when sampling is enabled, which it is not here.
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_answer_length,
            min_length=1,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 15. Hand-written examples: banner first, then one prediction per example
print("\n" + "=" * 80, "Question Answering Examples", "=" * 80, sep="\n")

test_examples = [
    {
        "context": """
        The University of Cambridge is a public collegiate research university in Cambridge, England.
        Founded in 1209, the University of Cambridge is the world's third-oldest university in continuous operation.
        The university's founding followed the arrival of scholars who left the University of Oxford for Cambridge after a dispute with local townspeople.
        """,
        "question": "When was the University of Cambridge founded?",
        "expected_answer": "1209"
    },
    {
        "context": """
        Machine learning is a field of artificial intelligence that uses statistical techniques to give
        computer systems the ability to learn from data, without being explicitly programmed.
        The name machine learning was coined in 1959 by Arthur Samuel, an American IBMer and pioneer
        in the field of computer gaming and artificial intelligence.
        """,
        "question": "Who coined the term machine learning?",
        "expected_answer": "Arthur Samuel"
    },
    {
        "context": """
        The Great Wall of China is a series of fortifications that were built across the historical
        northern borders of ancient Chinese states and Imperial China as protection against various
        nomadic groups from the Eurasian Steppe. The Great Wall construction started as early as
        the 7th century BC and continued until 1878 in the Qing dynasty.
        """,
        "question": "What was the purpose of the Great Wall of China?",
        "expected_answer": "protection against various nomadic groups from the Eurasian Steppe"
    }
]

# Switch to inference mode before generating
model.eval()
for i, example in enumerate(test_examples, 1):
    # Surrounding whitespace of the triple-quoted passage is trimmed
    context = example["context"].strip()
    question = example["question"]
    expected = example["expected_answer"]

    answer = answer_question(context, question, model, tokenizer)

    print(
        f"\nExample {i}:",
        f"Question: {question}",
        f"Expected Answer: {expected}",
        f"Model Answer: {answer}",
        f"Context (first 150 chars): {context[:150]}...",
        sep="\n",
    )

# 16. Batch Evaluation
def evaluate_on_samples(model, tokenizer, dataset_samples, num_samples=20):
    """Run the model on the first few samples and report exact match and F1.

    Args:
        model: Model passed through to ``answer_question``.
        tokenizer: Tokenizer passed through to ``answer_question``.
        dataset_samples: Indexable collection of records with ``context``,
            ``question`` and ``answers`` (a dict holding a ``text`` list).
        num_samples: Maximum number of leading samples to evaluate.

    Returns:
        List of per-sample dicts with keys ``id``, ``question``,
        ``true_answer``, ``predicted_answer``, ``exact_match`` and ``f1``.
        The list is empty when there is nothing to evaluate.
    """
    from collections import Counter

    def compute_simple_f1(pred, gold):
        """Token-overlap F1 on lower-cased, whitespace-split tokens."""
        pred_tokens = pred.lower().split()
        gold_tokens = gold.lower().split()

        # If either side is empty, the score is 1 only when both are empty
        if not pred_tokens or not gold_tokens:
            return float(pred_tokens == gold_tokens)

        # Multiset overlap so repeated tokens are counted correctly
        overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(pred_tokens)
        recall = overlap / len(gold_tokens)
        return 2 * precision * recall / (precision + recall)

    results = []

    for i in range(min(num_samples, len(dataset_samples))):
        sample = dataset_samples[i]
        context = sample['context']
        question = sample['question']
        # First reference answer; empty string when the record has none
        true_answer = sample['answers']['text'][0] if len(sample['answers']['text']) > 0 else ""

        predicted_answer = answer_question(context, question, model, tokenizer)

        # Case-insensitive, whitespace-stripped exact match
        is_exact_match = predicted_answer.strip().lower() == true_answer.strip().lower()

        results.append({
            "id": i,
            "question": question,
            "true_answer": true_answer,
            "predicted_answer": predicted_answer,
            "exact_match": is_exact_match,
            "f1": compute_simple_f1(predicted_answer, true_answer)
        })

    # Nothing evaluated: skip the aggregates instead of dividing by zero
    if not results:
        print("\nEvaluation on 0 samples:")
        return results

    # Calculate aggregate metrics
    exact_match_rate = sum(r["exact_match"] for r in results) / len(results)
    avg_f1 = np.mean([r["f1"] for r in results])

    print(f"\nEvaluation on {len(results)} samples:")
    print(f"Exact Match Rate: {exact_match_rate:.2%}")
    print(f"Average F1 Score: {avg_f1:.4f}")

    # Show some examples
    print("\nSample predictions:")
    for position, r in enumerate(results[:3], 1):
        print(f"\n{position}. Question: {r['question']}")
        print(f"   True: {r['true_answer']}")
        print(f"   Pred: {r['predicted_answer']}")
        print(f"   Exact Match: {r['exact_match']}, F1: {r['f1']:.4f}")

    return results

# Run the per-sample evaluation on the first 10 validation records.
# NOTE: this rebinds `eval_results` (previously the trainer.evaluate() output)
# to the list returned by evaluate_on_samples.
print("\n" + "=" * 80, "Evaluating on SQuAD validation samples", "=" * 80, sep="\n")
eval_results = evaluate_on_samples(model, tokenizer, dataset['validation'], num_samples=10)