In [None]:
!pip install transformers torch numpy scikit-learn nltk rouge-score datasets sentencepiece tqdm

from datasets import load_dataset
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModel
from rouge_score import rouge_scorer
import numpy as np
import nltk
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Model identifiers and length limits shared by the summarizers below.
T5_MODEL = 't5-base'
BERT_MODEL = 'bert-base-uncased'
MAX_LENGTH = 512
SUMMARY_MAX_LENGTH = 150

# Test split of CNN/DailyMail (config "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")

# BERT encoder and its tokenizer, used by the extractive summarizer.
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
bert_model = AutoModel.from_pretrained(BERT_MODEL)

# T5 sequence-to-sequence model and its tokenizer, used by the abstractive summarizer.
t5_tokenizer = T5Tokenizer.from_pretrained(T5_MODEL)
t5_model = T5ForConditionalGeneration.from_pretrained(T5_MODEL)

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [None]:
# --- TÓM TẮT TRÍCH CHỌN (BERT EXTRACTIVE) ---
def bert_extractive_summarize(article, max_sentences=5):
    """Build an extractive summary by ranking sentences against the whole article.

    The article (truncated to ``MAX_LENGTH`` tokens) and each individual
    sentence are encoded with the module-level ``bert_model``. The vector at
    sequence position 0 of ``last_hidden_state`` is used as the embedding
    (presumably the [CLS] token for the BERT tokenizer in use). Sentences are
    scored by cosine similarity to the article embedding; the highest-scoring
    ones are kept and emitted in their original article order.

    Args:
        article: Raw article text.
        max_sentences: Maximum number of sentences to keep. Values below zero
            are treated as zero.

    Returns:
        The selected sentences joined with single spaces, or "" when the
        article yields no sentences.
    """
    # nltk.data.find signals a missing resource with LookupError. The
    # previously referenced nltk.downloader.DownloadError does not exist, so
    # a missing tokenizer surfaced as AttributeError instead of triggering
    # the download.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    sentences = nltk.sent_tokenize(article)
    if not sentences:
        return ""

    # Document embedding: position-0 vector of the truncated article.
    inputs_doc = bert_tokenizer(article, return_tensors='pt', truncation=True, padding=True, max_length=MAX_LENGTH)
    with torch.no_grad():
        doc_outputs = bert_model(**inputs_doc)
    doc_embedding = doc_outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    doc_embedding = doc_embedding.reshape(1, -1)  # 2-D row, as cosine_similarity expects

    # Sentence embeddings: one forward pass per sentence.
    sent_embeddings = []
    for sentence in sentences:
        inputs_sent = bert_tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            sent_outputs = bert_model(**inputs_sent)
        sent_embedding = sent_outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        sent_embeddings.append(sent_embedding)

    sent_embeddings_array = np.array(sent_embeddings)

    # Importance score of each sentence = cosine similarity to the document.
    scores = cosine_similarity(sent_embeddings_array, doc_embedding).flatten()

    # Take the best-scoring sentences, then restore article order.
    ranked_indices = np.argsort(scores)[::-1]
    # Clamp at 0 so a negative max_sentences cannot turn into a "[:-k]" slice.
    num_to_select = max(0, min(len(sentences), max_sentences))
    selected_indices = sorted(ranked_indices[:num_to_select])

    return " ".join([sentences[i] for i in selected_indices])

# --- TÓM TẮT TRỪU TƯỢNG (T5 ABSTRACTIVE) ---
def t5_abstractive_summarize(article, max_len=SUMMARY_MAX_LENGTH, min_len=40, num_beams=4):
    """Generate an abstractive summary of ``article`` with the module-level T5 model.

    The article is prefixed with ``"summarize: "``, tokenized (truncated to
    ``MAX_LENGTH`` tokens) and decoded with beam search.

    Args:
        article: Raw article text.
        max_len: Maximum length of the generated sequence, in tokens.
        min_len: Minimum length of the generated sequence, in tokens.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded summary with special tokens removed, or "" when the
        article is empty or whitespace-only.
    """
    # Nothing to summarize: return "" (as the extractive summarizer does)
    # rather than letting the model generate text from the bare prefix.
    if not article.strip():
        return ""

    # T5 selects the task from a text prefix.
    input_text = "summarize: " + article

    # Encode the input, truncating long articles to MAX_LENGTH tokens.
    inputs = t5_tokenizer(input_text, max_length=MAX_LENGTH, return_tensors='pt', truncation=True)

    # Beam-search generation. The tokenizer's attention mask is passed
    # explicitly instead of leaving generate() to infer one.
    summary_ids = t5_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=num_beams,
        max_length=max_len,
        min_length=min_len,
        length_penalty=2.0,
        early_stopping=True
    )

    # Select the single returned sequence by index; squeeze() would also
    # collapse a length-1 sequence into a 0-d tensor.
    abstractive_summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return abstractive_summary

In [None]:
import nltk

# Make sure the Punkt tokenizer resources are available. All lookups share a
# single try block, so a miss on any one of them triggers the download of
# every listed resource.
punkt_resources = (
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('tokenizers/punkt', 'punkt'),
)

try:
    for resource_path, _ in punkt_resources:
        nltk.data.find(resource_path)
except LookupError:
    for _, package_name in punkt_resources:
        nltk.download(package_name)


In [None]:
# Pick one test example and keep its article and reference highlights at hand.
sample_index = 42
sample_record = dataset[sample_index]
article_text = sample_record['article']
reference_summary = sample_record['highlights']

# Extractive (BERT) summary and its ROUGE scores against the reference.
ext_summary = bert_extractive_summarize(article_text, max_sentences=4)
ext_scores = scorer.score(reference_summary, ext_summary)

# Abstractive (T5) summary and its ROUGE scores against the reference.
abs_summary = t5_abstractive_summarize(article_text)
abs_scores = scorer.score(reference_summary, abs_summary)

# Report: article excerpt, reference summary, then one section per system.
report_sections = (
    ("\n--- 1. Extractive (BERT) ---", ext_summary, ext_scores),
    ("\n--- 2. Abstractive (T5) ---", abs_summary, abs_scores),
)

print("--------------------------------------------------")
print(f"Văn bản gốc (trích đoạn): {article_text[:400]}...")
print("\n--- Tóm tắt Tham chiếu ---")
print(reference_summary)

for heading, summary_text, rouge in report_sections:
    print(heading)
    print(summary_text)
    print(f"ROUGE-L F1: {rouge['rougeL'].fmeasure:.4f}")
print("--------------------------------------------------")

In [None]:
from datasets import load_dataset
import numpy as np


# Number of test articles to evaluate; raise for a more reliable estimate.
N_SAMPLES = 100
dataset_subset = load_dataset("cnn_dailymail", "3.0.0", split=f"test[:{N_SAMPLES}]")


# Per-sample ROUGE F1 scores, keyed by system and then by metric.
results = {
    'extractive': {'rouge1': [], 'rouge2': [], 'rougeL': []},
    'abstractive': {'rouge1': [], 'rouge2': [], 'rougeL': []}
}

print(f"Bắt đầu đánh giá ROUGE trên {N_SAMPLES} mẫu...")

# Iterate over the subset itself so a split shorter than N_SAMPLES cannot
# cause an out-of-range index.
for i, sample in enumerate(dataset_subset):
    article = sample['article']
    reference = sample['highlights']

    # Extractive (BERT). A failure is reported and the sample is skipped.
    try:
        ext_summary = bert_extractive_summarize(article, max_sentences=4)
        ext_scores = scorer.score(reference, ext_summary)
    except Exception as e:
        print(f"Lỗi Extractive ở mẫu {i}: {e}")
        ext_scores = None

    # Abstractive (T5). Only attempted when the extractive step succeeded.
    abs_scores = None
    if ext_scores is not None:
        try:
            abs_summary = t5_abstractive_summarize(article)
            abs_scores = scorer.score(reference, abs_summary)
        except Exception as e:
            print(f"Lỗi Abstractive ở mẫu {i}: {e}")
            abs_scores = None

    # Record scores only when BOTH systems produced one, so the two averages
    # are always computed over exactly the same set of samples.
    if ext_scores is not None and abs_scores is not None:
        for key in results['extractive']:
            results['extractive'][key].append(ext_scores[key].fmeasure)
        for key in results['abstractive']:
            results['abstractive'][key].append(abs_scores[key].fmeasure)

    # Progress is reported even when the current sample failed.
    if (i + 1) % 10 == 0:
        print(f"Đã xử lý {i + 1}/{N_SAMPLES} mẫu.")

# Final averages; NaN (without a NumPy warning) when nothing was scored.
avg_ext_results = {k: (np.mean(v) if v else float('nan')) for k, v in results['extractive'].items()}
avg_abs_results = {k: (np.mean(v) if v else float('nan')) for k, v in results['abstractive'].items()}

print("\n--- KẾT QUẢ ROUGE TRUNG BÌNH ---")
print("Extractive (BERT):")
for k, v in avg_ext_results.items():
    print(f"  {k.upper()} F1: {v:.4f}")

print("\nAbstractive (T5):")
for k, v in avg_abs_results.items():
    print(f"  {k.upper()} F1: {v:.4f}")
