In [None]:
%%capture
%pip install transformers datasets accelerate evaluate bert_score rouge_score bitsandbytes

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
# Report the CUDA setup. The device-specific calls are only made when a CUDA
# device exists, because they raise otherwise.
print("CUDA is available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("CUDA device name:", torch.cuda.get_device_name(0))
    # Start from a clean allocator state so peak-memory stats reflect this run.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

import evaluate
import pandas as pd
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration,
    BartForConditionalGeneration,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)

# Prepare Data

In [None]:
# Hub identifiers of the title model and of the keyword-extraction model
keyword_model_name = "bloomberg/KeyBART"
model_name = "google/flan-t5-base"

# Input data location on Kaggle
datapath = "/kaggle/input/springerjournal-450tk-0-7cosine/"

# Short alias = last path component of the hub id; it names every output artifact
model_alias = model_name.rsplit('/', 1)[-1].strip()
savepath = f"/kaggle/working/custom-{model_alias}"
trainer_log_dir = f"/kaggle/working/{model_alias}_logs"
trainer_output_dir = f"/kaggle/working/{model_alias}_output"

print("Save path:\t", savepath)
print("Log path:\t", trainer_log_dir)
print("Output path:\t", trainer_output_dir)

# Load the train/val/test CSV files found under datapath as one DatasetDict
dataset = load_dataset(
    'csv',
    data_files={
        split_name: os.path.join(datapath, file_name)
        for split_name, file_name in (
            ('train', 'train.csv'),
            ('val', 'val.csv'),
            ('test', 'test.csv'),
        )
    },
)
train_dataset, val_dataset, test_dataset = dataset['train'], dataset['val'], dataset['test']

# Print the first record of every split as a quick sanity check
print("Train dataset sample:", train_dataset[0])
print("Validation dataset sample:", val_dataset[0])
print("Test dataset sample:", test_dataset[0])

# Tokenizer of the title model, plus the KeyBART tokenizer/model pair used for
# keyword extraction (the KeyBART model is moved to the GPU)
tokenizer = AutoTokenizer.from_pretrained(model_name)
keyword_tokenizer = AutoTokenizer.from_pretrained(keyword_model_name)
keyword_model = BartForConditionalGeneration.from_pretrained(keyword_model_name)
keyword_model = keyword_model.to("cuda")

# Function to extract keywords using bloomberg/KeyBART
def extract_keywords(text, max_keywords=10):
    """Extract keywords from ``text`` with the module-level KeyBART model.

    The text is truncated to 512 tokens, decoded with 5-beam search (at most
    150 generated tokens), and the first generated sequence is split on commas.

    Args:
        text: The text to extract keywords from.
        max_keywords: Maximum number of keywords to return (default 10).

    Returns:
        A list of at most ``max_keywords`` non-empty, whitespace-stripped
        keyword strings, in generation order.
    """
    # Put the inputs on whatever device the keyword model lives on instead of
    # assuming "cuda".
    inputs = keyword_tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(keyword_model.device)
    with torch.no_grad():
        outputs = keyword_model.generate(
            **inputs,
            max_length=150,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=7,
            length_penalty=1.0
        )
    keywords_text = keyword_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # NOTE(review): the output is split on commas - confirm this matches the
    # separator KeyBART actually emits between keyphrases.
    candidates = (kw.strip() for kw in keywords_text.split(","))
    # Drop empty fragments (empty generation, trailing or doubled commas) so
    # they neither show up in the prompt nor count as always-matching keywords.
    return [kw for kw in candidates if kw][:max_keywords]

# Preprocess data: Add keywords and prompt
def preprocess_data(examples):
    """Turn a batch of abstract/title pairs into prompt/target/keyword columns.

    Args:
        examples: Batch dict with parallel 'abstract' and 'title' lists.

    Returns:
        Dict with 'input_text' (instruction prompt containing the abstract and
        its extracted keywords), 'target_text' (the title) and 'keywords'
        (the keyword list of each abstract).
    """

    def build_prompt(abstract, keywords):
        # Keywords are appended on their own line after the abstract.
        keywords_str = ", ".join(keywords)
        abstract_with_keywords = f"{abstract}\nKeywords: {keywords_str}"
        return f"Generate a concise and general title that captures the main idea of the following abstract: {abstract_with_keywords}"

    batch = {'input_text': [], 'target_text': [], 'keywords': []}
    for abstract, title in zip(examples['abstract'], examples['title']):
        found = extract_keywords(abstract)
        batch['input_text'].append(build_prompt(abstract, found))
        batch['target_text'].append(title)
        batch['keywords'].append(found)
    return batch

# Apply preprocessing to every split. The raw 'abstract' and 'title' columns
# are removed; preprocess_data returns 'input_text', 'target_text' and
# 'keywords' in their place.
train_processed = train_dataset.map(preprocess_data, batched=True, remove_columns=['abstract', 'title'])
val_processed = val_dataset.map(preprocess_data, batched=True, remove_columns=['abstract', 'title'])
test_processed = test_dataset.map(preprocess_data, batched=True, remove_columns=['abstract', 'title'])

# Tokenize data
def tokenize_function(examples):
    """Tokenize prompts and target titles into fixed-length id sequences.

    Args:
        examples: Batch dict with 'input_text' and 'target_text' lists.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an added 'labels' entry holding the target ids (padded/truncated
        to 128 tokens).
    """
    inputs = tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    # Padding positions keep the pad token id: the label sequences are decoded
    # with batch_decode elsewhere in this file, which needs valid token ids.
    # Masking padding out of the loss (-100) therefore has to happen where the
    # loss is computed, not here.
    inputs["labels"] = labels["input_ids"]
    # No 'decoder_input_ids' are emitted: the unshifted label ids are not valid
    # decoder inputs (the decoder would see the very token it has to predict).
    return inputs

# Tokenize each preprocessed split in batches. No columns are removed here, so
# the 'keywords' column produced during preprocessing is still present on the
# tokenized datasets.
tokenized_train = train_processed.map(tokenize_function, batched=True)
tokenized_val = val_processed.map(tokenize_function, batched=True)
tokenized_test = test_processed.map(tokenize_function, batched=True)

# Custom Loss Function
from torch import nn
import torch.nn.functional as F

class CustomT5ForSeq2SeqLM(T5ForConditionalGeneration):
    """T5 model whose loss is cross-entropy plus a keyword-coverage penalty.

    The penalty is the batch mean of the fraction of each example's keywords
    that do not occur (case-insensitively) in the decoded label text, scaled
    by ``keyword_weight``. It is only applied once ``set_custom_params`` has
    provided a tokenizer and when the batch carries ``keywords``.
    """

    def __init__(self, config):
        super().__init__(config)
        # Both stay None until set_custom_params() is called; forward() skips
        # the penalty while either is missing.
        self.tokenizer = None
        self.keyword_weight = None

    def set_custom_params(self, tokenizer, keyword_weight=1.0):
        """Attach the tokenizer used to decode labels and the penalty weight."""
        self.tokenizer = tokenizer
        self.keyword_weight = keyword_weight

    def forward(self, input_ids=None, attention_mask=None, labels=None, keywords=None, **kwargs):
        """Run the T5 forward pass and add the keyword penalty to the loss.

        Args:
            input_ids: Encoder input ids. Optional so that callers which pass
                other inputs through ``kwargs`` instead still work.
            attention_mask: Encoder attention mask.
            labels: Target token ids; padding may be the pad id or -100.
            keywords: Per-example lists of keyword strings, or None.
            **kwargs: Forwarded unchanged to ``T5ForConditionalGeneration.forward``.

        Returns:
            The parent model's output object with every field preserved and
            ``loss`` increased by the weighted keyword penalty when applicable.
        """
        # Dropped before delegating, exactly as before: it is not forwarded to
        # the parent forward.
        kwargs.pop("num_items_in_batch", None)

        pad_token_id = self.config.pad_token_id
        if self.tokenizer is not None and self.tokenizer.pad_token_id is not None:
            pad_token_id = self.tokenizer.pad_token_id

        # Exclude padding from the cross-entropy: -100 is the ignore index of
        # the loss. Labels already using -100 are left untouched by this.
        loss_labels = labels
        if labels is not None and pad_token_id is not None:
            loss_labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=loss_labels,
            **kwargs
        )

        penalty_enabled = (
            keywords is not None
            and labels is not None
            and self.tokenizer is not None
            and self.keyword_weight is not None
        )
        if penalty_enabled and outputs.loss is not None:
            # -100 is not a valid token id, so map it back to the pad id
            # before decoding.
            decodable = labels
            if pad_token_id is not None:
                decodable = labels.masked_fill(labels < 0, pad_token_id)
            target_texts = self.tokenizer.batch_decode(decodable, skip_special_tokens=True)

            # NOTE(review): the penalty is computed from the decoded *label*
            # ids, not from model predictions, and it is a plain Python
            # number. It shifts the reported loss but contributes no
            # gradient - confirm this is intended.
            keyword_penalty = 0.0
            for keyword_list, text in zip(keywords, target_texts):
                if not keyword_list:
                    continue
                lowered = text.lower()
                missing = sum(1 for keyword in keyword_list if keyword.lower() not in lowered)
                keyword_penalty += missing / len(keyword_list)
            keyword_penalty = keyword_penalty / len(target_texts) if target_texts else 0

            # Update the loss in place so logits, past_key_values and the
            # encoder fields of the output are all kept.
            outputs.loss = outputs.loss + self.keyword_weight * keyword_penalty

        return outputs

# Initialize custom model: load the pretrained weights into the subclass, then
# attach the tokenizer and penalty weight that its forward() reads when it
# computes the keyword penalty.
custom_model = CustomT5ForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
custom_model.set_custom_params(tokenizer=tokenizer, keyword_weight=1.0)

# Custom Data Collator
class CustomDataCollator(DataCollatorForSeq2Seq):
    """Seq2seq collator that carries the per-example 'keywords' lists along.

    The keyword lists are lists of strings, so they are taken out before the
    parent collator builds the batch and re-attached afterwards unchanged.
    """

    def __call__(self, features, return_tensors=None):
        """Collate ``features`` into a batch that also has a 'keywords' entry.

        Args:
            features: List of per-example dicts, each with a 'keywords' key.
            return_tensors: Forwarded to the parent collator (None keeps the
                collator's configured default), matching the parent signature.

        Returns:
            The parent collator's batch with 'keywords' set to the list of
            each example's keyword list, in input order.
        """
        keywords = [feature['keywords'] for feature in features]
        # Build copies without 'keywords' rather than mutating the caller's dicts.
        stripped = [
            {key: value for key, value in feature.items() if key != 'keywords'}
            for feature in features
        ]
        batch = super().__call__(stripped, return_tensors=return_tensors)
        batch['keywords'] = keywords
        return batch

data_collator = CustomDataCollator(model=custom_model, tokenizer=tokenizer)

# Fine-tune model
training_args = TrainingArguments(
    # Output locations and logging
    output_dir=trainer_output_dir,
    logging_dir=trainer_log_dir,
    logging_steps=200,
    report_to="none",
    # Evaluate and checkpoint once per epoch, with a cap of two saved checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # Batching: 2 examples per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    args=training_args,
    model=custom_model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()

# Save fine-tuned model
custom_model.save_pretrained(savepath)
tokenizer.save_pretrained(savepath)

# Free memory
import gc

# Drop every module-level reference to the training-time models. The data
# collator is included because it was built with model=custom_model and would
# otherwise keep the model alive. pop() tolerates names that are already gone
# (for example when this cell is re-run), so one missing name no longer skips
# the rest of the cleanup.
for _name in ("trainer", "data_collator", "custom_model", "keyword_model"):
    globals().pop(_name, None)
del _name

# Collect garbage first so the tensors are actually released, then hand the
# cached blocks back to the device.
gc.collect()
torch.cuda.empty_cache()

# Load fine-tuned model with quantization: 4-bit weights, float16 compute
# dtype, and double quantization enabled.
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
# The checkpoint saved at savepath is reloaded as a plain
# T5ForConditionalGeneration, so the custom forward (keyword penalty) is not
# involved from here on.
model = T5ForConditionalGeneration.from_pretrained(savepath, quantization_config=quantization_conf, device_map="auto")
# Rebinds the module-level tokenizer to the copy saved next to the model.
tokenizer = AutoTokenizer.from_pretrained(savepath)

# Function to predict title
def predict_title(input_text):
    """Generate a title for ``input_text`` with the module-level model.

    The input is truncated to 512 tokens and decoded with 5-beam search
    (at most 128 generated tokens).

    Args:
        input_text: The prompt text to generate a title for.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to("cuda")
    generated = model.generate(**encoded, max_length=128, num_beams=5, early_stopping=True)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Prepare dataloader for test set
def collate_fn(batch):
    """Stack tokenized examples into tensors for evaluation.

    Args:
        batch: List of example dicts with equal-length 'input_ids' and
            'labels' lists and, optionally, 'attention_mask'.

    Returns:
        Dict with 'input_ids' and 'labels' tensors, plus 'attention_mask' when
        every example provides one. Tensors are placed on the GPU when CUDA is
        available and on the CPU otherwise.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def stack(key):
        return torch.tensor([item[key] for item in batch], device=device)

    collated = {"input_ids": stack("input_ids"), "labels": stack("labels")}
    # Keep the attention mask so padded positions can be ignored by the model.
    if batch and all("attention_mask" in item for item in batch):
        collated["attention_mask"] = stack("attention_mask")
    return collated

eval_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=collate_fn)

predictions = []
references = []

# Predict on test set
for batch in tqdm(eval_dataloader):
    # Use the same decoding settings as predict_title. Without an explicit
    # max_length, generate() falls back to the generation config's default
    # length, which can cut titles short of the 128-token label length.
    generate_kwargs = {"max_length": 128, "num_beams": 5, "early_stopping": True}
    if "attention_mask" in batch:
        generate_kwargs["attention_mask"] = batch["attention_mask"]

    with torch.no_grad():
        outputs = model.generate(batch["input_ids"], **generate_kwargs)

    # -100 is not a valid token id; map any such positions back to the pad id
    # before decoding the reference titles.
    label_ids = batch["labels"]
    label_ids = label_ids.masked_fill(label_ids < 0, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(label_ids.tolist(), skip_special_tokens=True)

    predictions.extend(pred_texts)
    references.extend(ref_texts)

# Load both metrics, then score all predictions against their references
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(references=references, predictions=predictions)
bert_scores = bertscore.compute(references=references, predictions=predictions, lang="en")

# Wipe the cell output accumulated so far, so only the results remain visible
from IPython.display import clear_output
clear_output()

# Report corpus-level ROUGE and the mean of each per-example BERTScore list
print("ROUGE:", rouge_scores)
print("BERTScore (averaged):")
for heading, metric_key in (
    ("  Precision:", "precision"),
    ("  Recall:", "recall"),
    ("  F1:", "f1"),
):
    metric_values = bert_scores[metric_key]
    print(heading, sum(metric_values) / len(metric_values))

# Compute detailed ROUGE scores for each prediction/reference pair in a single
# call: with use_aggregator=False the metric returns one score per pair
# instead of an aggregate, which replaces one compute() call per pair.
if predictions:
    detailed_rouge = rouge.compute(
        predictions=predictions, references=references, use_aggregator=False
    )
    rouge1_scores = list(detailed_rouge['rouge1'])
    rouge2_scores = list(detailed_rouge['rouge2'])
    rougeL_scores = list(detailed_rouge['rougeL'])
    rougeLsum_scores = list(detailed_rouge['rougeLsum'])
else:
    # Nothing to score; keep the columns empty so the frame below still builds.
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    rougeLsum_scores = []

# Save detailed results to CSV: one row per test example with its reference
# title, predicted title, and per-example ROUGE / BERTScore values.
results_df = pd.DataFrame({
    'true_title': references,
    'predicted_title': predictions,
    'rouge1': rouge1_scores,
    'rouge2': rouge2_scores,
    'rougeL': rougeL_scores,
    'rougeLsum': rougeLsum_scores,
    'bertscore_precision': bert_scores['precision'],
    'bertscore_recall': bert_scores['recall'],
    'bertscore_f1': bert_scores['f1']
})
results_df.to_csv(f'/kaggle/working/{model_alias}_evaluation_results.csv', index=False)
print(f"Đã lưu kết quả chi tiết vào '/kaggle/working/{model_alias}_evaluation_results.csv'")