In [1]:
# Install dependencies
!pip install transformers torch pandas keybert scikit-learn tqdm rouge_score bert_score

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-n

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from keybert import KeyBERT
from torch.utils.data import Dataset
from tqdm import tqdm
import numpy as np
import os

# Preprocess dataset to extract title keywords
def preprocess_dataset(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    kw_model = KeyBERT()
    
    def extract_keywords(title):
        try:
            keywords = kw_model.extract_keywords(
                title, keyphrase_ngram_range=(1, 7), top_n=10
            )
            return ' '.join([kw[0] for kw in keywords])
        except:
            return ''
    
    df['title_keywords'] = df['title'].apply(extract_keywords)
    df.to_csv(output_csv, index=False)
    return df

# Custom Dataset for Flan-T5
class KeywordDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=512):
        self.df = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt = "Extract keywords from the following abstract: "

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        abstract = str(self.df.iloc[idx]['abstract'])
        keywords = str(self.df.iloc[idx]['title_keywords'])

        # Thêm prompt vào đầu abstract
        input_text = f"{self.prompt}{abstract}"
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Mã hóa keywords (đầu ra)
        target_encoding = self.tokenizer(
            keywords,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Training Function for Flan-T5
def train_flan_t5(model, train_dataset, val_dataset, device, output_dir='/kaggle/working/flan_t5_finetuned'):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        fp16=torch.cuda.is_available(),  # Sử dụng mixed precision nếu có GPU
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    trainer.train()

    # Lưu mô hình
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Main Execution
if __name__ == "__main__":
    # File paths
    train_csv = '/kaggle/input/springer-journal-final/train.csv'
    val_csv = '/kaggle/input/springer-journal-final/val.csv'
    processed_train_csv = '/kaggle/working/processed_train.csv'
    processed_val_csv = '/kaggle/working/processed_val.csv'

    # Preprocess datasets
    preprocess_dataset(train_csv, processed_train_csv)
    preprocess_dataset(val_csv, processed_val_csv)

    # Initialize device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')
    model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base').to(device)

    # Initialize datasets
    train_dataset = KeywordDataset(processed_train_csv, tokenizer)
    val_dataset = KeywordDataset(processed_val_csv, tokenizer)

    # Train Flan-T5
    train_flan_t5(model, train_dataset, val_dataset, device)

    # Save keyword vocabulary (tương tự code gốc)
    train_data = pd.read_csv(processed_train_csv)
    train_keywords = train_data['title_keywords'].apply(lambda x: x.split() if x else []).values
    keyword_counts = {}
    for sublist in train_keywords:
        for kw in sublist:
            keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
    top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:1000]
    keyword_vocab = [kw for kw, _ in top_keywords]
    with open('/kaggle/working/keyword_vocab.txt', 'w') as f:
        f.write('\n'.join(keyword_vocab))

2025-05-14 01:07:52.421132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747184872.625613      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747184872.687081      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


  if operator(metric_value, self.state.best_metric):
  if operator(metric_value, self.state.best_metric):
  if operator(metric_value, self.state.best_metric):


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import os
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# Preprocess dataset to extract title keywords
def preprocess_dataset(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    kw_model = KeyBERT()
    
    def extract_keywords(title):
        try:
            keywords = kw_model.extract_keywords(
                title, keyphrase_ngram_range=(1, 7), top_n=10
            )
            return ' '.join([kw[0] for kw in keywords])
        except:
            return ''
    
    df['title_keywords'] = df['title'].apply(extract_keywords)
    df.to_csv(output_csv, index=False)
    return df

# Flan-T5-based keyword prediction function
def predict_keywords(abstract, flan_model, flan_tokenizer, max_len=512, device='cuda'):
    flan_model.eval()
    prompt = "Extract keywords from the following abstract: "
    input_text = f"{prompt}{abstract}"
    
    encoding = flan_tokenizer(
        input_text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    with torch.no_grad():
        outputs = flan_model.generate(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
            max_length=128,
            num_beams=5
        )
    
    keywords = flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Chuyển chuỗi từ khóa thành danh sách
    predicted_keywords = [kw.strip() for kw in keywords.split(',') if kw.strip()]
    return predicted_keywords

# Custom Dataset
class ScientificPaperDataset(Dataset):
    def __init__(self, csv_file, flan_tokenizer, flan_keyword_model, flan_keyword_tokenizer, max_length=512, device='cuda'):
        self.df = pd.read_csv(csv_file)
        self.flan_tokenizer = flan_tokenizer
        self.flan_keyword_model = flan_keyword_model
        self.flan_keyword_tokenizer = flan_keyword_tokenizer
        self.max_length = max_length
        self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        abstract = str(self.df.iloc[idx]['abstract'])
        title = str(self.df.iloc[idx]['title'])
        title_keywords = str(self.df.iloc[idx]['title_keywords'])

        # Predict abstract keywords using Flan-T5
        abstract_keywords = predict_keywords(abstract, self.flan_keyword_model, self.flan_keyword_tokenizer, device=self.device)
        abstract_keywords_text = ' '.join(abstract_keywords) if abstract_keywords else ''

        # Input for Flan-T5: Flan-T5-predicted keywords + abstract
        input_text = f"Instruction: Generate a concise and informative title for scientific research based on keywords: {abstract_keywords_text} \n and abstract:\n{abstract}"

        input_encoding = self.flan_tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        title_encoding = self.flan_tokenizer(
            title,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'title_keywords': title_keywords,
            'abstract_keywords': abstract_keywords_text,
            'title': title
        }

# Multi-task Flan-T5 Model
class MultiTaskFlanT5(nn.Module):
    def __init__(self, flan_model_name='google/flan-t5-base'):
        super(MultiTaskFlanT5, self).__init__()
        self.flan = T5ForConditionalGeneration.from_pretrained(flan_model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(flan_model_name)
        self.tfidf = TfidfVectorizer(max_features=1000)

    def forward(self, input_ids, attention_mask, title_input_ids, title_keywords, abstract_keywords):
        outputs = self.flan(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=title_input_ids
        )
        title_loss = outputs.loss

        try:
            keyword_vectors = self.tfidf.fit_transform([title_keywords, abstract_keywords]).toarray()
            title_kw_vector = torch.tensor(keyword_vectors[0], dtype=torch.float32)
            abstract_kw_vector = torch.tensor(keyword_vectors[1], dtype=torch.float32)
            cosine_sim = nn.functional.cosine_similarity(title_kw_vector, abstract_kw_vector, dim=0)
            keyword_loss = 1 - cosine_sim
        except:
            keyword_loss = torch.tensor(0.0, device=input_ids.device)

        return title_loss, keyword_loss

# Training and Validation Function
def train_and_validate(model, train_loader, val_loader, device, epochs=3, lr=5e-5, title_loss_weight=1.0, keyword_loss_weight=0.5):
    optimizer = AdamW(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            title_input_ids = batch['title_input_ids'].to(device)
            title_keywords = batch['title_keywords']
            abstract_keywords = batch['abstract_keywords']

            title_loss, keyword_loss = model(
                input_ids, attention_mask, title_input_ids,
                title_keywords, abstract_keywords
            )

            loss = title_loss_weight * title_loss + keyword_loss_weight * keyword_loss
            total_train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}")

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                title_input_ids = batch['title_input_ids'].to(device)
                title_keywords = batch['title_keywords']
                abstract_keywords = batch['abstract_keywords']

                title_loss, keyword_loss = model(
                    input_ids, attention_mask, title_input_ids,
                    title_keywords, abstract_keywords
                )

                loss = title_loss_weight * title_loss + keyword_loss_weight * keyword_loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch + 1}, Average Validation Loss: {avg_val_loss:.4f}")

# Evaluation Function with ROUGE and BERTScore
def evaluate_model(model, test_loader, device):
    model.eval()
    generated_titles = []
    reference_titles = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            titles = batch['title']

            # Generate titles
            generated_ids = model.flan.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=32,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
            generated_title = [model.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

            generated_titles.extend(generated_title)
            reference_titles.extend(titles)

            # Compute ROUGE scores
            for gen, ref in zip(generated_title, titles):
                scores = scorer.score(ref, gen)
                for metric in rouge_scores:
                    rouge_scores[metric].append(scores[metric].fmeasure)

    # Compute BERTScore
    try:
        P, R, F1 = bert_score(generated_titles, reference_titles, lang="en", verbose=True)
        bertscore_f1 = F1.mean().item()
    except:
        bertscore_f1 = 0.0

    # Average ROUGE scores
    avg_rouge = {metric: np.mean(scores) for metric, scores in rouge_scores.items()}

    # Print results
    print("\nEvaluation Results:")
    print(f"ROUGE-1: {avg_rouge['rouge1']:.4f}")
    print(f"ROUGE-2: {avg_rouge['rouge2']:.4f}")
    print(f"ROUGE-L: {avg_rouge['rougeL']:.4f}")
    print(f"ROUGE-Lsum: {avg_rouge['rougeLsum']:.4f}")
    print(f"BERTScore F1: {bertscore_f1:.4f}")

    # Save generated titles
    results_df = pd.DataFrame({
        'reference_title': reference_titles,
        'generated_title': generated_titles
    })
    results_df.to_csv('/kaggle/working/generated_titles.csv', index=False)

    return avg_rouge, bertscore_f1

# Main Execution
if __name__ == "__main__":
    # File paths
    train_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/train.csv'
    val_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/val.csv'
    test_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/test.csv'
    processed_train_csv = '/kaggle/working/processed_train.csv'
    processed_val_csv = '/kaggle/working/processed_val.csv'
    processed_test_csv = '/kaggle/working/processed_test.csv'

    # Preprocess datasets
    preprocess_dataset(train_csv, processed_train_csv)
    preprocess_dataset(val_csv, processed_val_csv)
    preprocess_dataset(test_csv, processed_test_csv)

    # Initialize device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load Flan-T5 model and tokenizer for keyword prediction
    flan_model_path = '/kaggle/working/flan_t5_finetuned'
    if not os.path.exists(flan_model_path):
        raise FileNotFoundError(f"Flan-T5 model directory {flan_model_path} does not exist. Please run the fine-tuning script first.")
    flan_keyword_model = T5ForConditionalGeneration.from_pretrained(flan_model_path).to(device)
    flan_keyword_tokenizer = T5Tokenizer.from_pretrained(flan_model_path)

    # Initialize Flan-T5 tokenizer and datasets
    flan_tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')
    train_dataset = ScientificPaperDataset(processed_train_csv, flan_tokenizer, flan_keyword_model, flan_keyword_tokenizer, device=device)
    val_dataset = ScientificPaperDataset(processed_val_csv, flan_tokenizer, flan_keyword_model, flan_keyword_tokenizer, device=device)
    test_dataset = ScientificPaperDataset(processed_test_csv, flan_tokenizer, flan_keyword_model, flan_keyword_tokenizer, device=device)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # Initialize Flan-T5 model
    flan_model = MultiTaskFlanT5().to(device)

    # Train and validate
    train_and_validate(flan_model, train_loader, val_loader, device, epochs=3, lr=5e-5, 
                       title_loss_weight=1.0, keyword_loss_weight=0.5)

    # Evaluate on test set
    rouge_results, bertscore_result = evaluate_model(flan_model, test_loader, device)

    # Save the model
    flan_model.flan.save_pretrained('/kaggle/working/multi_task_flan_t5_model')
    flan_model.tokenizer.save_pretrained('/kaggle/working/multi_task_flan_t5_model')

Training Epoch 1: 100%|██████████| 446/446 [1:12:09<00:00,  9.71s/it]


Epoch 1, Average Training Loss: 2.2186


Validation Epoch 1: 100%|██████████| 56/56 [08:36<00:00,  9.22s/it]


Epoch 1, Average Validation Loss: 0.2270


Training Epoch 2: 100%|██████████| 446/446 [1:11:21<00:00,  9.60s/it]


Epoch 2, Average Training Loss: 0.2267


Validation Epoch 2: 100%|██████████| 56/56 [08:44<00:00,  9.37s/it]


Epoch 2, Average Validation Loss: 0.2029


Training Epoch 3: 100%|██████████| 446/446 [1:11:46<00:00,  9.66s/it]


Epoch 3, Average Training Loss: 0.1998


Validation Epoch 3: 100%|██████████| 56/56 [08:41<00:00,  9.31s/it]


Epoch 3, Average Validation Loss: 0.1969


Evaluating: 100%|██████████| 56/56 [09:09<00:00,  9.81s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 1.17 seconds, 190.37 sentences/sec

Evaluation Results:
ROUGE-1: 0.6357
ROUGE-2: 0.4435
ROUGE-L: 0.5430
ROUGE-Lsum: 0.5430
BERTScore F1: 0.9193
