In [1]:
%%capture
%pip install "protobuf<=3.20.3" --force-reinstall
%pip install transformers evaluate
%env TOKENIZERS_PARALLELISM=false

In [2]:
from transformers import (
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    DataCollatorForSeq2Seq,
    AutoTokenizer, 
    AutoModelForSeq2SeqLM
)
import torch, evaluate
import torch.nn as nn
import pandas as pd
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
import gc
gc.collect()
torch.cuda.empty_cache()

from IPython.display import clear_output
clear_output()


# Data preproessing

In [3]:
def prepare_datasets(train_df, val_df, test_df, tokenizer, schema_text: str, max_input_length=512, max_target_length=128):
    
    def preprocess_function(examples):

        # Extract inputs and targets from the batch
        inputs = ["[Q]\n" + str(q) + "\n\n" + schema_text for q in examples['question']]
        targets = [str(a) for a in examples['answer']]
        
        # Tokenize inputs
        model_inputs = tokenizer(
            inputs,
            max_length=max_input_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors=None,  # Return lists for batched processing
        )
        
        # Tokenize targets
        target_encodings = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors=None,  # Return lists for batched processing
        )
        
        # Process labels and replace padding tokens with -100
        model_inputs["labels"] = []
        for label_sequence in target_encodings["input_ids"]:
            # Replace padding token IDs with -100 so they're ignored in loss calculation
            processed_labels = [
                -100 if token_id == tokenizer.pad_token_id else token_id 
                for token_id in label_sequence
            ]
            model_inputs["labels"].append(processed_labels)
        
        return model_inputs
    
    print(f"Train: {len(train_df)} samples, Val: {len(val_df)} samples, Test: {len(test_df)} samples")
    
    # Convert to datasets
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    print("Original dataset columns:", train_dataset.column_names)
    
    # Tokenize datasets
    print("Tokenizing datasets...")
    train_dataset = train_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=train_dataset.column_names,
        desc="Tokenizing train dataset"
    )
    val_dataset = val_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=val_dataset.column_names,
        desc="Tokenizing validation dataset"
    )
    test_dataset = test_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=test_dataset.column_names,
        desc="Tokenizing test dataset"
    )
    
    print("Processed dataset columns:", train_dataset.column_names)
    print(f"Dataset shapes - Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
    
    return train_dataset, val_dataset, test_dataset

# Training Configuration

In [4]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class TrainingConfig:
    """Configuration class for training parameters"""
    model_name: str = "VietAI/vit5-base"
    output_dir: str = "./results"
    
    # Training hyperparameters
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 1
    per_device_eval_batch_size: int = 2
    gradient_accumulation_steps: int = 8
    learning_rate: float = 5e-5
    lr_scheduler_type: str = "cosine" # added this
    weight_decay: float = 0.01
    warmup_steps: int = 500
    
    # Sequence lengths
    max_input_length: int = 512
    max_target_length: int = 128
    
    # Generation parameters
    num_beams: int = 4
    max_new_tokens: int = 128
    
    # Evaluation and saving
    eval_strategy: str = "epoch"
    eval_steps: int = 100
    save_strategy: str = "epoch"
    save_steps: int = 200
    save_total_limit: int = 3
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_rougeLsum"
    greater_is_better: bool = True
    
    # Logging
    logging_dir: Optional[str] = None
    logging_steps: int = 100
    logging_strategy: str = "steps"
    report_to: str = "none"
    
    # Other settings
    dataloader_num_workers: int = 4
    fp16: bool = True
    seed: int = 42
    disable_tqdm: bool = False
    
    def __post_init__(self):
        if self.logging_dir is None:
            self.logging_dir = f"{self.output_dir}/logs"


# Main pipeline

In [5]:
class NLQ2CypherTrainer:
    """
    Main training pipeline for summarization models
    """
    
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.tokenizer = None
        self.model = None
        self.trainer = None
        
    def setup_model_and_tokenizer(self):
        """Load model and tokenizer"""
        print(f"Loading model and tokenizer: {self.config.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_name)
        
        special_tokens = {
            'additional_special_tokens': ['[N]', '[R]', '[Q]']
        }
        self.tokenizer.add_special_tokens(special_tokens)
        
        self.model.resize_token_embeddings(len(self.tokenizer))
        
        # Add special tokens if needed
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        print(f"Model loaded with {self.model.num_parameters():,} parameters")
        
    def prepare_training_args(self):
        """Prepare training arguments"""
        return Seq2SeqTrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            lr_scheduler_type=self.config.lr_scheduler_type,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            logging_dir=self.config.logging_dir,
            logging_steps=self.config.logging_steps,
            logging_strategy=self.config.logging_strategy,
            eval_strategy=self.config.eval_strategy,
            eval_steps=self.config.eval_steps,
            save_strategy=self.config.save_strategy,
            save_steps = self.config.save_steps,
            save_total_limit=self.config.save_total_limit,
            load_best_model_at_end=self.config.load_best_model_at_end,
            metric_for_best_model=self.config.metric_for_best_model,
            greater_is_better=self.config.greater_is_better,
            dataloader_num_workers=self.config.dataloader_num_workers,
            fp16=self.config.fp16,
            seed=self.config.seed,
            report_to=self.config.report_to,
            max_grad_norm=1.0,
            # Generation parameters for evaluation
            predict_with_generate=True,
            generation_max_length=self.config.max_new_tokens,
            generation_num_beams=self.config.num_beams,
            # Include loss in metrics
            include_inputs_for_metrics=False,
        )
        
    def compute_metrics(self, eval_pred):
        """Compute metrics for evaluation"""
        predictions, labels = eval_pred
        
        # Handle tuple predictions (when model returns loss and logits)
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        
        # When predict_with_generate=True, predictions are already token IDs
        # Ensure predictions are integers (token IDs)
        if predictions.dtype == np.float32 or predictions.dtype == np.float64:
            # If predictions are logits, take argmax
            predictions = np.argmax(predictions, axis=-1)
        
        # Ensure predictions are the right integer type and clip to valid range
        predictions = predictions.astype(np.int64)
        
        # Clip predictions to valid token ID range (0 to vocab_size - 1)
        vocab_size = self.tokenizer.vocab_size
        predictions = np.clip(predictions, 0, vocab_size - 1)
        
        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
        labels = labels.astype(np.int64)
        
        # Clip labels to valid token ID range as well
        labels = np.clip(labels, 0, vocab_size - 1)
        
        # Decode predictions and labels
        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        # Compute exact match
        exact_matches = [pred.strip() == label.strip() 
                        for pred, label in zip(decoded_preds, decoded_labels)]
        exact_match = sum(exact_matches) / len(decoded_preds) if len(decoded_preds) > 0 else 0.0
        
        print(f"Exact Match Score: {exact_match:.4f} ({sum(exact_matches)}/{len(decoded_preds)})")
        
        return {"exact_match": exact_match}
    
    def train(self, train_dataset, val_dataset):
        """Train the model"""
        if self.tokenizer is None or self.model is None:
            raise ValueError("Model and tokenizer must be set up first")
            
        # Prepare training arguments
        training_args = self.prepare_training_args()
        
        # Data collator
        data_collator = DataCollatorForSeq2Seq(
            self.tokenizer,
            model=self.model,
            label_pad_token_id=-100,
        )
        
        # Initialize custom trainer with Cross Entropy Loss
        self.trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
        )
        
        print(f"Total training steps: {len(train_dataset) // (self.config.per_device_train_batch_size * self.config.gradient_accumulation_steps) * self.config.num_train_epochs}")
        train_result = self.trainer.train()
        
        # Save the final model
        self.trainer.save_model()
        self.tokenizer.save_pretrained(self.config.output_dir)
        
        return train_result
    
    def evaluate(self, test_dataset):
        """Evaluate the model on test dataset"""
        if self.trainer is None:
            raise ValueError("Model must be trained first")
            
        print("Evaluating on test dataset...")
        eval_result = self.trainer.evaluate(test_dataset)
        
        return eval_result
    
    def generate_cypher_query(self, text, max_length=None, num_beams=None):
        """Generate query for a single text"""
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model and tokenizer must be set up first")
            
        max_length = max_length or self.config.max_new_tokens
        num_beams = num_beams or self.config.num_beams
        
        inputs = self.tokenizer(
            text, 
            max_length=self.config.max_input_length, 
            truncation=True, 
            return_tensors="pt"
        )
        
        with torch.no_grad():
            query_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_length,
                num_beams=num_beams,
                early_stopping=True
            )
        
        result = self.tokenizer.decode(query_ids[0], skip_special_tokens=True)
        return result

# Run

In [6]:
######## Define input and output path ########
model_name = "vinai/bartpho-syllable"
input_path = "/kaggle/input/vikg-nlq-2-cypher/ViKG-NLQ-2-Cypher-data-cleaned.csv"
output_path = "/kaggle/working/{}-NLQ2Cypher".format(model_name.split('/')[-1].strip())

######## Configuration for bartpho ########
bartpho_config = TrainingConfig(
    model_name=model_name,
    output_dir=output_path,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    max_input_length=512,
    max_target_length=128,
    num_beams=5,
    max_new_tokens=128,
    eval_strategy="epoch", # use steps in main run
    save_strategy="epoch", # use steps in main run
    warmup_steps=270,
    eval_steps=135,
    save_steps = 135,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,
    logging_dir="/kaggle/working/logs",
    logging_steps=50, # full run 100 steps
    fp16=True,
    seed=42,
    disable_tqdm=False,
)

######## Initialize Trainer ########
bartpho_trainer = NLQ2CypherTrainer(bartpho_config)

######## Setup model and tokenizer ########
bartpho_trainer.setup_model_and_tokenizer()

######## Load dataset && preprocessing ########
df = pd.read_csv(input_path, encoding="utf-8")
print("------ Dataset info ------")
print(df.info())
print("--------------------------")

####### Train - Val - Test splitting ########
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)
del temp_df

######## Prepare datasets ########

schema_text = """[N]
- DRUG: Tên chế phẩm, vắc xin, sinh phẩm.
- CHEMICAL: Hóa chất, hoạt chất, tá dược, thuốc thử.
- DISEASE: Bệnh lý hoặc triệu chứng lâm sàng.
- ORGANISM: Vi khuẩn, virus, dòng tế bào.
- TEST_METHOD: Kỹ thuật kiểm nghiệm.
- STANDARD: Chỉ số kỹ thuật (pH, nồng độ, hiệu giá,...).
- STORAGE_CONDITION: Điều kiện môi trường lưu giữ.
- PRODUCTION_METHOD: Công nghệ bào chế/sản xuất.

[R]
- TREATS: Thuốc/Hoạt chất điều trị Bệnh.
- CONTAINS: Thành phần/Tá dược có trong Thuốc.
- TARGETS: Hoạt chất tác động lên Vi sinh vật/Cơ quan.
- HAS_STANDARD: Thực thể có Chỉ số kỹ thuật/Tiêu chuẩn.
- TESTED_BY: Thuốc được kiểm nghiệm bằng Phương pháp.
- REQUIRES: Phương pháp cần có Hóa chất/Thuốc thử/Thiết bị.
- PRODUCED_BY: Thuốc được sản xuất bởi Phương pháp/Vi sinh vật.
- STORED_AT: Thuốc được bảo quản tại Điều kiện môi trường.
"""

train_dataset, val_dataset, test_dataset = prepare_datasets(
    train_df,
    val_df,
    test_df,
    tokenizer=bartpho_trainer.tokenizer,
    schema_text=schema_text,
    max_input_length=bartpho_config.max_input_length,
    max_target_length=bartpho_config.max_target_length
)
print(f"Dataset sizes after tokenization:")
print(f"Train: {len(train_dataset)}")
print(f"Validation: {len(val_dataset)}")
print(f"Test: {len(test_dataset)}")
print(f"\nSteps per epoch: {len(train_dataset) // (bartpho_config.per_device_train_batch_size * bartpho_config.gradient_accumulation_steps)}")
print(f"Validate every {bartpho_config.eval_steps} steps.")
print(f"Save every {bartpho_config.save_steps} steps.")

Loading model and tokenizer: vinai/bartpho-syllable


config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model loaded with 395,817,984 parameters
------ Dataset info ------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5412 entries, 0 to 5411
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5412 non-null   int64 
 1   question    5412 non-null   object
 2   answer      5412 non-null   object
dtypes: int64(1), object(2)
memory usage: 127.0+ KB
None
--------------------------
Train: 4329 samples, Val: 542 samples, Test: 541 samples
Original dataset columns: ['Unnamed: 0', 'question', 'answer']
Tokenizing datasets...


Tokenizing train dataset:   0%|          | 0/4329 [00:00<?, ? examples/s]

Tokenizing validation dataset:   0%|          | 0/542 [00:00<?, ? examples/s]

Tokenizing test dataset:   0%|          | 0/541 [00:00<?, ? examples/s]

Processed dataset columns: ['input_ids', 'attention_mask', 'labels']
Dataset shapes - Train: 4329, Val: 542, Test: 541
Dataset sizes after tokenization:
Train: 4329
Validation: 542
Test: 541

Steps per epoch: 270
Validate every 135 steps.
Save every 135 steps.


# Training

In [7]:
print("Starting training...")
train_result = bartpho_trainer.train(train_dataset, val_dataset)
print("Training completed.")

Starting training...
Total training steps: 2700


Epoch,Training Loss,Validation Loss,Exact Match
1,0.1825,0.101673,0.367159
2,0.0885,0.052039,0.535055
3,0.0534,0.043273,0.601476
4,0.0512,0.041139,0.618081
5,0.0403,0.036428,0.618081
6,0.0366,0.035021,0.653137
7,0.0277,0.035049,0.656827
8,0.0253,0.034296,0.658672
9,0.024,0.034071,0.651292
10,0.0227,0.034226,0.656827


Exact Match Score: 0.3672 (199/542)
Exact Match Score: 0.5351 (290/542)
Exact Match Score: 0.6015 (326/542)
Exact Match Score: 0.6181 (335/542)
Exact Match Score: 0.6181 (335/542)
Exact Match Score: 0.6531 (354/542)
Exact Match Score: 0.6568 (356/542)
Exact Match Score: 0.6587 (357/542)
Exact Match Score: 0.6513 (353/542)
Exact Match Score: 0.6568 (356/542)


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed.


## Evaluation

In [8]:
test_result = bartpho_trainer.evaluate(test_dataset)

# Save the test results
import json
with open("/kaggle/working/test_results.json", "w") as f:
    json.dump(test_result, f, indent=4)

# Generate and save predictions
print("\nGenerating predictions on test set...")
predictions_output = bartpho_trainer.trainer.predict(test_dataset)
predicted_ids = predictions_output.predictions

# FIX: Clip predictions to valid vocabulary range
vocab_size = bartpho_trainer.tokenizer.vocab_size
predicted_ids = np.clip(predicted_ids, 0, vocab_size - 1)

# Decode predictions
predicted_texts = bartpho_trainer.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

# Get ground truth labels
labels = predictions_output.label_ids
labels = np.where(labels != -100, labels, bartpho_trainer.tokenizer.pad_token_id)

# FIX: Clip labels to valid vocabulary range
labels = np.clip(labels, 0, vocab_size - 1)

# Decode labels
ground_truth_texts = bartpho_trainer.tokenizer.batch_decode(labels, skip_special_tokens=True)

# Save predictions to CSV
results_df = pd.DataFrame({
    'prediction': predicted_texts,
    'ground_truth': ground_truth_texts,
    'match': [pred.strip() == gt.strip() for pred, gt in zip(predicted_texts, ground_truth_texts)]
})
results_df.to_csv("/kaggle/working/test_predictions.csv", index=False, encoding="utf-8")
print(f"Saved {len(results_df)} predictions to test_predictions.csv")
print(f"Exact match rate: {results_df['match'].mean():.2%}")

# Show some sample predictions
print("\n=== Sample Test Predictions ===")
for i in range(min(3, len(results_df))):
    print(f"\nExample {i+1}:")
    print(f"Prediction: {results_df.iloc[i]['prediction']}")
    print(f"Ground Truth: {results_df.iloc[i]['ground_truth']}")
    print(f"Match: {results_df.iloc[i]['match']}")

Evaluating on test dataset...


Exact Match Score: 0.6599 (357/541)

Generating predictions on test set...
Exact Match Score: 0.6599 (357/541)
Saved 541 predictions to test_predictions.csv
Exact match rate: 65.99%

=== Sample Test Predictions ===

Example 1:
Prediction: MATCH (d:D)-[:TESTEDBY](t:TESTMETHOD) WHERE toLower(d.id) CONTAINS "dành dành (quả)" RETURN t.id
Ground Truth: MATCH (d:D)-[:TESTEDBY](tm:TESTMETHOD) WHERE toLower(d.id) CONTAINS "dành dành (quả)" RETURN tm.id
Match: False

Example 2:
Prediction: MATCH (t:TESTMETHOD)-[:REQUIRES](c:CHEMICAL) WHERE toLower(t.id) CONTAINS "định tính a" RETURN c.id
Ground Truth: MATCH (t:TESTMETHOD)-[:REQUIRES](c:CHEMICAL) WHERE toLower(t.id) CONTAINS "định tính a" RETURN c.id
Match: True

Example 3:
Prediction: MATCH (t:TESTMETHOD)-[:REQUIRES](c:CHEMICAL) WHERE toLower(t.id) CONTAINS "rt-pcr" RETURN c.id
Ground Truth: MATCH (t:TESTMETHOD)-[:REQUIRES](c:CHEMICAL) WHERE toLower(t.id) CONTAINS "rt-pcr" RETURN c.id
Match: True
