In [45]:
!pip install rouge_score -q

In [46]:
!pip install --upgrade transformers -q

In [47]:
# Import libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    #AdamW,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from torch.optim import AdamW
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset
import json
import os
from tqdm import tqdm
import logging
from typing import Dict, List, Optional, Tuple
import wandb
from rouge_score import rouge_scorer
import warnings

warnings.filterwarnings('ignore')


In [48]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [49]:
# Configure logging: INFO level, "<timestamp> - <level> - <message>" layout.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by every function in this notebook.
logger = logging.getLogger(__name__)

In [50]:
def setup_config():
    """Build the single configuration dictionary used by the pipeline.

    Returns:
        dict: Data location, model name, training hyper-parameters and
        text-generation settings, keyed by plain strings. The assembled
        dictionary is also written to the log at INFO level.
    """
    # Task identity and where the training data lives.
    run_settings = {
        "task_type": "summarization",
        "csv_file_path": "/kaggle/input/all-data/data_fireant_0407_final.xlsx - Sheet1.tsv",
    }

    # Fine-tuning hyper-parameters, model checkpoint and sequence limits.
    training_settings = {
        "num_epochs": 3,
        "batch_size": 2,
        "learning_rate": 3e-5,
        "model_name": "VietAI/vit5-base",
        "output_dir": "./vit5_summarization_finetuned",
        "max_input_length": 1024,
        "max_target_length": 256,
        "validation_split_size": 0.15,
        "random_state": 42,
    }

    # Decoding parameters intended to reduce repetition in generated text.
    generation_settings = {
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 3,
        "length_penalty": 1.0,
        "num_beams": 4,
        "early_stopping": True,
        "do_sample": False,  # Set to True for more diverse outputs
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 0.95,
    }

    # Merge in declaration order so the logged dictionary keeps a stable layout.
    config = {**run_settings, **training_settings, **generation_settings}
    logger.info(f"üìä C·∫•u h√¨nh ƒë√£ ƒë∆∞·ª£c thi·∫øt l·∫≠p: {config}")
    return config

In [51]:
def load_model_and_tokenizer(model_name: str):
    """Load a pre-trained T5 model with its tokenizer and place the model on a device.

    Args:
        model_name: Checkpoint identifier or path handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is CUDA when
        ``torch.cuda.is_available()`` is true and CPU otherwise; the model has
        already been moved to it.
    """
    logger.info(f"üîÑ ƒêang t·∫£i model v√† tokenizer: '{model_name}'...")

    # Decide the target device first, then load and move the model in one step.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    logger.info(f"‚úÖ T·∫£i th√†nh c√¥ng model '{model_name}' tr√™n device '{device}'")
    return model, tokenizer, device

In [None]:
def load_data_from_tsv(file_path: str) -> list:
    """
    Read a tab-separated file and return its rows as input/target records.

    The file must provide 'content' and 'summary' columns; they are renamed
    to 'input' and 'target'. Rows with a missing value in either column are
    dropped, both columns are cast to stripped strings, and only rows whose
    target is longer than 10 characters and whose input is longer than 50
    characters are kept.

    Args:
        file_path: Path to the TSV file.

    Returns:
        A list of dictionaries (one per remaining row) containing at least
        the 'input' and 'target' keys.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then re-raised).
        ValueError: If a required column is missing (logged, then re-raised).
        Exception: Any other read/parse error is logged and re-raised unchanged.
    """
    required_columns = {'content', 'summary'}
    text_columns = ['input', 'target']

    try:
        logger.info(f"Reading data from file: {file_path}")
        df = pd.read_csv(file_path, sep='\t')

        # Both source columns have to be present before anything else happens.
        if not required_columns.issubset(df.columns):
            raise ValueError("Error: TSV file must contain 'content' and 'summary' columns.")

        # Adopt the {'input': ..., 'target': ...} naming and discard incomplete rows.
        df = (
            df.rename(columns={'content': 'input', 'summary': 'target'})
              .dropna(subset=text_columns)
        )

        # Normalise both text columns: force str and trim surrounding whitespace.
        for column in text_columns:
            df[column] = df[column].astype(str).str.strip()

        # Very short summaries/articles might cause repetition issues; drop them.
        keep_row = (df['target'].str.len() > 10) & (df['input'].str.len() > 50)
        df = df[keep_row]

        logger.info(f"‚úÖ Successfully read {len(df)} samples from TSV file.")
        return df.to_dict('records')

    except FileNotFoundError:
        logger.error(f"‚ùå File not found at path: {file_path}")
        raise
    except Exception as e:
        logger.error(f"‚ùå Error reading TSV file: {e}")
        raise
    


In [None]:
def prepare_datasets(config: dict, tokenizer: T5Tokenizer):
    """Load the TSV data, split it into train/validation sets and tokenize both.

    Args:
        config: Pipeline configuration. Reads 'csv_file_path',
            'validation_split_size', 'random_state', 'max_input_length'
            and 'max_target_length'.
        tokenizer: Tokenizer used for both the source documents and the
            target summaries.

    Returns:
        A ``(tokenized_train_dataset, tokenized_eval_dataset)`` tuple of
        Hugging Face datasets. The original text columns are removed; each
        example carries the tokenizer outputs for the prefixed input plus a
        'labels' list in which padding positions are set to -100.
    """
    logger.info("üõ†Ô∏è  Starting data preparation...")

    # Load and split data
    all_data = load_data_from_tsv(config['csv_file_path'])
    train_data_list, eval_data_list = train_test_split(
        all_data,
        test_size=config['validation_split_size'],
        random_state=config['random_state']
    )
    train_dataset = HFDataset.from_pandas(pd.DataFrame(train_data_list))
    eval_dataset = HFDataset.from_pandas(pd.DataFrame(eval_data_list))
    logger.info(f"‚úÖ Data split: {len(train_dataset)} training samples, {len(eval_dataset)} validation samples.")

    # Preprocessing function
    task_prefix = "summarize: "
    pad_token_id = tokenizer.pad_token_id

    def preprocess_function(examples):
        """Tokenize one batch of raw examples into model inputs and labels."""
        inputs = [task_prefix + doc for doc in examples["input"]]
        model_inputs = tokenizer(
            inputs,
            max_length=config['max_input_length'],
            truncation=True,
            padding="max_length"
        )

        # Tokenize the summaries through `text_target`; this is the supported
        # replacement for the deprecated `tokenizer.as_target_tokenizer()`
        # context manager, which newer transformers releases no longer accept.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=config['max_target_length'],
            truncation=True,
            padding="max_length"
        )

        # Replace padding token id with -100 so padded positions are ignored
        # by the loss calculation.
        model_inputs["labels"] = [
            [(token_id if token_id != pad_token_id else -100) for token_id in label]
            for label in labels["input_ids"]
        ]
        return model_inputs

    # Tokenize both splits, dropping the raw text columns afterwards.
    tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_eval_dataset = eval_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=eval_dataset.column_names
    )
    logger.info("‚úÖ Data tokenized successfully.")

    return tokenized_train_dataset, tokenized_eval_dataset
    

In [54]:
def compute_metrics(eval_pred, tokenizer=None):
    """Compute mean ROUGE-1/2/L F-measures for a batch of predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits of shape (batch, seq, vocab), or a tuple
            whose first element is one of those. ``labels`` are token ids in
            which ignored positions are marked with -100.
        tokenizer: Optional tokenizer used for decoding. When omitted, the
            "VietAI/vit5-base" tokenizer is loaded (the original behaviour).
            Pass the training tokenizer (e.g. via ``functools.partial``) to
            avoid re-loading it on every evaluation.

    Returns:
        dict: ``{'rouge1': ..., 'rouge2': ..., 'rougeL': ...}`` with the mean
        F-measure of each metric; all zeros when there is nothing to score.
    """
    predictions, labels = eval_pred

    if tokenizer is None:
        tokenizer = T5Tokenizer.from_pretrained("VietAI/vit5-base", legacy=False)

    # Some models hand back a tuple (logits, extra outputs...); keep the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Raw logits carry a trailing vocabulary axis; reduce them to token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Replace -100 with the pad token id so both arrays can be decoded.
    pad_token_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # NOTE(review): rouge_score's default tokenization and `use_stemmer=True`
    # appear to be English-oriented - confirm they are appropriate for
    # Vietnamese text before relying on these numbers.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    metric_names = ('rouge1', 'rouge2', 'rougeL')
    collected = {name: [] for name in metric_names}
    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        for name in metric_names:
            collected[name].append(scores[name].fmeasure)

    # Averaging an empty list would produce NaN (plus a warning); report 0.0 instead.
    return {
        name: (float(np.mean(values)) if values else 0.0)
        for name, values in collected.items()
    }

In [None]:
def setup_trainer(model, tokenizer, config, train_dataset, eval_dataset):
    """Configure and initialize the Hugging Face ``Trainer``.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to the data collator and to the trainer.
        config: Pipeline configuration. Reads 'output_dir', 'num_epochs',
            'batch_size' and 'learning_rate'.
        train_dataset: Tokenized training dataset.
        eval_dataset: Tokenized validation dataset.

    Returns:
        A ``Trainer`` that evaluates and saves once per epoch and reloads the
        checkpoint with the lowest ``eval_loss`` when training ends.
    """
    import inspect

    logger.info("‚öôÔ∏è  Configuring Hugging Face Trainer...")

    training_args = TrainingArguments(
        output_dir=config['output_dir'],
        num_train_epochs=config['num_epochs'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"{config['output_dir']}/logs",
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        report_to="none",
        # Additional parameters to improve training stability
        gradient_accumulation_steps=1,
        dataloader_pin_memory=False,
        remove_unused_columns=True,
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True,
        return_tensors="pt"
    )

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=None,  # We'll compute metrics separately
    )

    # Newer transformers releases take the tokenizer as `processing_class`
    # and deprecate (later reject) the `tokenizer=` keyword. Because the
    # notebook installs an unpinned version, pick whichever name the
    # installed Trainer actually declares.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_kwarg] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    return trainer



In [56]:
def generate_summary_with_improved_settings(model, tokenizer, input_text, config, device):
    """Summarize ``input_text`` using the decoding options stored in ``config``.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Callable tokenizer exposing ``decode``, ``pad_token_id``
            and ``eos_token_id``.
        input_text: Raw document; the "summarize: " task prefix is prepended here.
        config: Reads 'max_input_length', 'max_target_length', 'num_beams',
            'repetition_penalty', 'no_repeat_ngram_size', 'length_penalty',
            'early_stopping', 'do_sample' and, only when sampling is enabled,
            'temperature', 'top_k' and 'top_p'.
        device: Device the encoded input ids are moved to.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    # Encode the prefixed prompt, truncated to the configured input length.
    encoded = tokenizer(
        "summarize: " + input_text,
        return_tensors="pt",
        max_length=config['max_input_length'],
        truncation=True,
        padding=True
    )
    input_ids = encoded.input_ids.to(device)

    # Collect every decoding option up front; the sampling-only knobs are
    # passed as None when sampling is switched off.
    generation_kwargs = {
        "max_length": config['max_target_length'],
        "min_length": 30,  # Ensure minimum length
        "num_beams": config['num_beams'],
        "repetition_penalty": config['repetition_penalty'],
        "no_repeat_ngram_size": config['no_repeat_ngram_size'],
        "length_penalty": config['length_penalty'],
        "early_stopping": config['early_stopping'],
        "do_sample": config['do_sample'],
        "temperature": config['temperature'] if config['do_sample'] else None,
        "top_k": config['top_k'] if config['do_sample'] else None,
        "top_p": config['top_p'] if config['do_sample'] else None,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(input_ids, **generation_kwargs)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
def run_training_pipeline() -> None:
    """
    Main function to orchestrate the entire fine-tuning process.

    Steps, in order: build the configuration, load the model and tokenizer,
    prepare the tokenized train/validation datasets, create the trainer,
    train, save the model and tokenizer under ``<output_dir>/final_model``,
    evaluate on the validation set, then reload the saved artifacts and
    print a generated summary for one hard-coded sample article.

    Returns:
        None. Results are reported through the logger and ``print``.
    """
    logger.info("üöÄ STARTING VIT5 FINE-TUNING PIPELINE")
    print("=" * 60)

    # Build every object the training run needs.
    config = setup_config()
    model, tokenizer, device = load_model_and_tokenizer(config['model_name'])
    train_ds, eval_ds = prepare_datasets(config, tokenizer)
    trainer = setup_trainer(model, tokenizer, config, train_ds, eval_ds)
    
    logger.info("üéØ Starting fine-tuning process...")
    trainer.train()
    logger.info("‚úÖ Fine-tuning completed!")
    
    # Persist the model and the tokenizer side by side so the directory can be
    # reloaded with from_pretrained (done below for the smoke test).
    final_model_path = f"{config['output_dir']}/final_model"
    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    logger.info(f"üíæ Best model saved at: {final_model_path}")

    # Report validation loss and runtime; 'N/A' is logged if a key is absent.
    logger.info("üìà Evaluating model on validation set...")
    eval_results = trainer.evaluate()
    logger.info(f"   - Eval Loss: {eval_results.get('eval_loss', 'N/A')}")
    logger.info(f"   - Eval Runtime: {eval_results.get('eval_runtime', 'N/A')}s")

    # Smoke test: summarize one hard-coded sample article.
    logger.info("\nüß™ Testing fine-tuned model:")
    test_input = """ƒê·ªãa ·ªëc First Real d·ª± ki·∫øn ph√°t h√†nh h∆°n 6,4 tri·ªáu c·ªï phi·∫øu th∆∞·ªüng cho c·ªï ƒë√¥ng nh·∫±m tƒÉng v·ªën ƒëi·ªÅu l·ªá, ng√†y ƒëƒÉng k√Ω cu·ªëi c√πng ƒë·ªÉ ph√¢n b·ªï quy·ªÅn l√† 30/7/2025.
C√¥ng ty C·ªï ph·∫ßn ƒê·ªãa ·ªëc First Real (MCK: FIR, s√†n HoSE) v·ª´a c√≥ vƒÉn b·∫£n th√¥ng b√°o v·ªÅ ph√°t h√†nh c·ªï phi·∫øu ƒë·ªÉ tƒÉng v·ªën c·ªï ph·∫ßn t·ª´ ngu·ªìn v·ªën ch·ªß s·ªü h·ªØu.
Theo ƒë√≥, ƒê·ªãa ·ªëc First Real d·ª± ki·∫øn ph√°t h√†nh h∆°n 6,4 tri·ªáu c·ªï phi·∫øu cho c·ªï ƒë√¥ng hi·ªán h·ªØu v·ªõi t·ª∑ l·ªá th·ª±c hi·ªán quy·ªÅn 10:1, t·ª©c c·ªï ƒë√¥ng s·ªü h·ªØu 1 c·ªï phi·∫øu ƒë∆∞·ª£c h∆∞·ªüng 1 quy·ªÅn, c·ª© 10 quy·ªÅn s·∫Ω ƒë∆∞·ª£c nh·∫≠n 1 c·ªï phi·∫øu m·ªõi. Ng√†y ƒëƒÉng k√Ω cu·ªëi c√πng ƒë·ªÉ ph√¢n b·ªï quy·ªÅn l√† 30/7/2025.
T·ªïng gi√° tr·ªã ph√°t h√†nh t√≠nh theo m·ªánh gi√° l√† h∆°n 64,2 t·ª∑ ƒë·ªìng. Ngu·ªìn v·ªën th·ª±c hi·ªán ƒë∆∞·ª£c l·∫•y t·ª´ ngu·ªìn th·∫∑ng d∆∞ v·ªën c·ªï ph·∫ßn c·ªßa c√¥ng ty theo b√°o c√°o t√†i ch√≠nh nƒÉm 2024 ƒë√£ ki·ªÉm to√°n.
·∫¢nh minh h·ªça
N·∫øu ƒë·ª£t ph√°t h√†nh th√†nh c√¥ng, s·ªë l∆∞·ª£ng c·ªï phi·∫øu ƒë√£ ph√°t h√†nh c·ªßa ƒê·ªãa ·ªëc First Real s·∫Ω tƒÉng t·ª´ h∆°n 64,2 tri·ªáu c·ªï phi·∫øu l√™n g·∫ßn 70,7 tri·ªáu c·ªï phi·∫øu, t∆∞∆°ng ƒë∆∞∆°ng v·ªën ƒëi·ªÅu l·ªá tƒÉng t·ª´ g·∫ßn 642,5 t·ª∑ ƒë·ªìng l√™n g·∫ßn 706,7 t·ª∑ ƒë·ªìng.
ƒê∆∞·ª£c bi·∫øt, ph∆∞∆°ng √°n ph√°t h√†nh c·ªï phi·∫øu n√†y ƒë√£ ƒë∆∞·ª£c c·ªï ƒë√¥ng c·ªßa ƒê·ªãa ·ªëc First Real th√¥ng qua t·∫°i ƒê·∫°i h·ªôi ƒë·ªìng c·ªï ƒë√¥ng (ƒêHƒêCƒê) th∆∞·ªùng ni√™n 2025 ƒë∆∞·ª£c t·ªï ch·ª©c ng√†y 21/3/2025."""
        
    # Reload from disk rather than reusing the in-memory objects, so the test
    # exercises the files that were just saved.
    loaded_tokenizer = T5Tokenizer.from_pretrained(final_model_path, legacy=False)
    loaded_model = T5ForConditionalGeneration.from_pretrained(final_model_path).to(device)

    # Test with improved generation settings
    prediction = generate_summary_with_improved_settings(
        loaded_model, loaded_tokenizer, test_input, config, device
    )

    # Show the first 200 characters of the input next to the generated summary.
    print("-" * 60)
    print(f"üìå Input: {test_input[:200]}...")
    print(f"üí° Improved Output: {prediction}")
    print("-" * 60)

if __name__ == "__main__":
    # Entry point: run the full pipeline. Any failure is reported through the
    # logger and followed by the complete traceback instead of propagating.
    import traceback

    try:
        run_training_pipeline()
    except Exception as e:
        logger.error(f"‚ùå Critical error during execution: {e}")
        traceback.print_exc()


    



Map:   0%|          | 0/4684 [00:00<?, ? examples/s]

Map:   0%|          | 0/827 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.2346,1.127015
2,1.032,1.097187
3,0.9267,1.102287


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


------------------------------------------------------------
üìå Input: ƒê·ªãa ·ªëc First Real d·ª± ki·∫øn ph√°t h√†nh h∆°n 6,4 tri·ªáu c·ªï phi·∫øu th∆∞·ªüng cho c·ªï ƒë√¥ng nh·∫±m tƒÉng v·ªën ƒëi·ªÅu l·ªá, ng√†y ƒëƒÉng k√Ω cu·ªëi c√πng ƒë·ªÉ ph√¢n b·ªï quy·ªÅn l√† 30/7/2025.
C√¥ng ty C·ªï ph·∫ßn ƒê·ªãa ·ªëc First Real (MCK: FIR, ...
üí° Improved Output: ƒê·ªãa ·ªëc First Real d·ª± ki·∫øn ph√°t h√†nh h∆°n 6,4 tri·ªáu c·ªï phi·∫øu th∆∞·ªüng cho c·ªï ƒë√¥ng ƒë·ªÉ tƒÉng v·ªën ƒëi·ªÅu l·ªá, ng√†y ƒëƒÉng k√Ω cu·ªëi c√πng l√† 30/7/2025. T·ªïng gi√° tr·ªã ph√°t h√†nh l√† h∆°n 64,2 t·ª∑ ƒë·ªìng, ƒë∆∞·ª£c l·∫•y t·ª´ th·∫∑ng d∆∞ v·ªën c·ªï ph·∫ßn c·ªßa c√¥ng ty. Ph∆∞∆°ng √°n n√†y ƒë√£ ƒë∆∞·ª£c c·ªï ƒë√¥ng th√¥ng qua t·∫°i ƒê·∫°i h·ªôi ƒë·ªìng c·ªï ƒë√¥ng th∆∞·ªùng ni√™n nƒÉm 2015. D·ª± ki·∫øn s·ªë l∆∞·ª£ng c·ªï phi·∫øu ƒë√£ ph√°t h√†nh s·∫Ω tƒÉng t·ª´ g·∫ßn 642,5 t·ª∑ ƒë·ªìng l√™n g·∫ßn 706,7 t·ª∑ ƒë·ªìng.
------------------------------------------------------------


In [None]:
!zip -r vit5_summarization_finetuned.zip /kaggle/working/vit5_summarization_finetuned