In [1]:
# Install required packages
!pip install transformers
!pip install pyvi
!pip install pandas
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install torch
!pip install openpyxl

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsuite-0.5

In [2]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyvi import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
    roc_curve, 
    precision_recall_curve,
    auc,
    average_precision_score,
    cohen_kappa_score,
    matthews_corrcoef,
    log_loss,
    hamming_loss,
    jaccard_score,
    top_k_accuracy_score
)
from sklearn.preprocessing import label_binarize
from sklearn.utils import resample
import logging
from datetime import datetime
import unicodedata
import zipfile
from IPython.display import FileLink, display

import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

In [3]:
# Set up logging
log_dir = '/kaggle/working/'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'training_title_sentiment_log_v1.txt')

In [4]:
# Remove existing log file if it exists
if os.path.exists(log_file):
    try:
        os.remove(log_file)
    except Exception as e:
        print(f"Could not remove existing log file: {e}")

# Configure logging
try:
    logging.getLogger().handlers.clear()
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)
    logging.basicConfig(
        level=logging.INFO,
        handlers=[file_handler, console_handler],
        force=True
    )
    logging.info("Logging initialized successfully")
    print(f"Log file will be saved to: {log_file}")
except Exception as e:
    print(f"Failed to initialize logging to file: {e}. Using console logging only.")
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[console_handler]
    )

Log file will be saved to: /kaggle/working/training_title_sentiment_log_v1.txt


In [5]:
# Seed everything for reproducibility
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed_everything(86)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Configuration
EPOCHS = 10
BATCH_SIZE = 16
MAX_LEN = 120  
N_SPLITS = 5

In [6]:
# Phase 1: Load Data
def load_original_data(xlsx_path):
    original_df = pd.read_excel(xlsx_path)
    df_processed = original_df[['title', 'sentiment']].copy()
    logging.info(f"Original data loaded with shape: {original_df.shape}")
    assert original_df.equals(pd.read_excel(xlsx_path)), "Original data modified!"
    return df_processed

# Phase 2: Data Cleaning
def clean_data(df):
    # Check for NaN
    nan_rows = df[df['title'].isna() | df['sentiment'].isna()]
    if not nan_rows.empty:
        df = df.dropna(subset=['title', 'sentiment'])
        logging.info(f"Removed {len(nan_rows)} rows with NaN values")

    # Remove duplicate titles
    initial_rows = len(df)
    df = df.drop_duplicates(subset=['title'])
    logging.info(f"Removed {initial_rows - len(df)} duplicate titles")

    # Remove short titles
    df = df[df['title'].str.len() >= 5]
    logging.info(f"Removed {initial_rows - len(df)} titles shorter than 5 characters")

    # Ensure consistent sentiment labels
    valid_labels = {'Positive', 'Negative', 'Neutral'}
    invalid_labels = set(df['sentiment']) - valid_labels
    if invalid_labels:
        raise ValueError(f"Invalid sentiment labels found: {invalid_labels}")
    df['sentiment'] = df['sentiment'].str.strip()
    logging.info("Sentiment labels checked and stripped")
    return df

# Phase 3: Text Normalization
def normalize_text(text):
    text = unicodedata.normalize('NFC', str(text))
    text = ' '.join(text.split())
    return text

# Phase 4: Label Processing and Balancing
def process_labels(df):
    label2id = {'Positive': 0, 'Negative': 1, 'Neutral': 2}
    id2label = {v: k for k, v in label2id.items()}
    df['sentiment'] = df['sentiment'].map(label2id)

    # Check class distribution
    distribution = df['sentiment'].value_counts().sort_index()
    logging.info(f"Class distribution before balancing: {distribution.to_dict()}")
    if distribution.min() / distribution.max() < 0.5:
        logging.warning("Significant class imbalance detected!")

    # Balance dataset using undersampling
    min_count = distribution.min()
    balanced_df = pd.concat([resample(df[df['sentiment'] == label], 
                           replace=False, n_samples=min_count, random_state=86)
                           for label in range(3)])
    balanced_df = balanced_df.sample(frac=1, random_state=86).reset_index(drop=True)
    logging.info(f"Class distribution after balancing: {balanced_df['sentiment'].value_counts().sort_index().to_dict()}")
    return balanced_df, label2id, id2label

# Phase 5: Tokenization
def tokenize_data(df, tokenizer):
    df['text'] = df['title'].apply(normalize_text)
    df['tokenized'] = df['text'].apply(lambda x: tokenizer.encode_plus(
        x, max_length=MAX_LEN, padding='max_length', truncation=True,
        return_tensors='pt', add_special_tokens=True)['input_ids'].squeeze().numpy())
    logging.info(f"Tokenization completed with max_length={MAX_LEN}")
    return df

In [7]:
# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=MAX_LEN):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row['text']
        label = row['sentiment']

        encoding = self.tokenizer.encode_plus(
            text, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt', add_special_tokens=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

In [8]:
# Model Definition
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes=3):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        return self.fc(output)

In [9]:
# Training and Evaluation Functions
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        _, preds = torch.max(outputs, dim=1)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return np.mean(losses), correct_predictions.double() / len(data_loader.dataset)

def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            _, preds = torch.max(outputs, dim=1)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return np.mean(losses), correct_predictions.double() / len(data_loader.dataset)

def prepare_loaders(train_df, val_df, test_df, tokenizer):
    train_dataset = SentimentDataset(train_df, tokenizer)
    val_dataset = SentimentDataset(val_df, tokenizer)
    test_dataset = SentimentDataset(test_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, val_loader, test_loader

In [10]:
# Enhanced Evaluation Function with all metrics
def evaluate_model(model, data_loader, id2label):
    model.eval()
    all_preds = []
    all_targets = []
    all_probs = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            outputs = model(input_ids, attention_mask)
            probs = torch.nn.functional.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
    
    # Use numeric labels instead of strings for metrics calculation
    classes = list(id2label.values())
    class_ids = list(id2label.keys())
    
    # ==================== PER-CLASS EVALUATION ====================
    print("\n" + "="*60)
    print("DETAILED PER-CLASS EVALUATION METRICS")
    print("="*60)
    
    # Convert to string labels only for display
    all_pred_labels = [id2label[p] for p in all_preds]
    all_true_labels = [id2label[t] for t in all_targets]
    
    report = classification_report(
        all_true_labels, 
        all_pred_labels, 
        target_names=classes,
        digits=4,
        output_dict=True
    )
    
    class_metrics = pd.DataFrame(report).transpose().drop(['accuracy', 'macro avg', 'weighted avg'])
    print("\nPer-class metrics:")
    print(class_metrics.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    # ==================== OVERALL EVALUATION ====================
    print("\n" + "="*60)
    print("COMPREHENSIVE OVERALL EVALUATION METRICS")
    print("="*60)
    
    # Use numeric labels (all_targets and all_preds) to calculate metrics
    accuracy = accuracy_score(all_targets, all_preds)
    balanced_accuracy = balanced_accuracy_score(all_targets, all_preds)
    f1_macro = f1_score(all_targets, all_preds, average='macro')
    f1_weighted = f1_score(all_targets, all_preds, average='weighted')
    kappa = cohen_kappa_score(all_targets, all_preds)
    mcc = matthews_corrcoef(all_targets, all_preds)
    lloss = log_loss(all_targets, all_probs, labels=class_ids)
    h_loss = hamming_loss(all_targets, all_preds)
    jaccard = jaccard_score(all_targets, all_preds, average='weighted')
    
    try:
        top2_acc = top_k_accuracy_score(all_targets, all_probs, k=2)
        top3_acc = top_k_accuracy_score(all_targets, all_probs, k=3)
    except:
        top2_acc = top3_acc = None
    
    # ROC AUC calculation
    try:
        if len(classes) == 2:
            roc_auc = roc_auc_score(all_targets, [p[1] for p in all_probs])
        else:
            y_true_bin = label_binarize(all_targets, classes=class_ids)
            roc_auc = roc_auc_score(y_true_bin, all_probs, multi_class='ovr')
    except Exception as e:
        print(f"Could not calculate ROC AUC: {str(e)}")
        roc_auc = None
    
    # Precision-Recall AUC
    try:
        if len(classes) == 2:
            precision, recall, _ = precision_recall_curve(all_targets, [p[1] for p in all_probs])
            pr_auc = auc(recall, precision)
        else:
            pr_auc = average_precision_score(
                label_binarize(all_targets, classes=class_ids),
                all_probs,
                average='macro'
            )
    except Exception as e:
        print(f"Could not calculate PR AUC: {str(e)}")
        pr_auc = None
    
    # Display overall metrics in a comprehensive table
    overall_metrics = {
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_accuracy,
        'Macro F1': f1_macro,
        'Weighted F1': f1_weighted,
        'Macro Precision': report['macro avg']['precision'],
        'Macro Recall': report['macro avg']['recall'],
        'Cohen Kappa': kappa,
        'Matthews Corr Coef': mcc,
        'Log Loss': lloss,
        'Hamming Loss': h_loss,
        'Jaccard Score': jaccard,
    }
    
    if roc_auc is not None:
        overall_metrics['ROC AUC'] = roc_auc
    if pr_auc is not None:
        overall_metrics['PR AUC'] = pr_auc
    if top2_acc is not None:
        overall_metrics['Top-2 Accuracy'] = top2_acc
        overall_metrics['Top-3 Accuracy'] = top3_acc
    
    overall_df = pd.DataFrame.from_dict(overall_metrics, orient='index', columns=['Value'])
    print("\nOverall metrics:")
    print(overall_df.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    # ==================== VISUALIZATIONS ====================
    print("\n" + "="*60)
    print("EVALUATION VISUALIZATIONS")
    print("="*60)
    
    # 1. Confusion matrix
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=classes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    confusion_matrix_path = os.path.join(log_dir, 'confusion_matrix.png')
    plt.savefig(confusion_matrix_path)
    plt.close()
    print(f"\nConfusion matrix saved to {confusion_matrix_path}")
    
    # 2. Precision-Recall Curve
    plt.figure(figsize=(12, 10))
    if len(classes) == 2:
        precision, recall, _ = precision_recall_curve(all_targets, [p[1] for p in all_probs])
        plt.plot(recall, precision, lw=2, label=f'PR Curve (AUC = {pr_auc:.2f})')
    else:
        for i, class_name in enumerate(classes):
            precision, recall, _ = precision_recall_curve(
                (np.array(all_targets) == class_ids[i]).astype(int),
                np.array(all_probs)[:, i]
            )
            plt.plot(recall, precision, lw=2, label=f'{class_name}')
    
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='best')
    pr_curve_path = os.path.join(log_dir, 'precision_recall_curve.png')
    plt.savefig(pr_curve_path)
    plt.close()
    print(f"Precision-Recall curve saved to {pr_curve_path}")
    
    # 3. ROC Curve (for binary or multiclass)
    if roc_auc is not None:
        plt.figure(figsize=(12, 10))
        if len(classes) == 2:
            fpr, tpr, _ = roc_curve(all_targets, [p[1] for p in all_probs])
            plt.plot(fpr, tpr, lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
        else:
            y_true_bin = label_binarize(all_targets, classes=class_ids)
            for i, class_name in enumerate(classes):
                fpr, tpr, _ = roc_curve(y_true_bin[:, i], np.array(all_probs)[:, i])
                plt.plot(fpr, tpr, lw=2, label=f'{class_name}')
        
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='best')
        roc_curve_path = os.path.join(log_dir, 'roc_curve.png')
        plt.savefig(roc_curve_path)
        plt.close()
        print(f"ROC curve saved to {roc_curve_path}")
    
    # Log all metrics
    logging.info("\n=== DETAILED PER-CLASS EVALUATION METRICS ===")
    logging.info("\nPer-class metrics:\n" + class_metrics.to_markdown(tablefmt="grid", floatfmt=".4f"))
    logging.info("\n=== COMPREHENSIVE OVERALL EVALUATION METRICS ===")
    logging.info("\nOverall metrics:\n" + overall_df.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    return {
        'class_metrics': class_metrics,
        'overall_metrics': overall_metrics,
        'confusion_matrix': cm,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'visualizations': {
            'confusion_matrix': confusion_matrix_path,
            'pr_curve': pr_curve_path,
            'roc_curve': roc_curve_path if roc_auc is not None else None
        }
    }

In [11]:
# Train and Evaluate with Cross-Validation
def train_and_evaluate(df, test_df, tokenizer):
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=86)
    best_accuracy = 0
    best_model_path = None
    
    for fold, (train_index, val_index) in enumerate(skf.split(df, df['sentiment'])):
        print(f'\nFold {fold + 1}/{N_SPLITS}')
        train_df = df.iloc[train_index].reset_index(drop=True)
        val_df = df.iloc[val_index].reset_index(drop=True)
        train_loader, val_loader, _ = prepare_loaders(train_df, val_df, test_df, tokenizer)
        
        model = SentimentClassifier().to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = AdamW(model.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS)
        
        for epoch in range(EPOCHS):
            train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
            val_loss, val_acc = eval_model(model, val_loader, criterion, device)
            logging.info(f"Fold {fold+1}, Epoch {epoch+1}/{EPOCHS} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
            print(f"Epoch {epoch+1}/{EPOCHS} - Val Acc: {val_acc:.4f}")
            
            if val_acc > best_accuracy:
                best_accuracy = val_acc
                best_model_path = os.path.join(log_dir, f'PhoBERT_sentiment_temp.bin')
                torch.save(model.state_dict(), best_model_path)
                logging.info(f"Saved best model at fold {fold+1}, epoch {epoch+1} with accuracy {val_acc:.4f}")
    
    model.load_state_dict(torch.load(best_model_path))
    os.remove(best_model_path)
    final_model_path = os.path.join(log_dir, 'PhoBERT_title_sentiment_v1.bin')
    torch.save(model.state_dict(), final_model_path)
    logging.info(f"Final best model saved as PhoBERT_title_sentiment_v1.bin with accuracy {best_accuracy:.4f}")
    return model

In [12]:
# Inference Function
def predict_sentiment(text, model, tokenizer, id2label):
    dataset = SentimentDataset(pd.DataFrame({'text': [text], 'sentiment': [0]}), tokenizer)
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            _, pred = torch.max(outputs, dim=1)
    return id2label[pred.item()]

In [13]:
# Zip output files for download
def zip_and_download_output_files():
    output_zip = os.path.join(log_dir, 'output_files.zip')
    output_files = [
        'training_title_sentiment_log_v1.txt',
        'data_processed_for_title_sentiment.csv',
        'PhoBERT_title_sentiment_v1.bin',
        'confusion_matrix.png',
        'precision_recall_curve.png',
        'roc_curve.png'
    ]
    
    # Only include existing files
    existing_files = [f for f in output_files if os.path.exists(os.path.join(log_dir, f))]
    missing_files = set(output_files) - set(existing_files)
    
    if missing_files:
        logging.warning(f"Missing files: {missing_files}")
    
    try:
        with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in existing_files:
                file_path = os.path.join(log_dir, file)
                zipf.write(file_path, file)
                logging.info(f"Added {file} to zip archive")
        
        if os.path.exists(output_zip):
            logging.info(f"Zip file created at {output_zip}")
            
            # Automatic download
            print("\n" + "="*60)
            print("AUTOMATICALLY DOWNLOADING OUTPUT FILES")
            print("="*60)
            
            # For Kaggle
            if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
                print("In Kaggle environment, please download manually:")
                display(FileLink('output_files.zip'))
            # For Google Colab
            else:
                print("Downloading output files automatically...")
                from google.colab import files
                files.download(output_zip)
            
            return output_zip
        else:
            logging.error("Failed to create zip file")
            return None
    except Exception as e:
        logging.error(f"Error creating zip: {e}")
        return None

In [14]:
# Main Execution
if __name__ == "__main__":
    try:
        # Load and preprocess data
        xlsx_path = '/kaggle/input/data-title-sentiment/Data_title_sentiment.xlsx'
        if not os.path.exists(xlsx_path):
            raise FileNotFoundError(f"File {xlsx_path} not found!")
        print(f"Found data file at {xlsx_path}")
        print("Loading data...")
        df_processed = load_original_data(xlsx_path)
        print("Cleaning data...")
        df_processed = clean_data(df_processed)
        print("Processing labels...")
        df_processed, label2id, id2label = process_labels(df_processed)
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
        print("Tokenizer loaded successfully.")
        print("Tokenizing data...")
        df_processed = tokenize_data(df_processed, tokenizer)

        # Save Processed Data
        processed_data_path = os.path.join(log_dir, 'data_processed_for_title_sentiment.csv')
        df_processed.to_csv(processed_data_path, index=False)
        logging.info(f"Saved processed data with shape: {df_processed.shape}")

        # Statistics and Log
        summary_lengths = df_processed['text'].str.len()
        token_lengths = [len(t) for t in df_processed['tokenized']]
        logging.info(f"Summary length stats: Min={summary_lengths.min()}, Max={summary_lengths.max()}, Mean={summary_lengths.mean():.2f}")
        logging.info(f"Token length stats: Min={min(token_lengths)}, Max={max(token_lengths)}, Mean={np.mean(token_lengths):.2f}")
        logging.info(f"Processing config: max_len={MAX_LEN}, epochs={EPOCHS}, batch_size={BATCH_SIZE}")

        # Split into train+val and test
        train_val_df, test_df = train_test_split(df_processed, test_size=0.2, stratify=df_processed['sentiment'], random_state=86)
        logging.info(f"Data split: Train+Val={len(train_val_df)}, Test={len(test_df)}")

        # Train and evaluate
        print("Starting training...")
        model = train_and_evaluate(train_val_df, test_df, tokenizer)

        # Evaluate on test set
        print("Evaluating model...")
        _, _, test_loader = prepare_loaders(train_val_df, train_val_df, test_df, tokenizer)
        evaluation_results = evaluate_model(model, test_loader, id2label)

        # Test inference
        sample_text = "Apple ra mắt iPhone 17 với chip A19 vào tháng 9."
        predicted_sentiment = predict_sentiment(sample_text, model, tokenizer, id2label)
        print(f"\nSample text: {sample_text}")
        print(f"Predicted sentiment: {predicted_sentiment}")
        logging.info(f"Inference test: Text='{sample_text}', Predicted='{predicted_sentiment}'")

        # Zip and automatically download output files
        print("\nPreparing output files for download...")
        zip_and_download_output_files()

    except Exception as e:
        logging.error(f"Error in main execution: {e}", exc_info=True)
        raise

Found data file at /kaggle/input/data-title-sentiment/Data_title_sentiment.xlsx
Loading data...




Cleaning data...
Processing labels...
Loading tokenizer...


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer loaded successfully.
Tokenizing data...
Starting training...

Fold 1/5


2025-07-01 20:02:21.137934: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751400141.364615      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751400141.425870      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch 1/10 - Val Acc: 0.5762
Epoch 2/10 - Val Acc: 0.6320
Epoch 3/10 - Val Acc: 0.6208
Epoch 4/10 - Val Acc: 0.6134
Epoch 5/10 - Val Acc: 0.6320
Epoch 6/10 - Val Acc: 0.5985
Epoch 7/10 - Val Acc: 0.6171
Epoch 8/10 - Val Acc: 0.6115
Epoch 9/10 - Val Acc: 0.6022
Epoch 10/10 - Val Acc: 0.6022

Fold 2/5
Epoch 1/10 - Val Acc: 0.5688
Epoch 2/10 - Val Acc: 0.6208
Epoch 3/10 - Val Acc: 0.6022
Epoch 4/10 - Val Acc: 0.5967
Epoch 5/10 - Val Acc: 0.6227
Epoch 6/10 - Val Acc: 0.6413
Epoch 7/10 - Val Acc: 0.6338
Epoch 8/10 - Val Acc: 0.6264
Epoch 9/10 - Val Acc: 0.6320
Epoch 10/10 - Val Acc: 0.6208

Fold 3/5
Epoch 1/10 - Val Acc: 0.5855
Epoch 2/10 - Val Acc: 0.6097
Epoch 3/10 - Val Acc: 0.5911
Epoch 4/10 - Val Acc: 0.5985
Epoch 5/10 - Val Acc: 0.6004
Epoch 6/10 - Val Acc: 0.5948
Epoch 7/10 - Val Acc: 0.5967
Epoch 8/10 - Val Acc: 0.5911
Epoch 9/10 - Val Acc: 0.5836
Epoch 10/10 - Val Acc: 0.5836

Fold 4/5
Epoch 1/10 - Val Acc: 0.5829
Epoch 2/10 - Val Acc: 0.5829
Epoch 3/10 - Val Acc: 0.6052
Epoch 4/10




Confusion matrix saved to /kaggle/working/confusion_matrix.png
Precision-Recall curve saved to /kaggle/working/precision_recall_curve.png
ROC curve saved to /kaggle/working/roc_curve.png

Sample text: Apple ra mắt iPhone 17 với chip A19 vào tháng 9.
Predicted sentiment: Positive

Preparing output files for download...

AUTOMATICALLY DOWNLOADING OUTPUT FILES
In Kaggle environment, please download manually:
