In [1]:
# Install required packages
!pip install transformers
!pip install pyvi
!pip install pandas
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install torch
!pip install openpyxl

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsui

In [5]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyvi import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
    roc_curve, 
    precision_recall_curve,
    auc,
    average_precision_score,
    cohen_kappa_score,
    matthews_corrcoef,
    log_loss,
    hamming_loss,
    jaccard_score,
    top_k_accuracy_score
)
from sklearn.preprocessing import label_binarize
from sklearn.utils import resample
import logging
from datetime import datetime
import unicodedata
import zipfile
from IPython.display import FileLink, display

import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import optuna

In [6]:
# Set up logging
log_dir = '/kaggle/working/'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'training_summary_sentiment_log_edit_1807.txt')

In [7]:
# Remove existing log file if it exists
if os.path.exists(log_file):
    try:
        os.remove(log_file)
    except Exception as e:
        print(f"Could not remove existing log file: {e}")

# Configure logging
try:
    logging.getLogger().handlers.clear()
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)
    logging.basicConfig(
        level=logging.INFO,
        handlers=[file_handler, console_handler],
        force=True
    )
    logging.info("Logging initialized successfully")
    print(f"Log file will be saved to: {log_file}")
except Exception as e:
    print(f"Failed to initialize logging to file: {e}. Using console logging only.")
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[console_handler]
    )

Log file will be saved to: /kaggle/working/training_summary_sentiment_log_edit_1807.txt


In [8]:
# Seed everything for reproducibility
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed_everything(86)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Configuration
MAX_LEN = 256 
N_SPLITS = 3

In [9]:
# Phase 1: Load Data
def load_original_data(xlsx_path):
    original_df = pd.read_excel(xlsx_path)
    df_processed = original_df[['summary', 'sentiment']].copy()
    logging.info(f"Original data loaded with shape: {original_df.shape}")
    assert original_df.equals(pd.read_excel(xlsx_path)), "Original data modified!"
    return df_processed

# Phase 2: Data Cleaning
def clean_data(df):
    nan_rows = df[df['summary'].isna() | df['sentiment'].isna()]
    if not nan_rows.empty:
        df = df.dropna(subset=['summary', 'sentiment'])
        logging.info(f"Removed {len(nan_rows)} rows with NaN values")
    initial_rows = len(df)
    df = df.drop_duplicates(subset=['summary'])
    logging.info(f"Removed {initial_rows - len(df)} duplicate summaries")
    df = df[df['summary'].str.len() >= 5]
    logging.info(f"Removed {initial_rows - len(df)} summaries shorter than 5 characters")
    valid_labels = {'Positive', 'Negative', 'Neutral'}
    invalid_labels = set(df['sentiment']) - valid_labels
    if invalid_labels:
        raise ValueError(f"Invalid sentiment labels found: {invalid_labels}")
    df['sentiment'] = df['sentiment'].str.strip()
    logging.info("Sentiment labels checked and stripped")
    return df

# Phase 3: Text Normalization
def normalize_text(text):
    text = unicodedata.normalize('NFC', str(text))
    text = ' '.join(text.split())
    return text

# Phase 4: Label Processing (with class weighting)
def process_labels(df):
    label2id = {'Positive': 0, 'Negative': 1, 'Neutral': 2}
    id2label = {v: k for k, v in label2id.items()}
    df['sentiment'] = df['sentiment'].map(label2id)
    distribution = df['sentiment'].value_counts().sort_index()
    logging.info(f"Class distribution: {distribution.to_dict()}")
    if distribution.min() / distribution.max() < 0.5:
        logging.warning("Significant class imbalance detected!")
    class_weights = torch.tensor([1.0 / count for count in distribution]).to(device)
    return df, label2id, id2label, class_weights

# Phase 5: Tokenization
def tokenize_data(df, tokenizer):
    df['text'] = df['summary'].apply(normalize_text)
    df['tokenized'] = df['text'].apply(lambda x: tokenizer.encode_plus(
        x, max_length=MAX_LEN, padding='max_length', truncation=True,
        return_tensors='pt', add_special_tokens=True)['input_ids'].squeeze().numpy())
    logging.info(f"Tokenization completed with max_length={MAX_LEN}")
    return df
# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=MAX_LEN):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row['text']
        label = row['sentiment']

        encoding = self.tokenizer.encode_plus(
            text, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt', add_special_tokens=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Model Definition
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes=3, dropout_rate=0.3):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=dropout_rate)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        return self.fc(output)



In [11]:
# Training and Evaluation Functions
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        _, preds = torch.max(outputs, dim=1)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return np.mean(losses), correct_predictions.double() / len(data_loader.dataset)

def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            _, preds = torch.max(outputs, dim=1)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return np.mean(losses), correct_predictions.double() / len(data_loader.dataset)

def prepare_loaders(train_df, val_df, test_df, tokenizer, batch_size):
    train_dataset = SentimentDataset(train_df, tokenizer)
    val_dataset = SentimentDataset(val_df, tokenizer)
    test_dataset = SentimentDataset(test_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, val_loader, test_loader

In [12]:
# Enhanced Evaluation Function with all metrics
def evaluate_model(model, data_loader, id2label):
    model.eval()
    all_preds = []
    all_targets = []
    all_probs = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            outputs = model(input_ids, attention_mask)
            probs = torch.nn.functional.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
    
    # Use numeric labels instead of strings for metrics calculation
    classes = list(id2label.values())
    class_ids = list(id2label.keys())
    
    # ==================== PER-CLASS EVALUATION ====================
    print("\n" + "="*60)
    print("DETAILED PER-CLASS EVALUATION METRICS")
    print("="*60)
    
    # Convert to string labels only for display
    all_pred_labels = [id2label[p] for p in all_preds]
    all_true_labels = [id2label[t] for t in all_targets]
    
    report = classification_report(
        all_true_labels, 
        all_pred_labels, 
        target_names=classes,
        digits=4,
        output_dict=True
    )
    
    class_metrics = pd.DataFrame(report).transpose().drop(['accuracy', 'macro avg', 'weighted avg'])
    print("\nPer-class metrics:")
    print(class_metrics.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    # ==================== OVERALL EVALUATION ====================
    print("\n" + "="*60)
    print("COMPREHENSIVE OVERALL EVALUATION METRICS")
    print("="*60)
    
    # Use numeric labels (all_targets and all_preds) to calculate metrics
    accuracy = accuracy_score(all_targets, all_preds)
    balanced_accuracy = balanced_accuracy_score(all_targets, all_preds)
    f1_macro = f1_score(all_targets, all_preds, average='macro')
    f1_weighted = f1_score(all_targets, all_preds, average='weighted')
    kappa = cohen_kappa_score(all_targets, all_preds)
    mcc = matthews_corrcoef(all_targets, all_preds)
    lloss = log_loss(all_targets, all_probs, labels=class_ids)
    h_loss = hamming_loss(all_targets, all_preds)
    jaccard = jaccard_score(all_targets, all_preds, average='weighted')
    
    try:
        top2_acc = top_k_accuracy_score(all_targets, all_probs, k=2)
        top3_acc = top_k_accuracy_score(all_targets, all_probs, k=3)
    except:
        top2_acc = top3_acc = None
    
    # ROC AUC calculation
    try:
        if len(classes) == 2:
            roc_auc = roc_auc_score(all_targets, [p[1] for p in all_probs])
        else:
            y_true_bin = label_binarize(all_targets, classes=class_ids)
            roc_auc = roc_auc_score(y_true_bin, all_probs, multi_class='ovr')
    except Exception as e:
        print(f"Could not calculate ROC AUC: {str(e)}")
        roc_auc = None
    
    # Precision-Recall AUC
    try:
        if len(classes) == 2:
            precision, recall, _ = precision_recall_curve(all_targets, [p[1] for p in all_probs])
            pr_auc = auc(recall, precision)
        else:
            pr_auc = average_precision_score(
                label_binarize(all_targets, classes=class_ids),
                all_probs,
                average='macro'
            )
    except Exception as e:
        print(f"Could not calculate PR AUC: {str(e)}")
        pr_auc = None
    
    # Display overall metrics in a comprehensive table
    overall_metrics = {
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_accuracy,
        'Macro F1': f1_macro,
        'Weighted F1': f1_weighted,
        'Macro Precision': report['macro avg']['precision'],
        'Macro Recall': report['macro avg']['recall'],
        'Cohen Kappa': kappa,
        'Matthews Corr Coef': mcc,
        'Log Loss': lloss,
        'Hamming Loss': h_loss,
        'Jaccard Score': jaccard,
    }
    
    if roc_auc is not None:
        overall_metrics['ROC AUC'] = roc_auc
    if pr_auc is not None:
        overall_metrics['PR AUC'] = pr_auc
    if top2_acc is not None:
        overall_metrics['Top-2 Accuracy'] = top2_acc
        overall_metrics['Top-3 Accuracy'] = top3_acc
    
    overall_df = pd.DataFrame.from_dict(overall_metrics, orient='index', columns=['Value'])
    print("\nOverall metrics:")
    print(overall_df.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    # ==================== VISUALIZATIONS ====================
    print("\n" + "="*60)
    print("EVALUATION VISUALIZATIONS")
    print("="*60)
    
    # 1. Confusion matrix
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=classes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    confusion_matrix_path = os.path.join(log_dir, 'confusion_matrix.png')
    plt.savefig(confusion_matrix_path)
    plt.close()
    print(f"\nConfusion matrix saved to {confusion_matrix_path}")
    
    # 2. Precision-Recall Curve
    plt.figure(figsize=(12, 10))
    if len(classes) == 2:
        precision, recall, _ = precision_recall_curve(all_targets, [p[1] for p in all_probs])
        plt.plot(recall, precision, lw=2, label=f'PR Curve (AUC = {pr_auc:.2f})')
    else:
        for i, class_name in enumerate(classes):
            precision, recall, _ = precision_recall_curve(
                (np.array(all_targets) == class_ids[i]).astype(int),
                np.array(all_probs)[:, i]
            )
            plt.plot(recall, precision, lw=2, label=f'{class_name}')
    
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='best')
    pr_curve_path = os.path.join(log_dir, 'precision_recall_curve.png')
    plt.savefig(pr_curve_path)
    plt.close()
    print(f"Precision-Recall curve saved to {pr_curve_path}")
    
    # 3. ROC Curve (for binary or multiclass)
    if roc_auc is not None:
        plt.figure(figsize=(12, 10))
        if len(classes) == 2:
            fpr, tpr, _ = roc_curve(all_targets, [p[1] for p in all_probs])
            plt.plot(fpr, tpr, lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
        else:
            y_true_bin = label_binarize(all_targets, classes=class_ids)
            for i, class_name in enumerate(classes):
                fpr, tpr, _ = roc_curve(y_true_bin[:, i], np.array(all_probs)[:, i])
                plt.plot(fpr, tpr, lw=2, label=f'{class_name}')
        
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='best')
        roc_curve_path = os.path.join(log_dir, 'roc_curve.png')
        plt.savefig(roc_curve_path)
        plt.close()
        print(f"ROC curve saved to {roc_curve_path}")
    
    # Log all metrics
    logging.info("\n=== DETAILED PER-CLASS EVALUATION METRICS ===")
    logging.info("\nPer-class metrics:\n" + class_metrics.to_markdown(tablefmt="grid", floatfmt=".4f"))
    logging.info("\n=== COMPREHENSIVE OVERALL EVALUATION METRICS ===")
    logging.info("\nOverall metrics:\n" + overall_df.to_markdown(tablefmt="grid", floatfmt=".4f"))
    
    return {
        'class_metrics': class_metrics,
        'overall_metrics': overall_metrics,
        'confusion_matrix': cm,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'visualizations': {
            'confusion_matrix': confusion_matrix_path,
            'pr_curve': pr_curve_path,
            'roc_curve': roc_curve_path if roc_auc is not None else None
        }
    }

In [13]:
# Train and Evaluate with Optuna
def train_and_evaluate(df, test_df, tokenizer, class_weights):
    def objective(trial):
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        batch_size = trial.suggest_categorical("batch_size", [16, 32])
        dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
        n_epochs = trial.suggest_int("n_epochs", 5, 15)
        
        logging.info(f"Trial {trial.number}: lr={learning_rate:.6f}, batch_size={batch_size}, dropout_rate={dropout_rate:.2f}, epochs={n_epochs}")
        
        class SentimentClassifier(nn.Module):
            def __init__(self, n_classes=3, dropout_rate=dropout_rate):
                super(SentimentClassifier, self).__init__()
                self.bert = AutoModel.from_pretrained("vinai/phobert-base")
                self.drop = nn.Dropout(p=dropout_rate)
                self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
                nn.init.normal_(self.fc.weight, std=0.02)
                nn.init.normal_(self.fc.bias, 0)

            def forward(self, input_ids, attention_mask):
                _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
                output = self.drop(pooled_output)
                return self.fc(output)
        
        skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=86)
        fold_accuracies = []
        
        for fold, (train_index, val_index) in enumerate(skf.split(df, df['sentiment'])):
            print(f'\nFold {fold + 1}/{N_SPLITS}')
            train_df = df.iloc[train_index].reset_index(drop=True)
            val_df = df.iloc[val_index].reset_index(drop=True)
            train_loader, val_loader, _ = prepare_loaders(train_df, val_df, test_df, tokenizer, batch_size)
            
            model = SentimentClassifier().to(device)
            criterion = nn.CrossEntropyLoss(weight=class_weights)
            optimizer = AdamW(model.parameters(), lr=learning_rate)
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * n_epochs)
            
            best_val_acc = 0
            patience = 3
            epochs_no_improve = 0
            for epoch in range(n_epochs):
                train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
                val_loss, val_acc = eval_model(model, val_loader, criterion, device)
                logging.info(f"Trial {trial.number}, Fold {fold+1}, Epoch {epoch+1}/{n_epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
                print(f"Epoch {epoch+1}/{n_epochs} - Val Acc: {val_acc:.4f}")
                
                if val_acc > best_val_acc:
                    best_val_acc = val_acc.cpu().item()
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        logging.info(f"Early stopping triggered at epoch {epoch+1} for fold {fold+1}")
                        break
            
            fold_accuracies.append(best_val_acc)
        
        avg_val_acc = np.mean(fold_accuracies)
        logging.info(f"Trial {trial.number} completed with average validation accuracy: {avg_val_acc:.4f}")
        return avg_val_acc
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=3)
    
    best_trial = study.best_trial
    logging.info(f"Best trial: {best_trial.number}")
    logging.info(f"Best validation accuracy: {best_trial.value:.4f}")
    logging.info(f"Best hyperparameters: {best_trial.params}")
    
    best_lr = best_trial.params['learning_rate']
    best_batch_size = best_trial.params['batch_size']
    best_dropout_rate = best_trial.params['dropout_rate']
    best_n_epochs = best_trial.params['n_epochs']
    
    logging.info(f"Training final model with best hyperparameters: lr={best_lr:.6f}, batch_size={best_batch_size}, dropout_rate={best_dropout_rate:.2f}, epochs={best_n_epochs}")
    
    class SentimentClassifier(nn.Module):
        def __init__(self, n_classes=3, dropout_rate=best_dropout_rate):
            super(SentimentClassifier, self).__init__()
            self.bert = AutoModel.from_pretrained("vinai/phobert-base")
            self.drop = nn.Dropout(p=dropout_rate)
            self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
            nn.init.normal_(self.fc.weight, std=0.02)
            nn.init.normal_(self.fc.bias, 0)

        def forward(self, input_ids, attention_mask):
            _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
            output = self.drop(pooled_output)
            return self.fc(output)
    
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=86)
    best_accuracy = 0
    best_model_path = None
    
    for fold, (train_index, val_index) in enumerate(skf.split(df, df['sentiment'])):
        print(f'\nFinal Training - Fold {fold + 1}/{N_SPLITS}')
        train_df = df.iloc[train_index].reset_index(drop=True)
        val_df = df.iloc[val_index].reset_index(drop=True)
        train_loader, val_loader, _ = prepare_loaders(train_df, val_df, test_df, tokenizer, best_batch_size)
        
        model = SentimentClassifier().to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        optimizer = AdamW(model.parameters(), lr=best_lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * best_n_epochs)
        
        best_val_acc = 0
        patience = 3
        epochs_no_improve = 0
        for epoch in range(best_n_epochs):
            train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
            val_loss, val_acc = eval_model(model, val_loader, criterion, device)
            logging.info(f"Final Training, Fold {fold+1}, Epoch {epoch+1}/{best_n_epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
            print(f"Epoch {epoch+1}/{best_n_epochs} - Val Acc: {val_acc:.4f}")
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc.cpu().item()
                epochs_no_improve = 0
                if best_val_acc > best_accuracy:
                    best_accuracy = best_val_acc
                    best_model_path = os.path.join(log_dir, f'PhoBERT_sentiment_temp.bin')
                    torch.save(model.state_dict(), best_model_path)
                    logging.info(f"Saved best model at fold {fold+1}, epoch {epoch+1} with accuracy {val_acc:.4f}")
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= patience:
                    logging.info(f"Early stopping triggered at epoch {epoch+1} for fold {fold+1}")
                    break
    
    model.load_state_dict(torch.load(best_model_path))
    os.remove(best_model_path)
    final_model_path = os.path.join(log_dir, 'PhoBERT_summary_sentiment_optuna.bin')
    torch.save(model.state_dict(), final_model_path)
    logging.info(f"Final best model saved as PhoBERT_summary_sentiment_optuna.bin with accuracy {best_accuracy:.4f}")
    return model


In [14]:
# Inference Function
def predict_sentiment(text, model, tokenizer, id2label):
    dataset = sentimentDataset(pd.DataFrame({'text': [text], 'sentiment': [0]}), tokenizer)
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            _, pred = torch.max(outputs, dim=1)
    return id2label[pred.item()]

In [15]:
# Zip output files for download
def zip_and_download_output_files():
    output_zip = os.path.join(log_dir, 'output_files.zip')
    output_files = [
        'training_summary_sentiment_log_v1.txt',
        'data_processed_for_summary_sentiment.csv',
        'PhoBERT_summary_sentiment_v1.bin',
        'confusion_matrix.png',
        'precision_recall_curve.png',
        'roc_curve.png'
    ]
    
    # Only include existing files
    existing_files = [f for f in output_files if os.path.exists(os.path.join(log_dir, f))]
    missing_files = set(output_files) - set(existing_files)
    
    if missing_files:
        logging.warning(f"Missing files: {missing_files}")
    
    try:
        with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in existing_files:
                file_path = os.path.join(log_dir, file)
                zipf.write(file_path, file)
                logging.info(f"Added {file} to zip archive")
        
        if os.path.exists(output_zip):
            logging.info(f"Zip file created at {output_zip}")
            
            # Automatic download
            print("\n" + "="*60)
            print("AUTOMATICALLY DOWNLOADING OUTPUT FILES")
            print("="*60)
            
            # For Kaggle
            if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
                print("In Kaggle environment, please download manually:")
                display(FileLink('output_files.zip'))
            # For Google Colab
            else:
                print("Downloading output files automatically...")
                from google.colab import files
                files.download(output_zip)
            
            return output_zip
        else:
            logging.error("Failed to create zip file")
            return None
    except Exception as e:
        logging.error(f"Error creating zip: {e}")
        return None

In [16]:
# Main Execution
if __name__ == "__main__":
    try:
        xlsx_path = '/kaggle/input/1907-data-config-final/data_config_final (1).xlsx'
        if not os.path.exists(xlsx_path):
            raise FileNotFoundError(f"File {xlsx_path} not found!")
        print(f"Found data file at {xlsx_path}")
        print("Loading data...")
        df_processed = load_original_data(xlsx_path)
        print("Cleaning data...")
        df_processed = clean_data(df_processed)
        print("Processing labels...")
        df_processed, label2id, id2label, class_weights = process_labels(df_processed)
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
        print("Tokenizer loaded successfully.")
        print("Tokenizing data...")
        df_processed = tokenize_data(df_processed, tokenizer)

        processed_data_path = os.path.join(log_dir, 'data_processed_for_summary_sentiment.csv')
        df_processed.to_csv(processed_data_path, index=False)
        logging.info(f"Saved processed data with shape: {df_processed.shape}")

        summary_lengths = df_processed['text'].str.len()
        token_lengths = [len(t) for t in df_processed['tokenized']]
        logging.info(f"Summary length stats: Min={summary_lengths.min()}, Max={summary_lengths.max()}, Mean={summary_lengths.mean():.2f}")
        logging.info(f"Token length stats: Min={min(token_lengths)}, Max={max(token_lengths)}, Mean={np.mean(token_lengths):.2f}")
        logging.info(f"Processing config: max_len={MAX_LEN}, n_splits={N_SPLITS}")

        train_val_df, test_df = train_test_split(df_processed, test_size=0.2, stratify=df_processed['sentiment'], random_state=86)
        logging.info(f"Data split: Train+Val={len(train_val_df)}, Test={len(test_df)}")

        print("Starting training with Optuna optimization...")
        model = train_and_evaluate(train_val_df, test_df, tokenizer, class_weights)

        print("Evaluating model...")
        _, _, test_loader = prepare_loaders(train_val_df, train_val_df, test_df, tokenizer, batch_size=16)
        evaluation_results = evaluate_model(model, test_loader, id2label)

        sample_text = "Đầu tư nước ngoài, dù đã tăng trưởng (FDI đạt 8,9 tỷ USD trong 5 tháng đầu năm), nhưng vẫn ghi nhận dòng vốn ngoại rút mạnh do lo ngại rủi ro thương mại, dù được kỳ vọng sẽ phục hồi khi các thị trường nâng hạng."
        predicted_sentiment = predict_sentiment(sample_text, model, tokenizer, id2label)
        print(f"\nSample text: {sample_text}")
        print(f"Predicted sentiment: {predicted_sentiment}")
        logging.info(f"Inference test: Text='{sample_text}', Predicted='{predicted_sentiment}'")

        print("\nPreparing output files for download...")
        zip_and_download_output_files()

    except Exception as e:
        logging.error(f"Error in main execution: {e}", exc_info=True)
        raise

Found data file at /kaggle/input/1907-data-config-final/data_config_final (1).xlsx
Loading data...
Cleaning data...
Processing labels...
Loading tokenizer...


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer loaded successfully.
Tokenizing data...


[I 2025-07-20 03:07:20,512] A new study created in memory with name: no-name-553f7186-9e00-4a47-9a8d-2bac52246e3f


Starting training with Optuna optimization...

Fold 1/3


2025-07-20 03:07:33.869512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752980854.078069      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752980854.138634      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch 1/12 - Val Acc: 0.7664
Epoch 2/12 - Val Acc: 0.8079
Epoch 3/12 - Val Acc: 0.8120
Epoch 4/12 - Val Acc: 0.8123
Epoch 5/12 - Val Acc: 0.8082
Epoch 6/12 - Val Acc: 0.8082
Epoch 7/12 - Val Acc: 0.8051

Fold 2/3
Epoch 1/12 - Val Acc: 0.7711
Epoch 2/12 - Val Acc: 0.7897
Epoch 3/12 - Val Acc: 0.7881
Epoch 4/12 - Val Acc: 0.8007
Epoch 5/12 - Val Acc: 0.7997
Epoch 6/12 - Val Acc: 0.8114
Epoch 7/12 - Val Acc: 0.8041
Epoch 8/12 - Val Acc: 0.7997
Epoch 9/12 - Val Acc: 0.8067

Fold 3/3
Epoch 1/12 - Val Acc: 0.6410
Epoch 2/12 - Val Acc: 0.8045
Epoch 3/12 - Val Acc: 0.8051
Epoch 4/12 - Val Acc: 0.7913
Epoch 5/12 - Val Acc: 0.7960
Epoch 6/12 - Val Acc: 0.8086
Epoch 7/12 - Val Acc: 0.8064
Epoch 8/12 - Val Acc: 0.7957


[I 2025-07-20 05:27:11,126] Trial 0 finished with value: 0.8107513360578434 and parameters: {'learning_rate': 3.98365088321791e-05, 'batch_size': 32, 'dropout_rate': 0.3220327060942979, 'n_epochs': 12}. Best is trial 0 with value: 0.8107513360578434.


Epoch 9/12 - Val Acc: 0.8073

Fold 1/3
Epoch 1/9 - Val Acc: 0.7872
Epoch 2/9 - Val Acc: 0.7935
Epoch 3/9 - Val Acc: 0.7982
Epoch 4/9 - Val Acc: 0.8076
Epoch 5/9 - Val Acc: 0.8060
Epoch 6/9 - Val Acc: 0.8095
Epoch 7/9 - Val Acc: 0.8126
Epoch 8/9 - Val Acc: 0.8164
Epoch 9/9 - Val Acc: 0.8152

Fold 2/3
Epoch 1/9 - Val Acc: 0.7699
Epoch 2/9 - Val Acc: 0.7749
Epoch 3/9 - Val Acc: 0.7966
Epoch 4/9 - Val Acc: 0.7997
Epoch 5/9 - Val Acc: 0.7979
Epoch 6/9 - Val Acc: 0.8035
Epoch 7/9 - Val Acc: 0.8070
Epoch 8/9 - Val Acc: 0.8041
Epoch 9/9 - Val Acc: 0.8035

Fold 3/3
Epoch 1/9 - Val Acc: 0.7724
Epoch 2/9 - Val Acc: 0.8016
Epoch 3/9 - Val Acc: 0.8007
Epoch 4/9 - Val Acc: 0.7931


[I 2025-07-20 07:37:00,116] Trial 1 finished with value: 0.8083411924971182 and parameters: {'learning_rate': 4.300648024218273e-05, 'batch_size': 16, 'dropout_rate': 0.22822357345940741, 'n_epochs': 9}. Best is trial 0 with value: 0.8107513360578434.


Epoch 5/9 - Val Acc: 0.7988

Fold 1/3
Epoch 1/6 - Val Acc: 0.7554
Epoch 2/6 - Val Acc: 0.7997
Epoch 3/6 - Val Acc: 0.8104
Epoch 4/6 - Val Acc: 0.8133
Epoch 5/6 - Val Acc: 0.8126
Epoch 6/6 - Val Acc: 0.8142

Fold 2/3
Epoch 1/6 - Val Acc: 0.7809
Epoch 2/6 - Val Acc: 0.8032
Epoch 3/6 - Val Acc: 0.7831
Epoch 4/6 - Val Acc: 0.7988
Epoch 5/6 - Val Acc: 0.8117
Epoch 6/6 - Val Acc: 0.8101

Fold 3/3
Epoch 1/6 - Val Acc: 0.7853
Epoch 2/6 - Val Acc: 0.7784
Epoch 3/6 - Val Acc: 0.8023
Epoch 4/6 - Val Acc: 0.8041
Epoch 5/6 - Val Acc: 0.7997


[I 2025-07-20 09:18:37,062] Trial 2 finished with value: 0.8122183799643717 and parameters: {'learning_rate': 2.0317725126468357e-05, 'batch_size': 16, 'dropout_rate': 0.40864433439381564, 'n_epochs': 6}. Best is trial 2 with value: 0.8122183799643717.


Epoch 6/6 - Val Acc: 0.8108

Final Training - Fold 1/3
Epoch 1/6 - Val Acc: 0.8048
Epoch 2/6 - Val Acc: 0.8104
Epoch 3/6 - Val Acc: 0.8073
Epoch 4/6 - Val Acc: 0.8098
Epoch 5/6 - Val Acc: 0.8186
Epoch 6/6 - Val Acc: 0.8155

Final Training - Fold 2/3
Epoch 1/6 - Val Acc: 0.7586
Epoch 2/6 - Val Acc: 0.7988
Epoch 3/6 - Val Acc: 0.7928
Epoch 4/6 - Val Acc: 0.8067
Epoch 5/6 - Val Acc: 0.8010
Epoch 6/6 - Val Acc: 0.8023

Final Training - Fold 3/3
Epoch 1/6 - Val Acc: 0.7891
Epoch 2/6 - Val Acc: 0.7985
Epoch 3/6 - Val Acc: 0.7916
Epoch 4/6 - Val Acc: 0.8054
Epoch 5/6 - Val Acc: 0.8089
Epoch 6/6 - Val Acc: 0.8120
Evaluating model...

DETAILED PER-CLASS EVALUATION METRICS

Per-class metrics:
+----------+-------------+----------+------------+-----------+
|          |   precision |   recall |   f1-score |   support |
| Positive |      0.8705 |   0.8787 |     0.8746 |  635.0000 |
+----------+-------------+----------+------------+-----------+
| Negative |      0.7008 |   0.6997 |     0.7002 |  636.




Confusion matrix saved to /kaggle/working/confusion_matrix.png
Precision-Recall curve saved to /kaggle/working/precision_recall_curve.png


2025-07-20 11:00:53,858 - ERROR - Error in main execution: name 'sentimentDataset' is not defined
Traceback (most recent call last):
  File "/tmp/ipykernel_36/1940655137.py", line 41, in <cell line: 0>
    predicted_sentiment = predict_sentiment(sample_text, model, tokenizer, id2label)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_36/1480475271.py", line 3, in predict_sentiment
    dataset = sentimentDataset(pd.DataFrame({'text': [text], 'sentiment': [0]}), tokenizer)
              ^^^^^^^^^^^^^^^^
NameError: name 'sentimentDataset' is not defined


ROC curve saved to /kaggle/working/roc_curve.png


NameError: name 'sentimentDataset' is not defined