# **Vietnamese News Sentiment Analysis with XLM-RoBERTa**
# This notebook fine-tunes XLM-RoBERTa for classifying sentiment of Vietnamese news summaries.

In [1]:
# Install required packages
!pip install -q transformers datasets evaluate accelerate wandb scikit-learn pandas matplotlib seaborn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import shutil
import zipfile
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
from datetime import datetime
import logging

2025-07-02 16:20:53.384050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751473253.610600      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751473253.677740      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Configuration
class Config:
    MODEL_NAME = "xlm-roberta-base"
    SEED = 42
    BATCH_SIZE = 16
    GRADIENT_ACCUMULATION_STEPS = 4
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 10
    MAX_LENGTH = 256
    WEIGHT_DECAY = 0.01
    OUTPUT_DIR = "./xlm-roberta-sentiment-complete"
    LOGGING_STEPS = 50
    SAVE_TOTAL_LIMIT = 2
    SENTIMENT_MAP = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    REVERSE_SENTIMENT_MAP = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    EARLY_STOPPING_PATIENCE = 3
    LR_SCHEDULER_TYPE = "cosine"
    WARMUP_RATIO = 0.1
    USE_CLASS_WEIGHTS = True
    DATA_PATH = "/kaggle/input/data-summary-sentiment/Data_summary_sentiment.xlsx"

config = Config()

In [4]:
# Create output directory
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

# Set up logging
logging.basicConfig(
    filename=os.path.join(config.OUTPUT_DIR, 'training.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Set random seed
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)

# Initialize W&B
wandb.init(project="vietnamese-sentiment-analysis", mode="disabled")

In [5]:
# Custom Trainer with class weights - FIXED VERSION
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Convert labels to long type to fix the RuntimeError
        labels = labels.long()
        
        if self.class_weights is not None:
            weights = torch.tensor(self.class_weights, device=logits.device, dtype=torch.float32)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [6]:
# Load and explore dataset
def load_and_explore_data(file_path):
    logger.info("Loading dataset...")
    df = pd.read_excel(file_path)
    
    # Map sentiment labels to numbers
    df['label'] = df['sentiment'].map(config.SENTIMENT_MAP)
    
    # Save dataset info
    with open(os.path.join(config.OUTPUT_DIR, 'dataset_info.txt'), 'w') as f:
        f.write(f"Total samples: {len(df)}\n")
        f.write("\nClass distribution:\n")
        f.write(df['sentiment'].value_counts().to_string())
    
    # Plot class distribution
    plt.figure(figsize=(8, 5))
    class_dist = df['sentiment'].value_counts()
    sns.barplot(x=class_dist.index, y=class_dist.values)
    plt.title('Class Distribution')
    plt.ylabel('Count')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'class_distribution.png'))
    plt.close()
    
    # Text length analysis
    df['text_length'] = df['summary'].apply(lambda x: len(x.split()))
    
    # Plot text length distribution
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.histplot(df['text_length'], bins=30)
    plt.title('Text Length Distribution')
    
    plt.subplot(1, 2, 2)
    sns.boxplot(x='sentiment', y='text_length', data=df)
    plt.title('Text Length by Sentiment')
    plt.savefig(os.path.join(config.OUTPUT_DIR, 'text_length_distribution.png'))
    plt.close()
    
    return df

# Load your dataset
df = load_and_explore_data(config.DATA_PATH)
df = df.dropna()  # Clean data

# Compute class weights if needed
if config.USE_CLASS_WEIGHTS:
    class_weights = compute_class_weight(
        'balanced', 
        classes=np.unique(df['label']),
        y=df['label']
    )
    config.CLASS_WEIGHTS = class_weights.tolist()
    logger.info(f"Class weights: {config.CLASS_WEIGHTS}")
else:
    config.CLASS_WEIGHTS = None

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

  with pd.option_context('mode.use_inf_as_na', True):


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
# Analyze token lengths
def analyze_token_lengths(texts, tokenizer, max_length):
    lengths = []
    for text in texts:
        tokens = tokenizer(text, truncation=True, max_length=max_length)["input_ids"]
        lengths.append(len(tokens))
    return lengths

token_lengths = analyze_token_lengths(df['summary'], tokenizer, config.MAX_LENGTH)

plt.figure(figsize=(10, 5))
sns.histplot(token_lengths, bins=30)
plt.title('Token Length Distribution')
plt.axvline(x=config.MAX_LENGTH, color='r', linestyle='--', label='Max Length')
plt.legend()
plt.savefig(os.path.join(config.OUTPUT_DIR, 'token_length_distribution.png'))
plt.close()

logger.info(f"Percentage of texts within max length: {sum(np.array(token_lengths) <= config.MAX_LENGTH) / len(token_lengths):.2%}")

  with pd.option_context('mode.use_inf_as_na', True):


In [8]:
# Preprocess function
def preprocess_function(examples):
    return tokenizer(
        examples["summary"],
        truncation=True,
        max_length=config.MAX_LENGTH,
        padding="max_length"
    )

# Split data (stratified by sentiment)
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=config.SEED,
    stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=config.SEED,
    stratify=temp_df['label']
)

# Convert to Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Tokenize datasets
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["summary", "sentiment"]
)

# Save dataset splits info
with open(os.path.join(config.OUTPUT_DIR, 'data_splits.txt'), 'w') as f:
    f.write(f"Train samples: {len(train_df)}\n")
    f.write(f"Validation samples: {len(val_df)}\n")
    f.write(f"Test samples: {len(test_df)}\n")

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=3
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4142 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    
    # Get classification report
    report = classification_report(
        labels,
        predictions,
        target_names=['Negative', 'Neutral', 'Positive'],
        output_dict=True
    )
    
    # Create metrics dictionary
    metrics = {
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'negative_precision': report['Negative']['precision'],
        'negative_recall': report['Negative']['recall'],
        'negative_f1': report['Negative']['f1-score'],
        'neutral_precision': report['Neutral']['precision'],
        'neutral_recall': report['Neutral']['recall'],
        'neutral_f1': report['Neutral']['f1-score'],
        'positive_precision': report['Positive']['precision'],
        'positive_recall': report['Positive']['recall'],
        'positive_f1': report['Positive']['f1-score']
    }
    
    # Log metrics
    logger.info(f"Evaluation metrics: {metrics}")
    
    return metrics

In [10]:
# Training arguments - FIXED VERSION
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    run_name=f"xlm-roberta-sentiment-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=config.LOGGING_STEPS,
    save_steps=100,
    save_total_limit=config.SAVE_TOTAL_LIMIT,
    learning_rate=config.LEARNING_RATE,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=config.NUM_EPOCHS,
    weight_decay=config.WEIGHT_DECAY,
    lr_scheduler_type=config.LR_SCHEDULER_TYPE,
    warmup_ratio=config.WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    fp16=True,
    report_to="wandb",
    logging_dir="./logs",
    seed=config.SEED
)

# Initialize Trainer - FIXED VERSION
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=config.CLASS_WEIGHTS,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)]
)

  super().__init__(**kwargs)


In [11]:
# Start training
logger.info("Starting training...")
print("Starting training...")
train_result = trainer.train()

# Save training metrics
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
logger.info(f"Training metrics: {metrics}")

# Save the final model
trainer.save_model(config.OUTPUT_DIR)
tokenizer.save_pretrained(config.OUTPUT_DIR)
logger.info(f"Model saved to {config.OUTPUT_DIR}")

# Save training arguments
trainer.save_state()

# Evaluate on test set
logger.info("Evaluating on test set...")
print("Evaluating on test set...")
test_results = trainer.evaluate(
    tokenized_datasets["test"],
    metric_key_prefix="test"
)

Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted,Negative Precision,Negative Recall,Negative F1,Neutral Precision,Neutral Recall,Neutral F1,Positive Precision,Positive Recall,Positive F1
100,0.6248,0.59461,0.76834,0.76834,0.736337,0.764822,0.764286,0.862903,0.810606,0.576271,0.515152,0.544,0.857692,0.851145,0.854406
200,0.3765,0.61424,0.76834,0.76834,0.747934,0.770783,0.832,0.83871,0.835341,0.546099,0.583333,0.564103,0.861111,0.828244,0.844358
300,0.2675,0.659165,0.787645,0.787645,0.760895,0.786553,0.830645,0.830645,0.830645,0.59375,0.575758,0.584615,0.860902,0.874046,0.867424


Evaluating on test set...


early stopping required metric_for_best_model, but did not find eval_f1_macro so early stopping is disabled


In [12]:
# Save evaluation results
with open(os.path.join(config.OUTPUT_DIR, 'test_results.txt'), 'w') as f:
    for key, value in test_results.items():
        f.write(f"{key}: {value}\n")

logger.info("\n=== Test Results ===")
print("\n=== Test Results ===")
for key, value in test_results.items():
    if key.startswith("test_"):
        logger.info(f"{key[5:]}: {value}")
        print(f"{key[5:]}: {value}")


=== Test Results ===
loss: 0.7746849656105042
accuracy: 0.7683397683397684
f1_micro: 0.7683397683397682
f1_macro: 0.735117688322881
f1_weighted: 0.7647372131047858
negative_precision: 0.7375886524822695
negative_recall: 0.832
negative_f1: 0.781954887218045
neutral_precision: 0.6086956521739131
neutral_recall: 0.5303030303030303
neutral_f1: 0.5668016194331984
positive_precision: 0.8549618320610687
positive_recall: 0.8582375478927203
positive_f1: 0.8565965583173996
runtime: 5.3568
samples_per_second: 96.699
steps_per_second: 3.174


In [13]:
# Sample predictions function
def predict_sentiment(text):
    inputs = tokenizer(
        text,
        max_length=config.MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(trainer.model.device)
    
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_class = torch.argmax(probs).item()
    
    return {
        "sentiment": config.REVERSE_SENTIMENT_MAP[pred_class],
        "confidence": probs[0][pred_class].item(),
        "probabilities": {
            "Negative": probs[0][0].item(),
            "Neutral": probs[0][1].item(),
            "Positive": probs[0][2].item()
        }
    }

# Test on some samples and save predictions
sample_texts = df.sample(5, random_state=config.SEED)["summary"].tolist()
with open(os.path.join(config.OUTPUT_DIR, 'sample_predictions.txt'), 'w') as f:
    for i, text in enumerate(sample_texts):
        result = predict_sentiment(text)
        actual = df[df['summary'] == text]['sentiment'].values[0]
        
        f.write(f"\n=== Sample {i+1} ===\n")
        f.write(f"\nText: {text}\n")
        f.write(f"\nPredicted Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.2f})\n")
        f.write(f"Probabilities: {result['probabilities']}\n")
        f.write(f"Actual Sentiment: {actual}\n")
        
        logger.info(f"Sample {i+1} - Predicted: {result['sentiment']}, Actual: {actual}")
        print(f"\n=== Sample {i+1} ===")
        print(f"\nText: {text}")
        print(f"\nPredicted Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.2f})")
        print(f"Probabilities: {result['probabilities']}")
        print(f"Actual Sentiment: {actual}")


=== Sample 1 ===

Text: Hội thảo tại Phú Quốc bàn về phát triển BĐS du lịch nghỉ dưỡng gắn với APEC 2027. Giai đoạn 2019-2024, BĐS Phú Quốc trầm lắng do dư cung. APEC 2027 tạo động lực phục hồi, giao dịch tăng ở khu vực ven biển, sông, đại lộ. Phú Quốc cần điều chỉnh quy hoạch, nâng cấp hạ tầng đón APEC 2027. Địa phương tập trung phát triển hạ tầng đô thị, du lịch, triển khai nhiều dự án lớn.


Predicted Sentiment: Positive (Confidence: 0.91)
Probabilities: {'Negative': 0.002221547532826662, 'Neutral': 0.09234996140003204, 'Positive': 0.9054285287857056}
Actual Sentiment: Positive

=== Sample 2 ===

Text: Phó thủ tướng Bùi Thanh Sơn chủ trì phiên họp về cao điểm chống buôn lậu, gian lận thương mại, hàng giả. Các bộ ngành được yêu cầu kiểm tra, xử lý nghiêm vi phạm khi có phản ánh từ người dân, tập trung vào dược phẩm, thực phẩm chức năng. Cục An toàn thực phẩm yêu cầu kiểm tra quảng cáo sản phẩm Nestlé Milo, sau phản ánh sử dụng thông tin Viện Dinh dưỡng không đúng. Sở Y tế TP.HCM và 

In [14]:
# Create zip file of all outputs
def zip_output_folder(output_dir):
    zip_path = os.path.join(output_dir, 'output.zip')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(output_dir):
            for file in files:
                if file != 'output.zip':  # Skip the zip file itself
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)
    return zip_path

output_zip = zip_output_folder(config.OUTPUT_DIR)
logger.info(f"Created zip file at: {output_zip}")

# Download the zip file automatically
from IPython.display import FileLink

print("Training complete! Download the results:")
FileLink(output_zip)

# Close W&B
wandb.finish()
logger.info("Training process completed successfully")

Training complete! Download the results:


In [15]:
!zip -r /kaggle/working/xlm-roberta-sentiment-complete.zip /kaggle/working/xlm-roberta-sentiment-complete


  adding: kaggle/working/xlm-roberta-sentiment-complete/ (stored 0%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/training_args.bin (deflated 51%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/sentencepiece.bpe.model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 49%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/token_length_distribution.png (deflated 20%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/tokenizer.json (deflated 76%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/special_tokens_map.json (deflated 52%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/output.zip (stored 0%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/checkpoint-320/ (stored 0%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/checkpoint-320/training_args.bin (deflated 51%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/checkpoint-320/sentencepiece.bpe.model (deflated 49%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/checkpoint-320/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/xlm-roberta-sentiment-complete/checkpoint-320/tokenizer.json (deflated 76%)
  adding: ka