In [1]:
import pandas as pd
import gc
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, f1_score
import random 
import warnings
import transformers
import torch
import torch.nn as nn


warnings.filterwarnings('ignore', category=FutureWarning)
transformers.logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

2026-02-22 22:13:49.201353: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-22 22:13:49.201396: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-22 22:13:49.202855: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-22 22:13:49.214197: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


### Load tokenized dataset

In [2]:
# Dataset stays on disk, rows load only when needed
tokenized_dataset = load_from_disk('tokenized_for_roberta')
print(f"Loaded {len(tokenized_dataset):,} rows")

aspect_cols = [
             'product_quality_positive', 'product_quality_negative',
             'service_positive', 'service_negative',
             'wait_time_positive', 'wait_time_negative',
             'price_value_positive', 'price_value_negative',
             'cleanliness_positive', 'cleanliness_negative',
             'atmosphere_positive', 'atmosphere_negative',
             'general_positive', 'general_negative'
             ]

Loading dataset from disk:   0%|          | 0/35 [00:00<?, ?it/s]

Loaded 22,614,379 rows


### Evaluation Metrics

In [3]:
# Define metrics
def compute_metrics(pred):
    """Compute evaluation metrics for multi-label classification."""
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# RoBERTa-base (100k Sample + 2x Neg)

### Train

In [4]:
# Configuration for each run
SAMPLE_SIZE   = 100_000
TEST_SIZE     = 0.1
EPOCHS        = 3
TRAIN_BATCH   = 64
EVAL_BATCH    = 128
LEARNING_RATE = 2e-5
SEED          = 2

# Apply class weights: 2x penalty on negative labels (odd indices)
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = nn.BCEWithLogitsLoss(pos_weight=weights)(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

NEG_WEIGHT = 2.0
weights = torch.ones(len(aspect_cols))
weights[1::2] = NEG_WEIGHT  # every other label starting at index 1 is negative
weights = weights.to(device)

# Sample & Split
random.seed(SEED)
sample       = tokenized_dataset.select(random.sample(range(len(tokenized_dataset)), SAMPLE_SIZE))
split        = sample.train_test_split(test_size=TEST_SIZE, seed=SEED)
train_subset = split['train']
test_subset  = split['test']

print(f"Train subset: {len(train_subset):,}")
print(f"Test subset:  {len(test_subset):,}")

# Load RoBERTa-base Model 
print("Loading RoBERTa model...")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(aspect_cols),
    problem_type="multi_label_classification"
)
print(f"Model loaded with {len(aspect_cols)} output labels\n")

# Training Arguments
training_args = TrainingArguments(
    output_dir=f'./results_roberta_{SAMPLE_SIZE}',
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=torch.cuda.is_available(),
    dataloader_num_workers=16,
    optim="adamw_torch_fused",
)

# Train Model
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=test_subset,
    compute_metrics=compute_metrics,
)
print("Starting training...\n")
trainer.train()

Train subset: 90,000
Test subset:  10,000
Loading RoBERTa model...
Model loaded with 14 output labels

Starting training...

{'loss': 0.1499, 'grad_norm': 0.30931419134140015, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}
{'eval_loss': 0.08908729255199432, 'eval_accuracy': 0.8933, 'eval_f1': 0.916970687272749, 'eval_precision': 0.9256621110101607, 'eval_recall': 0.9133381478806732, 'eval_runtime': 54.1186, 'eval_samples_per_second': 184.779, 'eval_steps_per_second': 1.46, 'epoch': 1.0}
{'loss': 0.0749, 'grad_norm': 1.9435322284698486, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}
{'eval_loss': 0.07328711450099945, 'eval_accuracy': 0.9101, 'eval_f1': 0.927333552744421, 'eval_precision': 0.930464043541008, 'eval_recall': 0.9244486417421255, 'eval_runtime': 54.2656, 'eval_samples_per_second': 184.279, 'eval_steps_per_second': 1.456, 'epoch': 2.0}
{'loss': 0.0605, 'grad_norm': 2.247819423675537, 'learning_rate': 0.0, 'epoch': 3.0}
{'eval_loss': 0.07081868499517441, 'eval_a

TrainOutput(global_step=4221, training_loss=0.0951012631949303, metrics={'train_runtime': 5049.2064, 'train_samples_per_second': 53.474, 'train_steps_per_second': 0.836, 'train_loss': 0.0951012631949303, 'epoch': 3.0})

### Evaluate

In [5]:
predictions = trainer.predict(test_subset)
y_pred = (predictions.predictions > 0.5).astype(int)
y_true = predictions.label_ids

print(f"\nRoBERTa ({SAMPLE_SIZE:,} Sample + 2x Penalty)")
print(f"Training Accuracy: {trainer.evaluate(train_subset)['eval_accuracy']:.4f}")
print(f"Test Accuracy:     {trainer.evaluate(test_subset)['eval_accuracy']:.4f}")

print(f"\nF1 Score (macro):    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
print(f"F1 Score (weighted): {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=aspect_cols, zero_division=0))


RoBERTa (100,000 Sample + 2x Penalty)
{'eval_loss': 0.05093773454427719, 'eval_accuracy': 0.9372111111111111, 'eval_f1': 0.9541620808327217, 'eval_precision': 0.9601165676547125, 'eval_recall': 0.9485402990967869, 'eval_runtime': 482.0834, 'eval_samples_per_second': 186.69, 'eval_steps_per_second': 1.46, 'epoch': 3.0}
Training Accuracy: 0.9372
{'eval_loss': 0.07081868499517441, 'eval_accuracy': 0.9153, 'eval_f1': 0.9288570706841401, 'eval_precision': 0.9345999069748724, 'eval_recall': 0.9237820121104383, 'eval_runtime': 54.254, 'eval_samples_per_second': 184.318, 'eval_steps_per_second': 1.456, 'epoch': 3.0}
Test Accuracy:     0.9153

F1 Score (macro):    0.8794
F1 Score (weighted): 0.9289

Classification Report:
                          precision    recall  f1-score   support

product_quality_positive       0.96      0.96      0.96      3117
product_quality_negative       0.85      0.81      0.83       771
        service_positive       0.96      0.97      0.96      2416
        ser

In [9]:
# Clear memory
del model
del trainer
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None