In [1]:
import pandas as pd
import gc
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, f1_score
import random 
import warnings
import transformers
import torch

warnings.filterwarnings('ignore', category=FutureWarning)
transformers.logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


### Load data

In [3]:
# HuggingFace dataset
dataset = Dataset.from_csv('../data/cleaned_for_LLM.csv')
print(f"Loaded {len(dataset):,} rows")

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

Loaded 22,624,379 rows


## RoBERTa Tokenization

In [4]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

aspect_cols = [
    'food_quality_positive', 'food_quality_negative', 'food_quality_neutral',
    'service_positive', 'service_negative', 'service_neutral',
    'wait_time_positive', 'wait_time_negative', 'wait_time_neutral',
    'price_value_positive', 'price_value_negative', 'price_value_neutral',
    'cleanliness_positive', 'cleanliness_negative', 'cleanliness_neutral',
    'atmosphere_positive', 'atmosphere_negative', 'atmosphere_neutral'
]

def tokenize_function(examples):
    """Tokenize text and prepare multi-label targets."""
    tokenized = tokenizer(
        examples['text'],
        padding='longest',  # only pads longest batch, not always full 128
        truncation=True,
        max_length=128     
    )
    labels = []
    for i in range(len(examples['text'])):
        label_row = [float(examples[col][i]) for col in aspect_cols]
        labels.append(label_row)
    tokenized['labels'] = labels
    return tokenized

# Split
split = dataset.train_test_split(test_size=0.1, seed=2)

# Tokenize
print("Tokenizing datasets...")
train_dataset = split['train'].map(tokenize_function, batched=True, remove_columns=split['train'].column_names)
test_dataset = split['test'].map(tokenize_function, batched=True, remove_columns=split['test'].column_names)

# Save
print("Saving datasets...")
train_dataset.save_to_disk('../data/bert_train_tokenized')
test_dataset.save_to_disk('../data/bert_test_tokenized')

print(f"Train dataset: {len(train_dataset):,} samples")
print(f"Test dataset:  {len(test_dataset):,} samples")

Tokenizing datasets...


Map:   0%|          | 0/2262438 [00:00<?, ? examples/s]

Saving datasets...


Saving the dataset (0/33 shards):   0%|          | 0/20361941 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/2262438 [00:00<?, ? examples/s]

Train dataset: 20,361,941 samples
Test dataset:  2,262,438 samples


***Note:*** 0% is a display bug in the notebook/library version.

### Sample data

In [7]:
train_indices = random.sample(range(len(train_dataset)), 22500)
train_subset = train_dataset.select(train_indices)

test_indices = random.sample(range(len(test_dataset)), 2500)
test_subset = test_dataset.select(test_indices)

print(f"Train subset: {len(train_subset):,}")
print(f"Test subset:  {len(test_subset):,}")

Train subset: 22,500
Test subset:  2,500


### Evaluation Metrics

In [9]:
# Define metrics
def compute_metrics(pred):
    """Compute evaluation metrics for multi-label classification."""
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## RoBERTa-base (25k Sample)

### Train

In [10]:
print("Loading RoBERTa model...")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)
print(f"Model loaded with {len(label_cols)} output labels\n")

# Training configuration
training_args = TrainingArguments(
    output_dir='./results_distilbert',           # Directory to save model checkpoints
    eval_strategy="epoch",                       # Default: "no" | Evaluate at end of each epoch
    save_strategy="epoch",                       # Default: "steps" | Save checkpoint at end of each epoch
    load_best_model_at_end=True,                 # Default: False | Load best checkpoint after training
    metric_for_best_model='f1',                  # Default: "loss" | Use F1 score to determine best model
    logging_steps=100,                           # Default: 500 | Log metrics every 100 steps
    per_device_train_batch_size=32,              # Default: 8 | Batch size for training (reduce for limited hardware)
    per_device_eval_batch_size=64,               # Default: 8 | Batch size for evaluation
    dataloader_num_workers=4,                    # Default: 0 | Parallel data loading (reduce for limited hardware)
    num_train_epochs=3,                          # Default: 3 | Number of training epochs
    dataloader_pin_memory=False,                 # Suppress MPS  warning
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=test_subset,
    compute_metrics=compute_metrics,
)

print("Starting training...\n")
trainer.train()



Loading RoBERTa model...


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Model loaded with 18 output labels

Starting training...



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.060443,0.057929,0.8352,0.782257,0.814565,0.783551
2,0.046927,0.045346,0.8572,0.836677,0.887658,0.82106
3,0.037241,0.045397,0.8672,0.844941,0.877104,0.832072


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=2112, training_loss=0.06492481406100771, metrics={'train_runtime': 6456.9252, 'train_samples_per_second': 10.454, 'train_steps_per_second': 0.327, 'total_flos': 1.776254759424e+16, 'train_loss': 0.06492481406100771, 'epoch': 3.0})

### Evaluate

In [11]:
# Get predictions on test set
predictions = trainer.predict(test_dataset)
y_pred = (predictions.predictions > 0.5).astype(int)
y_true = predictions.label_ids

print(f"RoBERTa ({SAMPLE_SIZE:,} Sample):")
print()
print(f"Training Accuracy: {trainer.evaluate(train_dataset)['eval_accuracy']:.4f}")
print(f"Test Accuracy:     {trainer.evaluate(test_dataset)['eval_accuracy']:.4f}")
print()
print(f"F1 Score (macro):    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
print(f"F1 Score (weighted): {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
print()

print("Classification Report:")
print(classification_report(y_true, y_pred, zero_division=0))
print()
print("\nLabel Index Key:")
for i, label in enumerate(label_cols):
    print(f"  {i}: {label}")

RoBERTa (25,000 Sample):



Training Accuracy: 0.9095
Test Accuracy:     0.8672

F1 Score (macro):    0.5889
F1 Score (weighted): 0.8449

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       723
           1       0.72      0.64      0.68        80
           2       0.46      0.14      0.21        88
           3       0.96      0.95      0.96       607
           4       0.84      0.84      0.84       110
           5       0.53      0.16      0.25        50
           6       0.92      0.90      0.91       327
           7       0.83      0.75      0.79        76
           8       0.50      0.10      0.17        50
           9       0.92      0.95      0.93       260
          10       0.86      0.62      0.72        61
          11       0.20      0.02      0.04        44
          12       0.96      0.94      0.95       132
          13       0.89      0.70      0.78        23
          14       0.00      0.00      0.00        16
  