In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    balanced_accuracy_score, roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results_dir = 'results/cyberbert'
os.makedirs(results_dir, exist_ok=True)
device

device(type='cuda')

In [14]:
fold_num = 2
train_data = pd.read_csv(f'data_splits/splits/fold_{fold_num}_train.csv')
test_data = pd.read_csv(f'data_splits/splits/fold_{fold_num}_test.csv')

train_data = train_data.replace({'f': 0, 't': 1})
test_data = test_data.replace({'f': 0, 't': 1})

data = pd.concat([train_data, test_data], ignore_index=True)
train_idx = pd.Series([True] * len(train_data) + [False] * len(test_data))
test_idx = pd.Series([False] * len(train_data) + [True] * len(test_data))

print(f"Using fold {fold_num} - Train: {len(train_data)}, Test: {len(test_data)}, Total: {len(data)}")
data.shape

Using fold 2 - Train: 85292, Test: 21326, Total: 106618


(106618, 63)

In [15]:
def get_target(data):
    return data['c_cyberbullying_majority'].astype(int)

def create_severity_labels(data):
    severity_cols = ['c_severity_mild_count', 'c_severity_moderate_count', 'c_severity_severe_count']
    severity_counts = data[severity_cols].fillna(0)
    cb_mask = data['c_cyberbullying_majority'] == 1
    severity_labels = np.zeros(len(data))
    
    for idx in data.index:
        if not cb_mask.iloc[idx]:
            severity_labels[idx] = 0
        else:
            row = severity_counts.iloc[idx]
            if row.sum() == 0:
                severity_labels[idx] = 0
            else:
                max_idx = row.argmax()
                severity_labels[idx] = max_idx + 1
    return severity_labels.astype(int)

def create_topic_labels(data):
    topic_cols = ['c_topic_disability_majority', 'c_topic_gender_majority', 'c_topic_intellectual_majority',
                  'c_topic_other_majority', 'c_topic_physical_majority', 'c_topic_political_majority',
                  'c_topic_race_majority', 'c_topic_religious_majority', 'c_topic_sexual_majority',
                  'c_topic_social_status_majority']
    topic_data = data[topic_cols].fillna(0).astype(int)
    cb_mask = data['c_cyberbullying_majority'] == 1
    topic_labels = topic_data.copy()
    topic_labels[~cb_mask] = 0
    return topic_labels, topic_cols

def create_role_labels(data):
    cb_roles = ['c_role_bully_count', 'c_role_cb__bully_assistant_count', 
                'c_role_cb_aggressive_victim_role_count', 'c_role_cb_aggressive_defender_count']
    noncb_roles = ['c_role_noncb_passive_bystander_count', 'c_role_noncb_non_aggressive_victim_count',
                   'c_role_noncb_non_aggressive_defender_count']
    role_labels = []
    cb_mask = data['c_cyberbullying_majority'] == 1
    
    for idx in data.index:
        if cb_mask.iloc[idx]:
            role_counts = data[cb_roles].iloc[idx]
            if role_counts.sum() > 0:
                max_role = role_counts.argmax()
                role_labels.append(max_role)
            else:
                role_labels.append(0)
        else:
            role_counts = data[noncb_roles].iloc[idx]
            if role_counts.sum() > 0:
                max_role = role_counts.argmax()
                role_labels.append(max_role + 4)
            else:
                role_labels.append(4)
    return np.array(role_labels)

def save_metrics(y_true, y_pred, y_proba, class_names, task_name, model_name='cyberbert'):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    metrics = {}
    
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    
    metrics['precision_macro'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['recall_macro'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    metrics['precision_weighted'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['recall_weighted'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    metrics['precision_per_class'] = precision_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['recall_per_class'] = recall_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['f1_per_class'] = f1_score(y_true, y_pred, average=None, zero_division=0).tolist()
    
    if len(np.unique(y_true)) == 2:
        if y_proba is not None and y_proba.shape[1] >= 2:
            metrics['auroc'] = roc_auc_score(y_true, y_proba[:, 1])
        else:
            metrics['auroc'] = None
    else:
        if y_proba is not None:
            try:
                metrics['auroc_macro'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
                metrics['auroc_weighted'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
            except:
                metrics['auroc_macro'] = None
                metrics['auroc_weighted'] = None
        else:
            metrics['auroc_macro'] = None
            metrics['auroc_weighted'] = None
    
    cm = confusion_matrix(y_true, y_pred)
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f'{model_name.upper()} - {task_name} - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    with open(f'{results_dir}/{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    
    np.save(f'{results_dir}/{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.npy', cm)
    
    print(f"Saved metrics for {task_name} to {results_dir}")
    
    return metrics

In [16]:
target = get_target(data)
severity_labels = create_severity_labels(data)
topic_labels, topic_cols = create_topic_labels(data)
role_labels = create_role_labels(data)

text_data = data['c_comment_content'].fillna('').astype(str)

f"Data: {data.shape}, Train: {train_idx.sum()}, Test: {test_idx.sum()}"

'Data: (106618, 63), Train: 85292, Test: 21326'

In [17]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if hasattr(self.texts, 'iloc') else str(self.texts[idx])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [18]:
def freeze_base_model(model):
    for name, param in model.named_parameters():
        if 'classifier' not in name and 'pooler' not in name:
            param.requires_grad = False
    return model

def train_transformer_model(model_name, tokenizer_class, model_class, train_texts, train_labels, test_texts, test_labels, num_labels, task_name):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(
        model_name, 
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        problem_type="single_label_classification",
        classifier_dropout=0.1
    ).to(device)
    
    model = freeze_base_model(model)
    
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextDataset(test_texts, test_labels, tokenizer)
    
    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=None
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    
    trainer.train()
    
    predictions = trainer.predict(test_dataset)
    pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
    pred_labels = np.argmax(predictions.predictions, axis=1)
    
    del model, trainer
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
    return pred_labels, pred_probs

In [19]:
print("BINARY CYBERBULLYING CLASSIFICATION:")
print("=" * 50)

train_texts = text_data[train_idx]
test_texts = text_data[test_idx]
train_labels = target[train_idx].values
test_labels = target[test_idx].values

pred, pred_probs = train_transformer_model(
    'unitary/toxic-bert',
    BertTokenizer,
    BertForSequenceClassification,
    train_texts,
    train_labels,
    test_texts,
    test_labels,
    num_labels=2,
    task_name='binary_classification'
)

metrics = save_metrics(
    test_labels, pred, pred_probs, 
    ['Non-CB', 'CB'], 
    'binary_classification'
)

print(f"CyberBERT: Acc={metrics['accuracy']:.3f}, Balanced Acc={metrics['balanced_accuracy']:.3f}, F1={metrics['f1_weighted']:.3f}, AUROC={metrics['auroc']:.3f}")
print(classification_report(test_labels, pred, target_names=['Non-CB', 'CB']))

BINARY CYBERBULLYING CLASSIFICATION:


Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/home/jsunayana/miniconda3/envs/privatefl2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/jsunayana/miniconda3/envs/privatefl2/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jsunayana/miniconda3/envs/privatefl2/lib/python3.8/site-packages/wandb/sdk/internal/system/system_monitor.py", line 118, in _start
    asset.start()
  File "/home/jsunayana/miniconda3/envs/privatefl2/lib/python3.8/site-packages/wandb/sdk/internal/system/assets/cpu.py", line 166, in start
    self.metrics_monitor.start()
  File "/home/jsunayana/miniconda3/envs/privatefl2/lib/python3.8/site-packages/wandb/sdk/internal/system/assets/interfaces.py", line 168, in start
    logger.info(f"Started {self._process.name}")
AttributeError: 'NoneType' object has no attribute 'name'
Some weights of BertForSequenceClassification were not initialized from 

  0%|          | 0/106620 [00:00<?, ?it/s]

{'loss': 0.619, 'grad_norm': 10.551852226257324, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 0.5359, 'grad_norm': 10.227927207946777, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 0.4233, 'grad_norm': 3.2449898719787598, 'learning_rate': 3e-05, 'epoch': 0.01}
{'loss': 0.2803, 'grad_norm': 3.181812286376953, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 0.3139, 'grad_norm': 1.503793478012085, 'learning_rate': 5e-05, 'epoch': 0.01}
{'loss': 0.2831, 'grad_norm': 1.4028441905975342, 'learning_rate': 6e-05, 'epoch': 0.01}
{'loss': 0.2669, 'grad_norm': 4.312163352966309, 'learning_rate': 7.000000000000001e-05, 'epoch': 0.01}
{'loss': 0.2855, 'grad_norm': 0.9242307543754578, 'learning_rate': 8e-05, 'epoch': 0.02}
{'loss': 0.2192, 'grad_norm': 2.0670363903045654, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.02}
{'loss': 0.1842, 'grad_norm': 0.8820130228996277, 'learning_rate': 0.0001, 'epoch': 0.02}
{'loss': 0.211, 'grad_norm': 1.368399977684021, 'learning_rate': 0.00011, 'epoch': 

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.2003181278705597, 'eval_runtime': 438.3014, 'eval_samples_per_second': 48.656, 'eval_steps_per_second': 3.041, 'epoch': 1.0}
{'loss': 0.2707, 'grad_norm': 1.7272454500198364, 'learning_rate': 0.00047719562759140596, 'epoch': 1.0}
{'loss': 0.2095, 'grad_norm': 3.112988233566284, 'learning_rate': 0.0004771485111194874, 'epoch': 1.0}
{'loss': 0.2072, 'grad_norm': 1.836390495300293, 'learning_rate': 0.0004771013946475688, 'epoch': 1.01}
{'loss': 0.2158, 'grad_norm': 1.1827125549316406, 'learning_rate': 0.00047705427817565024, 'epoch': 1.01}
{'loss': 0.2305, 'grad_norm': 1.6507956981658936, 'learning_rate': 0.00047700716170373167, 'epoch': 1.01}
{'loss': 0.2044, 'grad_norm': 1.812248945236206, 'learning_rate': 0.0004769600452318131, 'epoch': 1.01}
{'loss': 0.1803, 'grad_norm': 0.46096205711364746, 'learning_rate': 0.0004769129287598945, 'epoch': 1.01}
{'loss': 0.1174, 'grad_norm': 1.4392093420028687, 'learning_rate': 0.00047686581228797585, 'epoch': 1.01}
{'loss': 0.2486, 'g

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.210862398147583, 'eval_runtime': 437.0928, 'eval_samples_per_second': 48.791, 'eval_steps_per_second': 3.05, 'epoch': 2.0}
{'loss': 0.21, 'grad_norm': 1.4171987771987915, 'learning_rate': 0.0004520825480588014, 'epoch': 2.0}
{'loss': 0.2371, 'grad_norm': 1.618318796157837, 'learning_rate': 0.00045203543158688276, 'epoch': 2.0}
{'loss': 0.1864, 'grad_norm': 1.3031116724014282, 'learning_rate': 0.0004519883151149642, 'epoch': 2.01}
{'loss': 0.2291, 'grad_norm': 2.426844358444214, 'learning_rate': 0.0004519411986430456, 'epoch': 2.01}
{'loss': 0.1651, 'grad_norm': 2.340451240539551, 'learning_rate': 0.00045189408217112705, 'epoch': 2.01}
{'loss': 0.1773, 'grad_norm': 2.0148258209228516, 'learning_rate': 0.0004518469656992085, 'epoch': 2.01}
{'loss': 0.1919, 'grad_norm': 1.1446012258529663, 'learning_rate': 0.0004517998492272899, 'epoch': 2.01}
{'loss': 0.1538, 'grad_norm': 2.0371687412261963, 'learning_rate': 0.0004517527327553713, 'epoch': 2.01}
{'loss': 0.2232, 'grad_nor

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.19939085841178894, 'eval_runtime': 442.4272, 'eval_samples_per_second': 48.202, 'eval_steps_per_second': 3.013, 'epoch': 3.0}
{'loss': 0.1896, 'grad_norm': 1.9554569721221924, 'learning_rate': 0.00042696946852619676, 'epoch': 3.0}
{'loss': 0.1918, 'grad_norm': 0.764250636100769, 'learning_rate': 0.0004269223520542782, 'epoch': 3.0}
{'loss': 0.2133, 'grad_norm': 0.4658936858177185, 'learning_rate': 0.00042687523558235956, 'epoch': 3.01}
{'loss': 0.2024, 'grad_norm': 7.315434455871582, 'learning_rate': 0.000426828119110441, 'epoch': 3.01}
{'loss': 0.2243, 'grad_norm': 2.5405354499816895, 'learning_rate': 0.0004267810026385224, 'epoch': 3.01}
{'loss': 0.1823, 'grad_norm': 0.8124902248382568, 'learning_rate': 0.00042673388616660385, 'epoch': 3.01}
{'loss': 0.1786, 'grad_norm': 5.388472557067871, 'learning_rate': 0.0004266867696946853, 'epoch': 3.01}
{'loss': 0.2286, 'grad_norm': 1.9256083965301514, 'learning_rate': 0.0004266396532227667, 'epoch': 3.01}
{'loss': 0.1364, 'gra

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.24657990038394928, 'eval_runtime': 432.5695, 'eval_samples_per_second': 49.301, 'eval_steps_per_second': 3.082, 'epoch': 4.0}
{'loss': 0.1842, 'grad_norm': 4.963603496551514, 'learning_rate': 0.0004018563889935922, 'epoch': 4.0}
{'loss': 0.1432, 'grad_norm': 0.9328009486198425, 'learning_rate': 0.0004018092725216736, 'epoch': 4.0}
{'loss': 0.26, 'grad_norm': 1.951184630393982, 'learning_rate': 0.00040176215604975505, 'epoch': 4.0}
{'loss': 0.2088, 'grad_norm': 0.3361995816230774, 'learning_rate': 0.00040171503957783637, 'epoch': 4.01}
{'loss': 0.2109, 'grad_norm': 4.250874996185303, 'learning_rate': 0.0004016679231059178, 'epoch': 4.01}
{'loss': 0.1803, 'grad_norm': 1.3499435186386108, 'learning_rate': 0.0004016208066339992, 'epoch': 4.01}
{'loss': 0.1323, 'grad_norm': 1.5758845806121826, 'learning_rate': 0.00040157369016208066, 'epoch': 4.01}
{'loss': 0.2435, 'grad_norm': 3.9707839488983154, 'learning_rate': 0.0004015265736901621, 'epoch': 4.01}
{'loss': 0.1159, 'grad_

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.20326626300811768, 'eval_runtime': 435.9567, 'eval_samples_per_second': 48.918, 'eval_steps_per_second': 3.058, 'epoch': 5.0}
{'loss': 0.1674, 'grad_norm': 0.4096570909023285, 'learning_rate': 0.00037674330946098756, 'epoch': 5.0}
{'loss': 0.2769, 'grad_norm': 2.207322120666504, 'learning_rate': 0.000376696192989069, 'epoch': 5.0}
{'loss': 0.2539, 'grad_norm': 2.6425554752349854, 'learning_rate': 0.0003766490765171504, 'epoch': 5.0}
{'loss': 0.2677, 'grad_norm': 2.401221990585327, 'learning_rate': 0.00037660196004523185, 'epoch': 5.01}
{'loss': 0.172, 'grad_norm': 2.8264811038970947, 'learning_rate': 0.0003765548435733132, 'epoch': 5.01}
{'loss': 0.1993, 'grad_norm': 1.707931637763977, 'learning_rate': 0.00037650772710139466, 'epoch': 5.01}
{'loss': 0.2833, 'grad_norm': 2.553581953048706, 'learning_rate': 0.0003764606106294761, 'epoch': 5.01}
{'loss': 0.2012, 'grad_norm': 2.246614933013916, 'learning_rate': 0.0003764134941575575, 'epoch': 5.01}
{'loss': 0.2553, 'grad_no

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.19946938753128052, 'eval_runtime': 437.5162, 'eval_samples_per_second': 48.743, 'eval_steps_per_second': 3.047, 'epoch': 6.0}
{'loss': 0.2812, 'grad_norm': 1.0849356651306152, 'learning_rate': 0.00035163022992838294, 'epoch': 6.0}
{'loss': 0.1683, 'grad_norm': 2.578824281692505, 'learning_rate': 0.00035158311345646437, 'epoch': 6.0}
{'loss': 0.1242, 'grad_norm': 0.9651637673377991, 'learning_rate': 0.0003515359969845458, 'epoch': 6.0}
{'loss': 0.2292, 'grad_norm': 2.796802043914795, 'learning_rate': 0.0003514888805126272, 'epoch': 6.01}
{'loss': 0.1863, 'grad_norm': 6.363150596618652, 'learning_rate': 0.00035144176404070866, 'epoch': 6.01}
{'loss': 0.2066, 'grad_norm': 0.23292522132396698, 'learning_rate': 0.0003513946475687901, 'epoch': 6.01}
{'loss': 0.171, 'grad_norm': 0.5104144811630249, 'learning_rate': 0.00035134753109687146, 'epoch': 6.01}
{'loss': 0.1524, 'grad_norm': 2.507721185684204, 'learning_rate': 0.0003513004146249529, 'epoch': 6.01}
{'loss': 0.146, 'grad

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.19507671892642975, 'eval_runtime': 435.1696, 'eval_samples_per_second': 49.006, 'eval_steps_per_second': 3.063, 'epoch': 7.0}
{'loss': 0.2182, 'grad_norm': 1.975480556488037, 'learning_rate': 0.00032651715039577837, 'epoch': 7.0}
{'loss': 0.1975, 'grad_norm': 1.8051098585128784, 'learning_rate': 0.00032647003392385974, 'epoch': 7.0}
{'loss': 0.1218, 'grad_norm': 2.133974075317383, 'learning_rate': 0.00032642291745194117, 'epoch': 7.0}
{'loss': 0.2187, 'grad_norm': 1.9621402025222778, 'learning_rate': 0.0003263758009800226, 'epoch': 7.01}
{'loss': 0.1547, 'grad_norm': 1.9534105062484741, 'learning_rate': 0.00032632868450810403, 'epoch': 7.01}
{'loss': 0.2277, 'grad_norm': 3.376445770263672, 'learning_rate': 0.00032628156803618546, 'epoch': 7.01}
{'loss': 0.2298, 'grad_norm': 2.035693407058716, 'learning_rate': 0.0003262344515642669, 'epoch': 7.01}
{'loss': 0.1414, 'grad_norm': 1.978919267654419, 'learning_rate': 0.00032618733509234826, 'epoch': 7.01}
{'loss': 0.2235, 'gr

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.2043159157037735, 'eval_runtime': 437.8023, 'eval_samples_per_second': 48.711, 'eval_steps_per_second': 3.045, 'epoch': 8.0}
{'loss': 0.214, 'grad_norm': 1.267340898513794, 'learning_rate': 0.0003014040708631738, 'epoch': 8.0}
{'loss': 0.1928, 'grad_norm': 0.8862493634223938, 'learning_rate': 0.00030135695439125523, 'epoch': 8.0}
{'loss': 0.2311, 'grad_norm': 1.117112159729004, 'learning_rate': 0.0003013098379193366, 'epoch': 8.0}
{'loss': 0.1234, 'grad_norm': 3.5795645713806152, 'learning_rate': 0.00030126272144741803, 'epoch': 8.01}
{'loss': 0.1218, 'grad_norm': 0.9309770464897156, 'learning_rate': 0.00030121560497549946, 'epoch': 8.01}
{'loss': 0.2052, 'grad_norm': 3.197632074356079, 'learning_rate': 0.00030116848850358084, 'epoch': 8.01}
{'loss': 0.2519, 'grad_norm': 2.8528902530670166, 'learning_rate': 0.00030112137203166226, 'epoch': 8.01}
{'loss': 0.1775, 'grad_norm': 1.6560615301132202, 'learning_rate': 0.0003010742555597437, 'epoch': 8.01}
{'loss': 0.105, 'grad

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.19159966707229614, 'eval_runtime': 430.8233, 'eval_samples_per_second': 49.501, 'eval_steps_per_second': 3.094, 'epoch': 9.0}
{'loss': 0.1612, 'grad_norm': 1.7702792882919312, 'learning_rate': 0.0002762909913305692, 'epoch': 9.0}
{'loss': 0.1661, 'grad_norm': 2.7012267112731934, 'learning_rate': 0.0002762438748586506, 'epoch': 9.0}
{'loss': 0.2057, 'grad_norm': 2.635807752609253, 'learning_rate': 0.00027619675838673203, 'epoch': 9.0}
{'loss': 0.1817, 'grad_norm': 2.4190733432769775, 'learning_rate': 0.0002761496419148134, 'epoch': 9.01}
{'loss': 0.235, 'grad_norm': 3.081094741821289, 'learning_rate': 0.00027610252544289484, 'epoch': 9.01}
{'loss': 0.2145, 'grad_norm': 2.957659959793091, 'learning_rate': 0.00027605540897097626, 'epoch': 9.01}
{'loss': 0.1535, 'grad_norm': 1.967315673828125, 'learning_rate': 0.0002760082924990577, 'epoch': 9.01}
{'loss': 0.2026, 'grad_norm': 2.2582013607025146, 'learning_rate': 0.0002759611760271391, 'epoch': 9.01}
{'loss': 0.19, 'grad_no

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.2229248434305191, 'eval_runtime': 435.3534, 'eval_samples_per_second': 48.985, 'eval_steps_per_second': 3.062, 'epoch': 10.0}
{'loss': 0.269, 'grad_norm': 1.977571964263916, 'learning_rate': 0.000251130795326046, 'epoch': 10.0}
{'loss': 0.2204, 'grad_norm': 1.944928526878357, 'learning_rate': 0.0002510836788541274, 'epoch': 10.0}
{'loss': 0.2456, 'grad_norm': 1.324451208114624, 'learning_rate': 0.00025103656238220884, 'epoch': 10.01}
{'loss': 0.2057, 'grad_norm': 1.5387680530548096, 'learning_rate': 0.0002509894459102902, 'epoch': 10.01}
{'loss': 0.2515, 'grad_norm': 1.1021411418914795, 'learning_rate': 0.00025094232943837164, 'epoch': 10.01}
{'loss': 0.341, 'grad_norm': 1.6061040163040161, 'learning_rate': 0.00025089521296645307, 'epoch': 10.01}
{'loss': 0.1422, 'grad_norm': 4.057720184326172, 'learning_rate': 0.0002508480964945345, 'epoch': 10.01}
{'loss': 0.1822, 'grad_norm': 0.8171206116676331, 'learning_rate': 0.00025080098002261593, 'epoch': 10.02}
{'loss': 0.1955

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.20420454442501068, 'eval_runtime': 428.7956, 'eval_samples_per_second': 49.735, 'eval_steps_per_second': 3.109, 'epoch': 11.0}
{'loss': 0.2274, 'grad_norm': 2.22005033493042, 'learning_rate': 0.00022601771579344138, 'epoch': 11.0}
{'loss': 0.1937, 'grad_norm': 2.863781452178955, 'learning_rate': 0.0002259705993215228, 'epoch': 11.0}
{'loss': 0.1767, 'grad_norm': 2.620215892791748, 'learning_rate': 0.00022592348284960424, 'epoch': 11.01}
{'loss': 0.216, 'grad_norm': 2.576737642288208, 'learning_rate': 0.00022587636637768564, 'epoch': 11.01}
{'loss': 0.1822, 'grad_norm': 2.1443629264831543, 'learning_rate': 0.00022582924990576707, 'epoch': 11.01}
{'loss': 0.2005, 'grad_norm': 1.1662276983261108, 'learning_rate': 0.0002257821334338485, 'epoch': 11.01}
{'loss': 0.1947, 'grad_norm': 2.73651123046875, 'learning_rate': 0.00022573501696192987, 'epoch': 11.01}
{'loss': 0.1766, 'grad_norm': 2.3726863861083984, 'learning_rate': 0.0002256879004900113, 'epoch': 11.01}
{'loss': 0.214

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.1932877153158188, 'eval_runtime': 425.0744, 'eval_samples_per_second': 50.17, 'eval_steps_per_second': 3.136, 'epoch': 12.0}
{'loss': 0.1495, 'grad_norm': 1.338089942932129, 'learning_rate': 0.0002009046362608368, 'epoch': 12.0}
{'loss': 0.1539, 'grad_norm': 0.41786324977874756, 'learning_rate': 0.00020085751978891818, 'epoch': 12.0}
{'loss': 0.2155, 'grad_norm': 5.9369049072265625, 'learning_rate': 0.0002008104033169996, 'epoch': 12.01}
{'loss': 0.3097, 'grad_norm': 2.3520407676696777, 'learning_rate': 0.00020076328684508104, 'epoch': 12.01}
{'loss': 0.2537, 'grad_norm': 1.7520674467086792, 'learning_rate': 0.00020071617037316244, 'epoch': 12.01}
{'loss': 0.19, 'grad_norm': 0.7273322939872742, 'learning_rate': 0.00020066905390124387, 'epoch': 12.01}
{'loss': 0.2633, 'grad_norm': 2.685122489929199, 'learning_rate': 0.0002006219374293253, 'epoch': 12.01}
{'loss': 0.1743, 'grad_norm': 3.1045141220092773, 'learning_rate': 0.0002005748209574067, 'epoch': 12.01}
{'loss': 0.1

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.21303722262382507, 'eval_runtime': 434.1036, 'eval_samples_per_second': 49.127, 'eval_steps_per_second': 3.071, 'epoch': 13.0}
{'loss': 0.307, 'grad_norm': 2.6507747173309326, 'learning_rate': 0.00017579155672823218, 'epoch': 13.0}
{'loss': 0.1805, 'grad_norm': 3.1316592693328857, 'learning_rate': 0.0001757444402563136, 'epoch': 13.0}
{'loss': 0.1201, 'grad_norm': 1.0822663307189941, 'learning_rate': 0.00017569732378439504, 'epoch': 13.01}
{'loss': 0.227, 'grad_norm': 2.850128650665283, 'learning_rate': 0.00017565020731247644, 'epoch': 13.01}
{'loss': 0.1414, 'grad_norm': 0.7071648836135864, 'learning_rate': 0.00017560309084055787, 'epoch': 13.01}
{'loss': 0.2405, 'grad_norm': 1.6145433187484741, 'learning_rate': 0.0001755559743686393, 'epoch': 13.01}
{'loss': 0.2453, 'grad_norm': 2.706183671951294, 'learning_rate': 0.0001755088578967207, 'epoch': 13.01}
{'loss': 0.1687, 'grad_norm': 2.029346466064453, 'learning_rate': 0.0001754617414248021, 'epoch': 13.01}
{'loss': 0.2

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.18840676546096802, 'eval_runtime': 433.531, 'eval_samples_per_second': 49.191, 'eval_steps_per_second': 3.075, 'epoch': 14.0}
{'loss': 0.2042, 'grad_norm': 1.8022170066833496, 'learning_rate': 0.00015067847719562761, 'epoch': 14.0}
{'loss': 0.3661, 'grad_norm': 2.138007640838623, 'learning_rate': 0.00015063136072370902, 'epoch': 14.0}
{'loss': 0.1241, 'grad_norm': 2.427042245864868, 'learning_rate': 0.00015058424425179042, 'epoch': 14.0}
{'loss': 0.2123, 'grad_norm': 1.7100346088409424, 'learning_rate': 0.00015053712777987185, 'epoch': 14.01}
{'loss': 0.2373, 'grad_norm': 4.0766191482543945, 'learning_rate': 0.00015049001130795325, 'epoch': 14.01}
{'loss': 0.1777, 'grad_norm': 6.028755187988281, 'learning_rate': 0.00015044289483603468, 'epoch': 14.01}
{'loss': 0.1288, 'grad_norm': 1.3310757875442505, 'learning_rate': 0.0001503957783641161, 'epoch': 14.01}
{'loss': 0.2209, 'grad_norm': 0.8886141180992126, 'learning_rate': 0.0001503486618921975, 'epoch': 14.01}
{'loss': 0

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.18501071631908417, 'eval_runtime': 430.6913, 'eval_samples_per_second': 49.516, 'eval_steps_per_second': 3.095, 'epoch': 15.0}
{'loss': 0.199, 'grad_norm': 1.9318748712539673, 'learning_rate': 0.000125565397663023, 'epoch': 15.0}
{'loss': 0.1015, 'grad_norm': 0.636957585811615, 'learning_rate': 0.00012551828119110442, 'epoch': 15.0}
{'loss': 0.2171, 'grad_norm': 3.4105076789855957, 'learning_rate': 0.00012547116471918582, 'epoch': 15.0}
{'loss': 0.168, 'grad_norm': 1.492429256439209, 'learning_rate': 0.00012542404824726725, 'epoch': 15.01}
{'loss': 0.1883, 'grad_norm': 4.113306522369385, 'learning_rate': 0.00012537693177534868, 'epoch': 15.01}
{'loss': 0.194, 'grad_norm': 1.2219220399856567, 'learning_rate': 0.00012532981530343008, 'epoch': 15.01}
{'loss': 0.242, 'grad_norm': 4.023467063903809, 'learning_rate': 0.0001252826988315115, 'epoch': 15.01}
{'loss': 0.2788, 'grad_norm': 2.402604818344116, 'learning_rate': 0.00012523558235959294, 'epoch': 15.01}
{'loss': 0.1017,

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.18850310146808624, 'eval_runtime': 440.543, 'eval_samples_per_second': 48.408, 'eval_steps_per_second': 3.026, 'epoch': 16.0}
{'loss': 0.23, 'grad_norm': 2.2958006858825684, 'learning_rate': 0.0001004523181304184, 'epoch': 16.0}
{'loss': 0.1673, 'grad_norm': 0.8034796118736267, 'learning_rate': 0.0001004052016584998, 'epoch': 16.0}
{'loss': 0.1868, 'grad_norm': 2.9300365447998047, 'learning_rate': 0.00010035808518658122, 'epoch': 16.0}
{'loss': 0.1319, 'grad_norm': 1.4313580989837646, 'learning_rate': 0.00010031096871466265, 'epoch': 16.01}
{'loss': 0.1493, 'grad_norm': 3.272050380706787, 'learning_rate': 0.00010026385224274407, 'epoch': 16.01}
{'loss': 0.2203, 'grad_norm': 2.6893866062164307, 'learning_rate': 0.00010021673577082548, 'epoch': 16.01}
{'loss': 0.239, 'grad_norm': 1.509513020515442, 'learning_rate': 0.0001001696192989069, 'epoch': 16.01}
{'loss': 0.1853, 'grad_norm': 1.4624990224838257, 'learning_rate': 0.00010012250282698831, 'epoch': 16.01}
{'loss': 0.11

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.20024529099464417, 'eval_runtime': 432.6554, 'eval_samples_per_second': 49.291, 'eval_steps_per_second': 3.081, 'epoch': 17.0}
{'loss': 0.1482, 'grad_norm': 2.5291333198547363, 'learning_rate': 7.533923859781381e-05, 'epoch': 17.0}
{'loss': 0.2349, 'grad_norm': 1.9917300939559937, 'learning_rate': 7.529212212589521e-05, 'epoch': 17.0}
{'loss': 0.2079, 'grad_norm': 2.186173439025879, 'learning_rate': 7.524500565397662e-05, 'epoch': 17.0}
{'loss': 0.1791, 'grad_norm': 2.349876642227173, 'learning_rate': 7.519788918205805e-05, 'epoch': 17.01}
{'loss': 0.2803, 'grad_norm': 3.8230843544006348, 'learning_rate': 7.515077271013947e-05, 'epoch': 17.01}
{'loss': 0.1777, 'grad_norm': 2.353231191635132, 'learning_rate': 7.510365623822088e-05, 'epoch': 17.01}
{'loss': 0.1393, 'grad_norm': 0.9801977276802063, 'learning_rate': 7.50565397663023e-05, 'epoch': 17.01}
{'loss': 0.1822, 'grad_norm': 0.7753633260726929, 'learning_rate': 7.500942329438372e-05, 'epoch': 17.01}
{'loss': 0.099, 

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.1910305619239807, 'eval_runtime': 440.2374, 'eval_samples_per_second': 48.442, 'eval_steps_per_second': 3.028, 'epoch': 18.0}
{'loss': 0.2265, 'grad_norm': 1.2651890516281128, 'learning_rate': 5.02261590652092e-05, 'epoch': 18.0}
{'loss': 0.1835, 'grad_norm': 0.502763032913208, 'learning_rate': 5.017904259329061e-05, 'epoch': 18.0}
{'loss': 0.1109, 'grad_norm': 0.6480960249900818, 'learning_rate': 5.0131926121372033e-05, 'epoch': 18.0}
{'loss': 0.1882, 'grad_norm': 3.0027406215667725, 'learning_rate': 5.008480964945345e-05, 'epoch': 18.01}
{'loss': 0.125, 'grad_norm': 1.0325740575790405, 'learning_rate': 5.0037693177534865e-05, 'epoch': 18.01}
{'loss': 0.1419, 'grad_norm': 0.5582864880561829, 'learning_rate': 4.999057670561629e-05, 'epoch': 18.01}
{'loss': 0.1894, 'grad_norm': 1.6227360963821411, 'learning_rate': 4.99434602336977e-05, 'epoch': 18.01}
{'loss': 0.1661, 'grad_norm': 0.46534937620162964, 'learning_rate': 4.989634376177912e-05, 'epoch': 18.01}
{'loss': 0.181

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.19041870534420013, 'eval_runtime': 430.7497, 'eval_samples_per_second': 49.509, 'eval_steps_per_second': 3.095, 'epoch': 19.0}
{'loss': 0.2598, 'grad_norm': 4.472591400146484, 'learning_rate': 2.51130795326046e-05, 'epoch': 19.0}
{'loss': 0.3318, 'grad_norm': 2.2430083751678467, 'learning_rate': 2.5065963060686017e-05, 'epoch': 19.0}
{'loss': 0.2015, 'grad_norm': 3.2853078842163086, 'learning_rate': 2.5018846588767432e-05, 'epoch': 19.0}
{'loss': 0.1689, 'grad_norm': 0.855925977230072, 'learning_rate': 2.497173011684885e-05, 'epoch': 19.01}
{'loss': 0.2555, 'grad_norm': 1.7691245079040527, 'learning_rate': 2.4924613644930267e-05, 'epoch': 19.01}
{'loss': 0.1906, 'grad_norm': 2.2104618549346924, 'learning_rate': 2.4877497173011682e-05, 'epoch': 19.01}
{'loss': 0.0991, 'grad_norm': 1.0162296295166016, 'learning_rate': 2.4830380701093105e-05, 'epoch': 19.01}
{'loss': 0.2288, 'grad_norm': 2.3039448261260986, 'learning_rate': 2.478326422917452e-05, 'epoch': 19.01}
{'loss': 0

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.1884542554616928, 'eval_runtime': 430.8975, 'eval_samples_per_second': 49.492, 'eval_steps_per_second': 3.094, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.La

{'train_runtime': 52906.5228, 'train_samples_per_second': 32.243, 'train_steps_per_second': 2.015, 'train_loss': 0.19677243474622846, 'epoch': 20.0}


  0%|          | 0/1333 [00:00<?, ?it/s]

Saved metrics for binary_classification to results/cyberbert
CyberBERT: Acc=0.927, Balanced Acc=0.769, F1=0.923, AUROC=0.938
              precision    recall  f1-score   support

      Non-CB       0.95      0.97      0.96     18927
          CB       0.73      0.56      0.64      2399

    accuracy                           0.93     21326
   macro avg       0.84      0.77      0.80     21326
weighted avg       0.92      0.93      0.92     21326



In [20]:
print("\nSEVERITY CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):")
print("=" * 60)

cb_mask = target == 1
severity_names = ['mild', 'moderate', 'severe']

train_texts_sev = text_data[train_idx & cb_mask]
test_texts_sev = text_data[test_idx & cb_mask]
train_labels_sev = severity_labels[train_idx & cb_mask] - 1  # Convert 1,2,3 to 0,1,2
test_labels_sev = severity_labels[test_idx & cb_mask] - 1    # Convert 1,2,3 to 0,1,2

# Filter out any remaining 'none' labels (originally 0, now -1)
valid_train_mask = train_labels_sev >= 0
valid_test_mask = test_labels_sev >= 0

train_texts_sev = train_texts_sev[valid_train_mask]
test_texts_sev = test_texts_sev[valid_test_mask]
train_labels_sev = train_labels_sev[valid_train_mask]
test_labels_sev = test_labels_sev[valid_test_mask]

if len(train_labels_sev) > 0 and len(test_labels_sev) > 0:
    pred_s, pred_probs_s = train_transformer_model(
        'unitary/toxic-bert',
        BertTokenizer,
        BertForSequenceClassification,
        train_texts_sev,
        train_labels_sev,
        test_texts_sev,
        test_labels_sev,
        num_labels=3,
        task_name='severity_classification'
    )
    
    metrics_s = save_metrics(
        test_labels_sev, pred_s, pred_probs_s,
        severity_names,
        'severity_classification'
    )
    
    print(f"CyberBERT: Acc={metrics_s['accuracy']:.3f}, Balanced Acc={metrics_s['balanced_accuracy']:.3f}, F1={metrics_s['f1_weighted']:.3f}")
    if metrics_s['auroc_weighted']:
        print(f"AUROC Weighted={metrics_s['auroc_weighted']:.3f}")
    print(classification_report(test_labels_sev, pred_s, target_names=severity_names, zero_division=0))
else:
    print("CyberBERT: No data available")


SEVERITY CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unitary/toxic-bert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/11100 [00:00<?, ?it/s]

{'loss': 1.0559, 'grad_norm': 7.98483419418335, 'learning_rate': 1e-05, 'epoch': 0.02}
{'loss': 0.9665, 'grad_norm': 6.100740432739258, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.8951, 'grad_norm': 4.961310386657715, 'learning_rate': 3e-05, 'epoch': 0.05}
{'loss': 0.8193, 'grad_norm': 4.548985004425049, 'learning_rate': 4e-05, 'epoch': 0.07}
{'loss': 0.7661, 'grad_norm': 4.029865741729736, 'learning_rate': 5e-05, 'epoch': 0.09}
{'loss': 0.675, 'grad_norm': 5.8566203117370605, 'learning_rate': 6e-05, 'epoch': 0.11}
{'loss': 0.7158, 'grad_norm': 4.430349349975586, 'learning_rate': 7.000000000000001e-05, 'epoch': 0.13}
{'loss': 0.6678, 'grad_norm': 3.6121349334716797, 'learning_rate': 8e-05, 'epoch': 0.14}
{'loss': 0.6869, 'grad_norm': 2.0760090351104736, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.16}
{'loss': 0.7114, 'grad_norm': 5.729135990142822, 'learning_rate': 0.0001, 'epoch': 0.18}
{'loss': 0.7019, 'grad_norm': 2.2787322998046875, 'learning_rate': 0.00011, 'epoch': 0

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.7317465543746948, 'eval_runtime': 48.5801, 'eval_samples_per_second': 49.382, 'eval_steps_per_second': 3.088, 'epoch': 1.0}
{'loss': 0.5584, 'grad_norm': 2.625058174133301, 'learning_rate': 0.0004971698113207547, 'epoch': 1.01}
{'loss': 0.7332, 'grad_norm': 4.431631088256836, 'learning_rate': 0.0004966981132075471, 'epoch': 1.03}
{'loss': 0.6485, 'grad_norm': 3.60115647315979, 'learning_rate': 0.0004962264150943397, 'epoch': 1.05}
{'loss': 0.6714, 'grad_norm': 3.5623698234558105, 'learning_rate': 0.0004957547169811321, 'epoch': 1.06}
{'loss': 0.6599, 'grad_norm': 3.090235948562622, 'learning_rate': 0.0004952830188679246, 'epoch': 1.08}
{'loss': 0.5917, 'grad_norm': 9.140852928161621, 'learning_rate': 0.000494811320754717, 'epoch': 1.1}
{'loss': 0.7101, 'grad_norm': 6.702017307281494, 'learning_rate': 0.0004943396226415094, 'epoch': 1.12}
{'loss': 0.6903, 'grad_norm': 2.718744993209839, 'learning_rate': 0.0004938679245283018, 'epoch': 1.14}
{'loss': 0.7072, 'grad_norm': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6800710558891296, 'eval_runtime': 48.4597, 'eval_samples_per_second': 49.505, 'eval_steps_per_second': 3.095, 'epoch': 2.0}
{'loss': 0.6554, 'grad_norm': 3.486132860183716, 'learning_rate': 0.0004707547169811321, 'epoch': 2.02}
{'loss': 0.7712, 'grad_norm': 5.652403831481934, 'learning_rate': 0.0004702830188679245, 'epoch': 2.04}
{'loss': 0.6318, 'grad_norm': 6.360678195953369, 'learning_rate': 0.000469811320754717, 'epoch': 2.05}
{'loss': 0.6166, 'grad_norm': 2.763584852218628, 'learning_rate': 0.0004693396226415094, 'epoch': 2.07}
{'loss': 0.745, 'grad_norm': 2.6932871341705322, 'learning_rate': 0.0004688679245283019, 'epoch': 2.09}
{'loss': 0.6585, 'grad_norm': 3.712209463119507, 'learning_rate': 0.00046839622641509435, 'epoch': 2.11}
{'loss': 0.6717, 'grad_norm': 4.630842208862305, 'learning_rate': 0.0004679245283018868, 'epoch': 2.13}
{'loss': 0.663, 'grad_norm': 2.3926849365234375, 'learning_rate': 0.00046745283018867923, 'epoch': 2.14}
{'loss': 0.5904, 'grad_norm

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6749964952468872, 'eval_runtime': 48.0664, 'eval_samples_per_second': 49.91, 'eval_steps_per_second': 3.121, 'epoch': 3.0}
{'loss': 0.7391, 'grad_norm': 4.751293182373047, 'learning_rate': 0.000444811320754717, 'epoch': 3.01}
{'loss': 0.6642, 'grad_norm': 5.624339580535889, 'learning_rate': 0.00044433962264150945, 'epoch': 3.03}
{'loss': 0.5865, 'grad_norm': 2.086132526397705, 'learning_rate': 0.00044386792452830187, 'epoch': 3.05}
{'loss': 0.6576, 'grad_norm': 2.9908628463745117, 'learning_rate': 0.00044339622641509434, 'epoch': 3.06}
{'loss': 0.5744, 'grad_norm': 2.0425803661346436, 'learning_rate': 0.0004429245283018868, 'epoch': 3.08}
{'loss': 0.6264, 'grad_norm': 3.087027072906494, 'learning_rate': 0.0004424528301886793, 'epoch': 3.1}
{'loss': 0.7438, 'grad_norm': 5.119567394256592, 'learning_rate': 0.0004419811320754717, 'epoch': 3.12}
{'loss': 0.6664, 'grad_norm': 2.734118700027466, 'learning_rate': 0.00044150943396226416, 'epoch': 3.14}
{'loss': 0.7193, 'grad_no

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6934719085693359, 'eval_runtime': 49.2593, 'eval_samples_per_second': 48.701, 'eval_steps_per_second': 3.045, 'epoch': 4.0}
{'loss': 0.7334, 'grad_norm': 3.9527480602264404, 'learning_rate': 0.0004183962264150943, 'epoch': 4.02}
{'loss': 0.6116, 'grad_norm': 5.951802730560303, 'learning_rate': 0.0004179245283018868, 'epoch': 4.04}
{'loss': 0.6752, 'grad_norm': 3.0396475791931152, 'learning_rate': 0.00041745283018867926, 'epoch': 4.05}
{'loss': 0.6653, 'grad_norm': 5.670693874359131, 'learning_rate': 0.00041698113207547173, 'epoch': 4.07}
{'loss': 0.7061, 'grad_norm': 2.0427420139312744, 'learning_rate': 0.00041650943396226415, 'epoch': 4.09}
{'loss': 0.6078, 'grad_norm': 6.839905261993408, 'learning_rate': 0.0004160377358490566, 'epoch': 4.11}
{'loss': 0.5871, 'grad_norm': 4.609366416931152, 'learning_rate': 0.00041556603773584903, 'epoch': 4.13}
{'loss': 0.6609, 'grad_norm': 6.865567207336426, 'learning_rate': 0.00041509433962264155, 'epoch': 4.14}
{'loss': 0.6233, 'gr

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6820457577705383, 'eval_runtime': 48.4098, 'eval_samples_per_second': 49.556, 'eval_steps_per_second': 3.099, 'epoch': 5.0}
{'loss': 0.5942, 'grad_norm': 2.611170768737793, 'learning_rate': 0.00039245283018867925, 'epoch': 5.01}
{'loss': 0.6344, 'grad_norm': 8.332110404968262, 'learning_rate': 0.0003919811320754717, 'epoch': 5.03}
{'loss': 0.7395, 'grad_norm': 2.927499294281006, 'learning_rate': 0.0003915094339622642, 'epoch': 5.05}
{'loss': 0.6694, 'grad_norm': 5.2011518478393555, 'learning_rate': 0.0003910377358490566, 'epoch': 5.06}
{'loss': 0.7042, 'grad_norm': 3.087822914123535, 'learning_rate': 0.0003905660377358491, 'epoch': 5.08}
{'loss': 0.7223, 'grad_norm': 3.199282169342041, 'learning_rate': 0.0003900943396226415, 'epoch': 5.1}
{'loss': 0.6159, 'grad_norm': 2.353163242340088, 'learning_rate': 0.000389622641509434, 'epoch': 5.12}
{'loss': 0.64, 'grad_norm': 6.934868812561035, 'learning_rate': 0.0003891509433962264, 'epoch': 5.14}
{'loss': 0.5556, 'grad_norm': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.7317083477973938, 'eval_runtime': 48.6309, 'eval_samples_per_second': 49.331, 'eval_steps_per_second': 3.084, 'epoch': 6.0}
{'loss': 0.696, 'grad_norm': 3.94980788230896, 'learning_rate': 0.0003660377358490566, 'epoch': 6.02}
{'loss': 0.5746, 'grad_norm': 4.065098285675049, 'learning_rate': 0.00036556603773584906, 'epoch': 6.04}
{'loss': 0.6046, 'grad_norm': 6.596282482147217, 'learning_rate': 0.0003650943396226415, 'epoch': 6.05}
{'loss': 0.6844, 'grad_norm': 2.334182024002075, 'learning_rate': 0.00036462264150943395, 'epoch': 6.07}
{'loss': 0.6722, 'grad_norm': 3.9481191635131836, 'learning_rate': 0.0003641509433962264, 'epoch': 6.09}
{'loss': 0.6472, 'grad_norm': 6.42059326171875, 'learning_rate': 0.0003636792452830189, 'epoch': 6.11}
{'loss': 0.6323, 'grad_norm': 4.855895519256592, 'learning_rate': 0.0003632075471698113, 'epoch': 6.13}
{'loss': 0.5827, 'grad_norm': 4.962952613830566, 'learning_rate': 0.00036273584905660377, 'epoch': 6.14}
{'loss': 0.6612, 'grad_norm

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6733406186103821, 'eval_runtime': 48.082, 'eval_samples_per_second': 49.894, 'eval_steps_per_second': 3.12, 'epoch': 7.0}
{'loss': 0.6199, 'grad_norm': 3.914680004119873, 'learning_rate': 0.0003400943396226415, 'epoch': 7.01}
{'loss': 0.6463, 'grad_norm': 3.2461655139923096, 'learning_rate': 0.00033962264150943393, 'epoch': 7.03}
{'loss': 0.7081, 'grad_norm': 6.015085220336914, 'learning_rate': 0.0003391509433962264, 'epoch': 7.05}
{'loss': 0.5728, 'grad_norm': 3.8962323665618896, 'learning_rate': 0.00033867924528301887, 'epoch': 7.06}
{'loss': 0.5833, 'grad_norm': 1.9766432046890259, 'learning_rate': 0.00033820754716981134, 'epoch': 7.08}
{'loss': 0.7653, 'grad_norm': 4.288798809051514, 'learning_rate': 0.00033773584905660376, 'epoch': 7.1}
{'loss': 0.6132, 'grad_norm': 4.294436454772949, 'learning_rate': 0.0003372641509433962, 'epoch': 7.12}
{'loss': 0.6773, 'grad_norm': 5.590301513671875, 'learning_rate': 0.00033679245283018864, 'epoch': 7.14}
{'loss': 0.6382, 'grad_

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.7047117352485657, 'eval_runtime': 47.9351, 'eval_samples_per_second': 50.047, 'eval_steps_per_second': 3.129, 'epoch': 8.0}
{'loss': 0.6043, 'grad_norm': 3.3080203533172607, 'learning_rate': 0.00031367924528301886, 'epoch': 8.02}
{'loss': 0.5961, 'grad_norm': 2.9467391967773438, 'learning_rate': 0.00031320754716981133, 'epoch': 8.04}
{'loss': 0.6363, 'grad_norm': 3.097740650177002, 'learning_rate': 0.0003127358490566038, 'epoch': 8.05}
{'loss': 0.7154, 'grad_norm': 2.5212435722351074, 'learning_rate': 0.0003122641509433962, 'epoch': 8.07}
{'loss': 0.7239, 'grad_norm': 6.211555480957031, 'learning_rate': 0.0003117924528301887, 'epoch': 8.09}
{'loss': 0.6199, 'grad_norm': 5.410491943359375, 'learning_rate': 0.00031132075471698115, 'epoch': 8.11}
{'loss': 0.6296, 'grad_norm': 1.7997336387634277, 'learning_rate': 0.0003108490566037736, 'epoch': 8.13}
{'loss': 0.6471, 'grad_norm': 5.85835075378418, 'learning_rate': 0.00031037735849056604, 'epoch': 8.14}
{'loss': 0.6891, 'gra

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6804494261741638, 'eval_runtime': 48.0653, 'eval_samples_per_second': 49.911, 'eval_steps_per_second': 3.121, 'epoch': 9.0}
{'loss': 0.6545, 'grad_norm': 3.8708081245422363, 'learning_rate': 0.0002877358490566038, 'epoch': 9.01}
{'loss': 0.6241, 'grad_norm': 7.861223220825195, 'learning_rate': 0.00028726415094339626, 'epoch': 9.03}
{'loss': 0.5765, 'grad_norm': 2.1273601055145264, 'learning_rate': 0.00028679245283018867, 'epoch': 9.05}
{'loss': 0.6849, 'grad_norm': 6.439621448516846, 'learning_rate': 0.00028632075471698114, 'epoch': 9.06}
{'loss': 0.6449, 'grad_norm': 5.062257766723633, 'learning_rate': 0.0002858490566037736, 'epoch': 9.08}
{'loss': 0.6868, 'grad_norm': 2.961735963821411, 'learning_rate': 0.0002853773584905661, 'epoch': 9.1}
{'loss': 0.641, 'grad_norm': 4.2921833992004395, 'learning_rate': 0.0002849056603773585, 'epoch': 9.12}
{'loss': 0.6248, 'grad_norm': 3.7061424255371094, 'learning_rate': 0.00028443396226415096, 'epoch': 9.14}
{'loss': 0.6193, 'grad

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6681577563285828, 'eval_runtime': 48.824, 'eval_samples_per_second': 49.136, 'eval_steps_per_second': 3.072, 'epoch': 10.0}
{'loss': 0.7031, 'grad_norm': 3.281132459640503, 'learning_rate': 0.00026132075471698113, 'epoch': 10.02}
{'loss': 0.5865, 'grad_norm': 4.014396667480469, 'learning_rate': 0.0002608490566037736, 'epoch': 10.04}
{'loss': 0.6343, 'grad_norm': 2.9867594242095947, 'learning_rate': 0.00026037735849056607, 'epoch': 10.05}
{'loss': 0.6423, 'grad_norm': 7.931370735168457, 'learning_rate': 0.00025990566037735854, 'epoch': 10.07}
{'loss': 0.6206, 'grad_norm': 4.9191484451293945, 'learning_rate': 0.00025943396226415095, 'epoch': 10.09}
{'loss': 0.6388, 'grad_norm': 10.369401931762695, 'learning_rate': 0.0002589622641509434, 'epoch': 10.11}
{'loss': 0.6282, 'grad_norm': 1.7745096683502197, 'learning_rate': 0.00025849056603773583, 'epoch': 10.13}
{'loss': 0.6896, 'grad_norm': 5.971843719482422, 'learning_rate': 0.00025801886792452836, 'epoch': 10.14}
{'loss': 0

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6827661991119385, 'eval_runtime': 47.6831, 'eval_samples_per_second': 50.311, 'eval_steps_per_second': 3.146, 'epoch': 11.0}
{'loss': 0.5961, 'grad_norm': 5.119921684265137, 'learning_rate': 0.00023537735849056606, 'epoch': 11.01}
{'loss': 0.6584, 'grad_norm': 3.8275232315063477, 'learning_rate': 0.0002349056603773585, 'epoch': 11.03}
{'loss': 0.5473, 'grad_norm': 5.016979694366455, 'learning_rate': 0.00023443396226415094, 'epoch': 11.05}
{'loss': 0.6358, 'grad_norm': 4.977853775024414, 'learning_rate': 0.0002339622641509434, 'epoch': 11.06}
{'loss': 0.5921, 'grad_norm': 2.9061813354492188, 'learning_rate': 0.00023349056603773585, 'epoch': 11.08}
{'loss': 0.6105, 'grad_norm': 4.830888748168945, 'learning_rate': 0.00023301886792452832, 'epoch': 11.1}
{'loss': 0.5493, 'grad_norm': 2.955144166946411, 'learning_rate': 0.00023254716981132076, 'epoch': 11.12}
{'loss': 0.5955, 'grad_norm': 1.7624460458755493, 'learning_rate': 0.0002320754716981132, 'epoch': 11.14}
{'loss': 0.6

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6648695468902588, 'eval_runtime': 48.4112, 'eval_samples_per_second': 49.555, 'eval_steps_per_second': 3.098, 'epoch': 12.0}
{'loss': 0.6706, 'grad_norm': 5.1141533851623535, 'learning_rate': 0.0002089622641509434, 'epoch': 12.02}
{'loss': 0.7398, 'grad_norm': 3.667295455932617, 'learning_rate': 0.00020849056603773587, 'epoch': 12.04}
{'loss': 0.6076, 'grad_norm': 3.0305678844451904, 'learning_rate': 0.0002080188679245283, 'epoch': 12.05}
{'loss': 0.6703, 'grad_norm': 2.811838388442993, 'learning_rate': 0.00020754716981132078, 'epoch': 12.07}
{'loss': 0.5845, 'grad_norm': 6.3901143074035645, 'learning_rate': 0.00020707547169811322, 'epoch': 12.09}
{'loss': 0.6062, 'grad_norm': 2.4308924674987793, 'learning_rate': 0.0002066037735849057, 'epoch': 12.11}
{'loss': 0.5976, 'grad_norm': 2.521308422088623, 'learning_rate': 0.00020613207547169813, 'epoch': 12.13}
{'loss': 0.5986, 'grad_norm': 4.3686299324035645, 'learning_rate': 0.00020566037735849057, 'epoch': 12.14}
{'loss': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6616606116294861, 'eval_runtime': 47.9026, 'eval_samples_per_second': 50.081, 'eval_steps_per_second': 3.131, 'epoch': 13.0}
{'loss': 0.5677, 'grad_norm': 2.0805013179779053, 'learning_rate': 0.0001830188679245283, 'epoch': 13.01}
{'loss': 0.6305, 'grad_norm': 2.226428985595703, 'learning_rate': 0.00018254716981132074, 'epoch': 13.03}
{'loss': 0.6636, 'grad_norm': 7.602272033691406, 'learning_rate': 0.0001820754716981132, 'epoch': 13.05}
{'loss': 0.6175, 'grad_norm': 4.3136444091796875, 'learning_rate': 0.00018160377358490565, 'epoch': 13.06}
{'loss': 0.6888, 'grad_norm': 3.0252249240875244, 'learning_rate': 0.0001811320754716981, 'epoch': 13.08}
{'loss': 0.5114, 'grad_norm': 2.097714424133301, 'learning_rate': 0.00018066037735849056, 'epoch': 13.1}
{'loss': 0.5367, 'grad_norm': 2.6926562786102295, 'learning_rate': 0.000180188679245283, 'epoch': 13.12}
{'loss': 0.6111, 'grad_norm': 2.479520559310913, 'learning_rate': 0.00017971698113207547, 'epoch': 13.14}
{'loss': 0.55

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6670573949813843, 'eval_runtime': 48.5333, 'eval_samples_per_second': 49.43, 'eval_steps_per_second': 3.091, 'epoch': 14.0}
{'loss': 0.6307, 'grad_norm': 2.375635862350464, 'learning_rate': 0.00015660377358490567, 'epoch': 14.02}
{'loss': 0.6178, 'grad_norm': 3.9679198265075684, 'learning_rate': 0.0001561320754716981, 'epoch': 14.04}
{'loss': 0.6601, 'grad_norm': 5.53093147277832, 'learning_rate': 0.00015566037735849058, 'epoch': 14.05}
{'loss': 0.5928, 'grad_norm': 2.1132500171661377, 'learning_rate': 0.00015518867924528302, 'epoch': 14.07}
{'loss': 0.5916, 'grad_norm': 4.282233715057373, 'learning_rate': 0.00015471698113207546, 'epoch': 14.09}
{'loss': 0.6556, 'grad_norm': 3.2487597465515137, 'learning_rate': 0.00015424528301886793, 'epoch': 14.11}
{'loss': 0.5458, 'grad_norm': 2.8112032413482666, 'learning_rate': 0.00015377358490566037, 'epoch': 14.13}
{'loss': 0.6258, 'grad_norm': 3.9333887100219727, 'learning_rate': 0.00015330188679245284, 'epoch': 14.14}
{'loss': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6721034049987793, 'eval_runtime': 47.8279, 'eval_samples_per_second': 50.159, 'eval_steps_per_second': 3.136, 'epoch': 15.0}
{'loss': 0.6427, 'grad_norm': 3.075535297393799, 'learning_rate': 0.00013066037735849056, 'epoch': 15.01}
{'loss': 0.6428, 'grad_norm': 5.159346580505371, 'learning_rate': 0.00013018867924528303, 'epoch': 15.03}
{'loss': 0.6215, 'grad_norm': 3.722200393676758, 'learning_rate': 0.00012971698113207548, 'epoch': 15.05}
{'loss': 0.5573, 'grad_norm': 3.5145742893218994, 'learning_rate': 0.00012924528301886792, 'epoch': 15.06}
{'loss': 0.5583, 'grad_norm': 5.687236785888672, 'learning_rate': 0.00012877358490566039, 'epoch': 15.08}
{'loss': 0.6374, 'grad_norm': 5.545041561126709, 'learning_rate': 0.00012830188679245283, 'epoch': 15.1}
{'loss': 0.5749, 'grad_norm': 2.474809169769287, 'learning_rate': 0.0001278301886792453, 'epoch': 15.12}
{'loss': 0.5572, 'grad_norm': 2.5556633472442627, 'learning_rate': 0.00012735849056603774, 'epoch': 15.14}
{'loss': 0.

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6672683358192444, 'eval_runtime': 48.1468, 'eval_samples_per_second': 49.827, 'eval_steps_per_second': 3.115, 'epoch': 16.0}
{'loss': 0.6139, 'grad_norm': 3.4103031158447266, 'learning_rate': 0.00010424528301886793, 'epoch': 16.02}
{'loss': 0.5554, 'grad_norm': 3.071549415588379, 'learning_rate': 0.00010377358490566039, 'epoch': 16.04}
{'loss': 0.6815, 'grad_norm': 3.8073537349700928, 'learning_rate': 0.00010330188679245284, 'epoch': 16.05}
{'loss': 0.5455, 'grad_norm': 4.564631938934326, 'learning_rate': 0.00010283018867924529, 'epoch': 16.07}
{'loss': 0.6983, 'grad_norm': 3.838102340698242, 'learning_rate': 0.00010235849056603774, 'epoch': 16.09}
{'loss': 0.5778, 'grad_norm': 2.6132819652557373, 'learning_rate': 0.0001018867924528302, 'epoch': 16.11}
{'loss': 0.6227, 'grad_norm': 3.976233959197998, 'learning_rate': 0.00010141509433962265, 'epoch': 16.13}
{'loss': 0.632, 'grad_norm': 2.2821850776672363, 'learning_rate': 0.00010094339622641511, 'epoch': 16.14}
{'loss': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6624768972396851, 'eval_runtime': 47.7983, 'eval_samples_per_second': 50.19, 'eval_steps_per_second': 3.138, 'epoch': 17.0}
{'loss': 0.5814, 'grad_norm': 2.661224126815796, 'learning_rate': 7.830188679245283e-05, 'epoch': 17.01}
{'loss': 0.584, 'grad_norm': 2.9577934741973877, 'learning_rate': 7.783018867924529e-05, 'epoch': 17.03}
{'loss': 0.5448, 'grad_norm': 4.455996513366699, 'learning_rate': 7.735849056603773e-05, 'epoch': 17.05}
{'loss': 0.5796, 'grad_norm': 1.6103678941726685, 'learning_rate': 7.688679245283019e-05, 'epoch': 17.06}
{'loss': 0.5554, 'grad_norm': 2.8470983505249023, 'learning_rate': 7.641509433962264e-05, 'epoch': 17.08}
{'loss': 0.5875, 'grad_norm': 2.4772820472717285, 'learning_rate': 7.59433962264151e-05, 'epoch': 17.1}
{'loss': 0.6853, 'grad_norm': 5.656338691711426, 'learning_rate': 7.547169811320755e-05, 'epoch': 17.12}
{'loss': 0.6447, 'grad_norm': 3.778926372528076, 'learning_rate': 7.5e-05, 'epoch': 17.14}
{'loss': 0.5865, 'grad_norm': 1.6

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6632013320922852, 'eval_runtime': 48.4309, 'eval_samples_per_second': 49.535, 'eval_steps_per_second': 3.097, 'epoch': 18.0}
{'loss': 0.6484, 'grad_norm': 9.193716049194336, 'learning_rate': 5.1886792452830194e-05, 'epoch': 18.02}
{'loss': 0.6801, 'grad_norm': 3.667271852493286, 'learning_rate': 5.141509433962264e-05, 'epoch': 18.04}
{'loss': 0.5077, 'grad_norm': 1.5505574941635132, 'learning_rate': 5.09433962264151e-05, 'epoch': 18.05}
{'loss': 0.5839, 'grad_norm': 2.489182472229004, 'learning_rate': 5.0471698113207554e-05, 'epoch': 18.07}
{'loss': 0.6127, 'grad_norm': 7.445060729980469, 'learning_rate': 5e-05, 'epoch': 18.09}
{'loss': 0.7014, 'grad_norm': 6.374261379241943, 'learning_rate': 4.952830188679246e-05, 'epoch': 18.11}
{'loss': 0.5328, 'grad_norm': 5.206268787384033, 'learning_rate': 4.9056603773584906e-05, 'epoch': 18.13}
{'loss': 0.5346, 'grad_norm': 2.7038941383361816, 'learning_rate': 4.858490566037736e-05, 'epoch': 18.14}
{'loss': 0.5911, 'grad_norm': 3

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6655173897743225, 'eval_runtime': 48.5798, 'eval_samples_per_second': 49.383, 'eval_steps_per_second': 3.088, 'epoch': 19.0}
{'loss': 0.6335, 'grad_norm': 3.5312161445617676, 'learning_rate': 2.5943396226415097e-05, 'epoch': 19.01}
{'loss': 0.5119, 'grad_norm': 2.6753220558166504, 'learning_rate': 2.547169811320755e-05, 'epoch': 19.03}
{'loss': 0.6289, 'grad_norm': 3.473750114440918, 'learning_rate': 2.5e-05, 'epoch': 19.05}
{'loss': 0.6489, 'grad_norm': 4.867116928100586, 'learning_rate': 2.4528301886792453e-05, 'epoch': 19.06}
{'loss': 0.628, 'grad_norm': 3.2044360637664795, 'learning_rate': 2.405660377358491e-05, 'epoch': 19.08}
{'loss': 0.6691, 'grad_norm': 4.901530742645264, 'learning_rate': 2.358490566037736e-05, 'epoch': 19.1}
{'loss': 0.5194, 'grad_norm': 6.225391387939453, 'learning_rate': 2.311320754716981e-05, 'epoch': 19.12}
{'loss': 0.6129, 'grad_norm': 3.4987285137176514, 'learning_rate': 2.264150943396226e-05, 'epoch': 19.14}
{'loss': 0.6169, 'grad_norm':

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6640679836273193, 'eval_runtime': 48.2959, 'eval_samples_per_second': 49.673, 'eval_steps_per_second': 3.106, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.La

{'train_runtime': 5597.4643, 'train_samples_per_second': 31.682, 'train_steps_per_second': 1.983, 'train_loss': 0.6275923202703665, 'epoch': 20.0}


  0%|          | 0/150 [00:00<?, ?it/s]

Saved metrics for severity_classification to results/cyberbert
CyberBERT: Acc=0.674, Balanced Acc=0.625, F1=0.675
AUROC Weighted=0.771
              precision    recall  f1-score   support

        mild       0.75      0.75      0.75      1399
    moderate       0.57      0.58      0.57       864
      severe       0.58      0.55      0.56       136

    accuracy                           0.67      2399
   macro avg       0.63      0.63      0.63      2399
weighted avg       0.67      0.67      0.67      2399



In [21]:
print("\nROLE CLASSIFICATION (ALL COMMENTS):")
print("=" * 40)

role_names = ['bully', 'bully_assistant', 'aggressive_victim', 'aggressive_defender',
              'passive_bystander', 'non_aggressive_victim', 'non_aggressive_defender']

train_texts_role = text_data[train_idx]
test_texts_role = text_data[test_idx]
train_labels_role = role_labels[train_idx]
test_labels_role = role_labels[test_idx]

pred_r, pred_probs_r = train_transformer_model(
    'unitary/toxic-bert',
    BertTokenizer,
    BertForSequenceClassification,
    train_texts_role,
    train_labels_role,
    test_texts_role,
    test_labels_role,
    num_labels=7,
    task_name='role_classification'
)

metrics_r = save_metrics(
    test_labels_role, pred_r, pred_probs_r,
    role_names,
    'role_classification'
)

print(f"CyberBERT: Acc={metrics_r['accuracy']:.3f}, Balanced Acc={metrics_r['balanced_accuracy']:.3f}, F1={metrics_r['f1_weighted']:.3f}")
if metrics_r['auroc_weighted']:
    print(f"AUROC Weighted={metrics_r['auroc_weighted']:.3f}")
print(classification_report(test_labels_role, pred_r, target_names=role_names, zero_division=0))


ROLE CLASSIFICATION (ALL COMMENTS):


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unitary/toxic-bert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/106620 [00:00<?, ?it/s]

{'loss': 1.6246, 'grad_norm': 15.7592134475708, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 1.4846, 'grad_norm': 13.806464195251465, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 1.214, 'grad_norm': 8.679006576538086, 'learning_rate': 3e-05, 'epoch': 0.01}
{'loss': 0.8746, 'grad_norm': 9.061903953552246, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 0.6804, 'grad_norm': 5.03314208984375, 'learning_rate': 5e-05, 'epoch': 0.01}
{'loss': 0.5577, 'grad_norm': 2.5873780250549316, 'learning_rate': 6e-05, 'epoch': 0.01}
{'loss': 0.4234, 'grad_norm': 4.096367359161377, 'learning_rate': 7.000000000000001e-05, 'epoch': 0.01}
{'loss': 0.3533, 'grad_norm': 1.251061201095581, 'learning_rate': 8e-05, 'epoch': 0.02}
{'loss': 0.3723, 'grad_norm': 2.5483028888702393, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.02}
{'loss': 0.3974, 'grad_norm': 0.6858304142951965, 'learning_rate': 0.0001, 'epoch': 0.02}
{'loss': 0.4349, 'grad_norm': 1.4728114604949951, 'learning_rate': 0.00011, 'epoch': 0.0

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3568231761455536, 'eval_runtime': 430.7098, 'eval_samples_per_second': 49.514, 'eval_steps_per_second': 3.095, 'epoch': 1.0}
{'loss': 0.358, 'grad_norm': 2.3611693382263184, 'learning_rate': 0.00047719562759140596, 'epoch': 1.0}
{'loss': 0.3326, 'grad_norm': 3.6011769771575928, 'learning_rate': 0.0004771485111194874, 'epoch': 1.0}
{'loss': 0.3804, 'grad_norm': 1.6933882236480713, 'learning_rate': 0.0004771013946475688, 'epoch': 1.01}
{'loss': 0.3392, 'grad_norm': 3.128005266189575, 'learning_rate': 0.00047705427817565024, 'epoch': 1.01}
{'loss': 0.3812, 'grad_norm': 2.344358205795288, 'learning_rate': 0.00047700716170373167, 'epoch': 1.01}
{'loss': 0.2564, 'grad_norm': 2.0841519832611084, 'learning_rate': 0.0004769600452318131, 'epoch': 1.01}
{'loss': 0.3318, 'grad_norm': 0.5290514230728149, 'learning_rate': 0.0004769129287598945, 'epoch': 1.01}
{'loss': 0.2107, 'grad_norm': 2.958209753036499, 'learning_rate': 0.00047686581228797585, 'epoch': 1.01}
{'loss': 0.5127, 'gra

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.38877880573272705, 'eval_runtime': 430.7781, 'eval_samples_per_second': 49.506, 'eval_steps_per_second': 3.094, 'epoch': 2.0}
{'loss': 0.3271, 'grad_norm': 2.007652759552002, 'learning_rate': 0.0004520825480588014, 'epoch': 2.0}
{'loss': 0.4427, 'grad_norm': 2.1698098182678223, 'learning_rate': 0.00045203543158688276, 'epoch': 2.0}
{'loss': 0.3123, 'grad_norm': 2.202802896499634, 'learning_rate': 0.0004519883151149642, 'epoch': 2.01}
{'loss': 0.3827, 'grad_norm': 2.121225595474243, 'learning_rate': 0.0004519411986430456, 'epoch': 2.01}
{'loss': 0.435, 'grad_norm': 2.4775586128234863, 'learning_rate': 0.00045189408217112705, 'epoch': 2.01}
{'loss': 0.242, 'grad_norm': 1.7693098783493042, 'learning_rate': 0.0004518469656992085, 'epoch': 2.01}
{'loss': 0.3974, 'grad_norm': 3.8688132762908936, 'learning_rate': 0.0004517998492272899, 'epoch': 2.01}
{'loss': 0.2323, 'grad_norm': 2.2526001930236816, 'learning_rate': 0.0004517527327553713, 'epoch': 2.01}
{'loss': 0.3594, 'grad_

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.35375329852104187, 'eval_runtime': 431.7494, 'eval_samples_per_second': 49.394, 'eval_steps_per_second': 3.087, 'epoch': 3.0}
{'loss': 0.3013, 'grad_norm': 3.4547128677368164, 'learning_rate': 0.00042696946852619676, 'epoch': 3.0}
{'loss': 0.3226, 'grad_norm': 0.9359515309333801, 'learning_rate': 0.0004269223520542782, 'epoch': 3.0}
{'loss': 0.406, 'grad_norm': 1.3343344926834106, 'learning_rate': 0.00042687523558235956, 'epoch': 3.01}
{'loss': 0.2803, 'grad_norm': 5.243098735809326, 'learning_rate': 0.000426828119110441, 'epoch': 3.01}
{'loss': 0.2847, 'grad_norm': 1.5176975727081299, 'learning_rate': 0.0004267810026385224, 'epoch': 3.01}
{'loss': 0.3256, 'grad_norm': 2.1864612102508545, 'learning_rate': 0.00042673388616660385, 'epoch': 3.01}
{'loss': 0.3605, 'grad_norm': 5.873013973236084, 'learning_rate': 0.0004266867696946853, 'epoch': 3.01}
{'loss': 0.4813, 'grad_norm': 3.306839942932129, 'learning_rate': 0.0004266396532227667, 'epoch': 3.01}
{'loss': 0.2613, 'grad

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.39628711342811584, 'eval_runtime': 428.4452, 'eval_samples_per_second': 49.775, 'eval_steps_per_second': 3.111, 'epoch': 4.0}
{'loss': 0.3354, 'grad_norm': 3.016019105911255, 'learning_rate': 0.0004018563889935922, 'epoch': 4.0}
{'loss': 0.3519, 'grad_norm': 0.7833307981491089, 'learning_rate': 0.0004018092725216736, 'epoch': 4.0}
{'loss': 0.3585, 'grad_norm': 1.859632968902588, 'learning_rate': 0.00040176215604975505, 'epoch': 4.0}
{'loss': 0.3077, 'grad_norm': 0.5355777740478516, 'learning_rate': 0.00040171503957783637, 'epoch': 4.01}
{'loss': 0.3068, 'grad_norm': 4.0245208740234375, 'learning_rate': 0.0004016679231059178, 'epoch': 4.01}
{'loss': 0.4572, 'grad_norm': 2.565375328063965, 'learning_rate': 0.0004016208066339992, 'epoch': 4.01}
{'loss': 0.2054, 'grad_norm': 1.4712496995925903, 'learning_rate': 0.00040157369016208066, 'epoch': 4.01}
{'loss': 0.3281, 'grad_norm': 3.2532401084899902, 'learning_rate': 0.0004015265736901621, 'epoch': 4.01}
{'loss': 0.2469, 'gra

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.37324225902557373, 'eval_runtime': 431.1058, 'eval_samples_per_second': 49.468, 'eval_steps_per_second': 3.092, 'epoch': 5.0}
{'loss': 0.32, 'grad_norm': 2.287553310394287, 'learning_rate': 0.00037674330946098756, 'epoch': 5.0}
{'loss': 0.4816, 'grad_norm': 2.6417596340179443, 'learning_rate': 0.000376696192989069, 'epoch': 5.0}
{'loss': 0.4247, 'grad_norm': 3.1371941566467285, 'learning_rate': 0.0003766490765171504, 'epoch': 5.0}
{'loss': 0.3971, 'grad_norm': 2.4077115058898926, 'learning_rate': 0.00037660196004523185, 'epoch': 5.01}
{'loss': 0.4009, 'grad_norm': 4.109123229980469, 'learning_rate': 0.0003765548435733132, 'epoch': 5.01}
{'loss': 0.3176, 'grad_norm': 2.3788211345672607, 'learning_rate': 0.00037650772710139466, 'epoch': 5.01}
{'loss': 0.4018, 'grad_norm': 2.65596079826355, 'learning_rate': 0.0003764606106294761, 'epoch': 5.01}
{'loss': 0.4626, 'grad_norm': 6.0180277824401855, 'learning_rate': 0.0003764134941575575, 'epoch': 5.01}
{'loss': 0.2877, 'grad_no

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.36530742049217224, 'eval_runtime': 431.0211, 'eval_samples_per_second': 49.478, 'eval_steps_per_second': 3.093, 'epoch': 6.0}
{'loss': 0.4023, 'grad_norm': 1.3877595663070679, 'learning_rate': 0.00035163022992838294, 'epoch': 6.0}
{'loss': 0.3542, 'grad_norm': 3.906162738800049, 'learning_rate': 0.00035158311345646437, 'epoch': 6.0}
{'loss': 0.2677, 'grad_norm': 1.4795669317245483, 'learning_rate': 0.0003515359969845458, 'epoch': 6.0}
{'loss': 0.3906, 'grad_norm': 2.3784573078155518, 'learning_rate': 0.0003514888805126272, 'epoch': 6.01}
{'loss': 0.3208, 'grad_norm': 5.895310878753662, 'learning_rate': 0.00035144176404070866, 'epoch': 6.01}
{'loss': 0.4592, 'grad_norm': 0.5470983386039734, 'learning_rate': 0.0003513946475687901, 'epoch': 6.01}
{'loss': 0.3824, 'grad_norm': 2.188427209854126, 'learning_rate': 0.00035134753109687146, 'epoch': 6.01}
{'loss': 0.3557, 'grad_norm': 4.151999473571777, 'learning_rate': 0.0003513004146249529, 'epoch': 6.01}
{'loss': 0.2244, 'gra

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.35035109519958496, 'eval_runtime': 431.0058, 'eval_samples_per_second': 49.48, 'eval_steps_per_second': 3.093, 'epoch': 7.0}
{'loss': 0.3827, 'grad_norm': 4.896892547607422, 'learning_rate': 0.00032651715039577837, 'epoch': 7.0}
{'loss': 0.3839, 'grad_norm': 2.489243268966675, 'learning_rate': 0.00032647003392385974, 'epoch': 7.0}
{'loss': 0.3303, 'grad_norm': 3.6852259635925293, 'learning_rate': 0.00032642291745194117, 'epoch': 7.0}
{'loss': 0.3527, 'grad_norm': 1.7094316482543945, 'learning_rate': 0.0003263758009800226, 'epoch': 7.01}
{'loss': 0.2622, 'grad_norm': 2.524946928024292, 'learning_rate': 0.00032632868450810403, 'epoch': 7.01}
{'loss': 0.3237, 'grad_norm': 3.291386604309082, 'learning_rate': 0.00032628156803618546, 'epoch': 7.01}
{'loss': 0.3947, 'grad_norm': 2.0244715213775635, 'learning_rate': 0.0003262344515642669, 'epoch': 7.01}
{'loss': 0.3999, 'grad_norm': 5.133685111999512, 'learning_rate': 0.00032618733509234826, 'epoch': 7.01}
{'loss': 0.3869, 'gra

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3651374876499176, 'eval_runtime': 430.1004, 'eval_samples_per_second': 49.584, 'eval_steps_per_second': 3.099, 'epoch': 8.0}
{'loss': 0.4007, 'grad_norm': 1.850498914718628, 'learning_rate': 0.0003014040708631738, 'epoch': 8.0}
{'loss': 0.3385, 'grad_norm': 2.0136055946350098, 'learning_rate': 0.00030135695439125523, 'epoch': 8.0}
{'loss': 0.3726, 'grad_norm': 2.4639291763305664, 'learning_rate': 0.0003013098379193366, 'epoch': 8.0}
{'loss': 0.2376, 'grad_norm': 3.690920829772949, 'learning_rate': 0.00030126272144741803, 'epoch': 8.01}
{'loss': 0.3013, 'grad_norm': 1.7921968698501587, 'learning_rate': 0.00030121560497549946, 'epoch': 8.01}
{'loss': 0.2328, 'grad_norm': 3.2134573459625244, 'learning_rate': 0.00030116848850358084, 'epoch': 8.01}
{'loss': 0.3232, 'grad_norm': 3.0330371856689453, 'learning_rate': 0.00030112137203166226, 'epoch': 8.01}
{'loss': 0.2799, 'grad_norm': 3.0984299182891846, 'learning_rate': 0.0003010742555597437, 'epoch': 8.01}
{'loss': 0.2041, 'g

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3456941246986389, 'eval_runtime': 429.8476, 'eval_samples_per_second': 49.613, 'eval_steps_per_second': 3.101, 'epoch': 9.0}
{'loss': 0.259, 'grad_norm': 2.2170519828796387, 'learning_rate': 0.0002762909913305692, 'epoch': 9.0}
{'loss': 0.2869, 'grad_norm': 3.528221368789673, 'learning_rate': 0.0002762438748586506, 'epoch': 9.0}
{'loss': 0.3487, 'grad_norm': 2.782627582550049, 'learning_rate': 0.00027619675838673203, 'epoch': 9.0}
{'loss': 0.2631, 'grad_norm': 2.8653151988983154, 'learning_rate': 0.0002761496419148134, 'epoch': 9.01}
{'loss': 0.2575, 'grad_norm': 3.5601871013641357, 'learning_rate': 0.00027610252544289484, 'epoch': 9.01}
{'loss': 0.3526, 'grad_norm': 2.7636940479278564, 'learning_rate': 0.00027605540897097626, 'epoch': 9.01}
{'loss': 0.2418, 'grad_norm': 3.0733351707458496, 'learning_rate': 0.0002760082924990577, 'epoch': 9.01}
{'loss': 0.3298, 'grad_norm': 1.945475459098816, 'learning_rate': 0.0002759611760271391, 'epoch': 9.01}
{'loss': 0.3688, 'grad_

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.367055207490921, 'eval_runtime': 432.2268, 'eval_samples_per_second': 49.34, 'eval_steps_per_second': 3.084, 'epoch': 10.0}
{'loss': 0.3599, 'grad_norm': 2.3687350749969482, 'learning_rate': 0.000251130795326046, 'epoch': 10.0}
{'loss': 0.3188, 'grad_norm': 2.138975143432617, 'learning_rate': 0.0002510836788541274, 'epoch': 10.0}
{'loss': 0.4817, 'grad_norm': 1.878807544708252, 'learning_rate': 0.00025103656238220884, 'epoch': 10.01}
{'loss': 0.3888, 'grad_norm': 2.4433271884918213, 'learning_rate': 0.0002509894459102902, 'epoch': 10.01}
{'loss': 0.364, 'grad_norm': 1.0471742153167725, 'learning_rate': 0.00025094232943837164, 'epoch': 10.01}
{'loss': 0.5439, 'grad_norm': 3.0134384632110596, 'learning_rate': 0.00025089521296645307, 'epoch': 10.01}
{'loss': 0.2783, 'grad_norm': 5.403639316558838, 'learning_rate': 0.0002508480964945345, 'epoch': 10.01}
{'loss': 0.3135, 'grad_norm': 1.8273563385009766, 'learning_rate': 0.00025080098002261593, 'epoch': 10.02}
{'loss': 0.3287

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.36043766140937805, 'eval_runtime': 430.6469, 'eval_samples_per_second': 49.521, 'eval_steps_per_second': 3.095, 'epoch': 11.0}
{'loss': 0.3671, 'grad_norm': 3.9911680221557617, 'learning_rate': 0.00022601771579344138, 'epoch': 11.0}
{'loss': 0.3357, 'grad_norm': 3.2957730293273926, 'learning_rate': 0.0002259705993215228, 'epoch': 11.0}
{'loss': 0.3312, 'grad_norm': 2.688488721847534, 'learning_rate': 0.00022592348284960424, 'epoch': 11.01}
{'loss': 0.3547, 'grad_norm': 2.4101626873016357, 'learning_rate': 0.00022587636637768564, 'epoch': 11.01}
{'loss': 0.2382, 'grad_norm': 2.282548427581787, 'learning_rate': 0.00022582924990576707, 'epoch': 11.01}
{'loss': 0.3989, 'grad_norm': 1.5383338928222656, 'learning_rate': 0.0002257821334338485, 'epoch': 11.01}
{'loss': 0.3581, 'grad_norm': 4.720397472381592, 'learning_rate': 0.00022573501696192987, 'epoch': 11.01}
{'loss': 0.3207, 'grad_norm': 3.090708017349243, 'learning_rate': 0.0002256879004900113, 'epoch': 11.01}
{'loss': 0

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3389243483543396, 'eval_runtime': 429.2554, 'eval_samples_per_second': 49.681, 'eval_steps_per_second': 3.105, 'epoch': 12.0}
{'loss': 0.2871, 'grad_norm': 1.8307822942733765, 'learning_rate': 0.0002009046362608368, 'epoch': 12.0}
{'loss': 0.2306, 'grad_norm': 0.6077690124511719, 'learning_rate': 0.00020085751978891818, 'epoch': 12.0}
{'loss': 0.4031, 'grad_norm': 7.171529293060303, 'learning_rate': 0.0002008104033169996, 'epoch': 12.01}
{'loss': 0.4438, 'grad_norm': 3.6003124713897705, 'learning_rate': 0.00020076328684508104, 'epoch': 12.01}
{'loss': 0.3724, 'grad_norm': 1.7340552806854248, 'learning_rate': 0.00020071617037316244, 'epoch': 12.01}
{'loss': 0.3777, 'grad_norm': 2.6281228065490723, 'learning_rate': 0.00020066905390124387, 'epoch': 12.01}
{'loss': 0.4615, 'grad_norm': 2.2564220428466797, 'learning_rate': 0.0002006219374293253, 'epoch': 12.01}
{'loss': 0.2557, 'grad_norm': 3.0380427837371826, 'learning_rate': 0.0002005748209574067, 'epoch': 12.01}
{'loss': 

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3673405349254608, 'eval_runtime': 421.505, 'eval_samples_per_second': 50.595, 'eval_steps_per_second': 3.162, 'epoch': 13.0}
{'loss': 0.4586, 'grad_norm': 3.1037192344665527, 'learning_rate': 0.00017579155672823218, 'epoch': 13.0}
{'loss': 0.3168, 'grad_norm': 3.8351259231567383, 'learning_rate': 0.0001757444402563136, 'epoch': 13.0}
{'loss': 0.267, 'grad_norm': 1.642316460609436, 'learning_rate': 0.00017569732378439504, 'epoch': 13.01}
{'loss': 0.3773, 'grad_norm': 3.4113035202026367, 'learning_rate': 0.00017565020731247644, 'epoch': 13.01}
{'loss': 0.2741, 'grad_norm': 1.9864872694015503, 'learning_rate': 0.00017560309084055787, 'epoch': 13.01}
{'loss': 0.399, 'grad_norm': 2.830876588821411, 'learning_rate': 0.0001755559743686393, 'epoch': 13.01}
{'loss': 0.4279, 'grad_norm': 4.344967842102051, 'learning_rate': 0.0001755088578967207, 'epoch': 13.01}
{'loss': 0.3412, 'grad_norm': 1.8435864448547363, 'learning_rate': 0.0001754617414248021, 'epoch': 13.01}
{'loss': 0.401

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3442499339580536, 'eval_runtime': 431.5255, 'eval_samples_per_second': 49.42, 'eval_steps_per_second': 3.089, 'epoch': 14.0}
{'loss': 0.3721, 'grad_norm': 2.4685873985290527, 'learning_rate': 0.00015067847719562761, 'epoch': 14.0}
{'loss': 0.4919, 'grad_norm': 2.094200849533081, 'learning_rate': 0.00015063136072370902, 'epoch': 14.0}
{'loss': 0.1954, 'grad_norm': 1.8080883026123047, 'learning_rate': 0.00015058424425179042, 'epoch': 14.0}
{'loss': 0.3753, 'grad_norm': 3.4715635776519775, 'learning_rate': 0.00015053712777987185, 'epoch': 14.01}
{'loss': 0.3113, 'grad_norm': 4.131108283996582, 'learning_rate': 0.00015049001130795325, 'epoch': 14.01}
{'loss': 0.2899, 'grad_norm': 5.926482677459717, 'learning_rate': 0.00015044289483603468, 'epoch': 14.01}
{'loss': 0.191, 'grad_norm': 1.3972779512405396, 'learning_rate': 0.0001503957783641161, 'epoch': 14.01}
{'loss': 0.3525, 'grad_norm': 1.0123510360717773, 'learning_rate': 0.0001503486618921975, 'epoch': 14.01}
{'loss': 0.2

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3322095274925232, 'eval_runtime': 433.2958, 'eval_samples_per_second': 49.218, 'eval_steps_per_second': 3.076, 'epoch': 15.0}
{'loss': 0.3308, 'grad_norm': 2.470675468444824, 'learning_rate': 0.000125565397663023, 'epoch': 15.0}
{'loss': 0.204, 'grad_norm': 2.2089667320251465, 'learning_rate': 0.00012551828119110442, 'epoch': 15.0}
{'loss': 0.3591, 'grad_norm': 3.38254714012146, 'learning_rate': 0.00012547116471918582, 'epoch': 15.0}
{'loss': 0.2602, 'grad_norm': 2.0738635063171387, 'learning_rate': 0.00012542404824726725, 'epoch': 15.01}
{'loss': 0.2682, 'grad_norm': 3.756958484649658, 'learning_rate': 0.00012537693177534868, 'epoch': 15.01}
{'loss': 0.3015, 'grad_norm': 1.6467183828353882, 'learning_rate': 0.00012532981530343008, 'epoch': 15.01}
{'loss': 0.382, 'grad_norm': 3.5737099647521973, 'learning_rate': 0.0001252826988315115, 'epoch': 15.01}
{'loss': 0.5376, 'grad_norm': 3.6804490089416504, 'learning_rate': 0.00012523558235959294, 'epoch': 15.01}
{'loss': 0.186

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.33571526408195496, 'eval_runtime': 430.5812, 'eval_samples_per_second': 49.528, 'eval_steps_per_second': 3.096, 'epoch': 16.0}
{'loss': 0.3296, 'grad_norm': 2.222827196121216, 'learning_rate': 0.0001004523181304184, 'epoch': 16.0}
{'loss': 0.2627, 'grad_norm': 1.546484351158142, 'learning_rate': 0.0001004052016584998, 'epoch': 16.0}
{'loss': 0.4102, 'grad_norm': 3.232943058013916, 'learning_rate': 0.00010035808518658122, 'epoch': 16.0}
{'loss': 0.2719, 'grad_norm': 2.3750052452087402, 'learning_rate': 0.00010031096871466265, 'epoch': 16.01}
{'loss': 0.2231, 'grad_norm': 2.7623777389526367, 'learning_rate': 0.00010026385224274407, 'epoch': 16.01}
{'loss': 0.3029, 'grad_norm': 4.230029106140137, 'learning_rate': 0.00010021673577082548, 'epoch': 16.01}
{'loss': 0.3704, 'grad_norm': 1.5643435716629028, 'learning_rate': 0.0001001696192989069, 'epoch': 16.01}
{'loss': 0.3196, 'grad_norm': 1.5050326585769653, 'learning_rate': 0.00010012250282698831, 'epoch': 16.01}
{'loss': 0.

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.34006017446517944, 'eval_runtime': 429.1655, 'eval_samples_per_second': 49.692, 'eval_steps_per_second': 3.106, 'epoch': 17.0}
{'loss': 0.248, 'grad_norm': 3.0472586154937744, 'learning_rate': 7.533923859781381e-05, 'epoch': 17.0}
{'loss': 0.3909, 'grad_norm': 5.653448581695557, 'learning_rate': 7.529212212589521e-05, 'epoch': 17.0}
{'loss': 0.3192, 'grad_norm': 2.045130968093872, 'learning_rate': 7.524500565397662e-05, 'epoch': 17.0}
{'loss': 0.2883, 'grad_norm': 2.426931858062744, 'learning_rate': 7.519788918205805e-05, 'epoch': 17.01}
{'loss': 0.4652, 'grad_norm': 3.7225427627563477, 'learning_rate': 7.515077271013947e-05, 'epoch': 17.01}
{'loss': 0.338, 'grad_norm': 2.353562831878662, 'learning_rate': 7.510365623822088e-05, 'epoch': 17.01}
{'loss': 0.2185, 'grad_norm': 1.2346296310424805, 'learning_rate': 7.50565397663023e-05, 'epoch': 17.01}
{'loss': 0.2436, 'grad_norm': 1.1591914892196655, 'learning_rate': 7.500942329438372e-05, 'epoch': 17.01}
{'loss': 0.2802, 'g

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.3342161774635315, 'eval_runtime': 430.7705, 'eval_samples_per_second': 49.507, 'eval_steps_per_second': 3.094, 'epoch': 18.0}
{'loss': 0.3482, 'grad_norm': 1.6119203567504883, 'learning_rate': 5.02261590652092e-05, 'epoch': 18.0}
{'loss': 0.3937, 'grad_norm': 2.2098424434661865, 'learning_rate': 5.017904259329061e-05, 'epoch': 18.0}
{'loss': 0.3242, 'grad_norm': 1.0244604349136353, 'learning_rate': 5.0131926121372033e-05, 'epoch': 18.0}
{'loss': 0.3435, 'grad_norm': 4.24176025390625, 'learning_rate': 5.008480964945345e-05, 'epoch': 18.01}
{'loss': 0.2908, 'grad_norm': 3.2679553031921387, 'learning_rate': 5.0037693177534865e-05, 'epoch': 18.01}
{'loss': 0.2362, 'grad_norm': 1.186830997467041, 'learning_rate': 4.999057670561629e-05, 'epoch': 18.01}
{'loss': 0.2876, 'grad_norm': 2.1998798847198486, 'learning_rate': 4.99434602336977e-05, 'epoch': 18.01}
{'loss': 0.3264, 'grad_norm': 1.1382472515106201, 'learning_rate': 4.989634376177912e-05, 'epoch': 18.01}
{'loss': 0.3464,

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.32862788438796997, 'eval_runtime': 429.2397, 'eval_samples_per_second': 49.683, 'eval_steps_per_second': 3.105, 'epoch': 19.0}
{'loss': 0.4218, 'grad_norm': 3.9452998638153076, 'learning_rate': 2.51130795326046e-05, 'epoch': 19.0}
{'loss': 0.4749, 'grad_norm': 3.245302438735962, 'learning_rate': 2.5065963060686017e-05, 'epoch': 19.0}
{'loss': 0.2728, 'grad_norm': 3.0066728591918945, 'learning_rate': 2.5018846588767432e-05, 'epoch': 19.0}
{'loss': 0.2274, 'grad_norm': 0.843357264995575, 'learning_rate': 2.497173011684885e-05, 'epoch': 19.01}
{'loss': 0.3267, 'grad_norm': 1.678183674812317, 'learning_rate': 2.4924613644930267e-05, 'epoch': 19.01}
{'loss': 0.2885, 'grad_norm': 2.070295810699463, 'learning_rate': 2.4877497173011682e-05, 'epoch': 19.01}
{'loss': 0.2107, 'grad_norm': 1.2405740022659302, 'learning_rate': 2.4830380701093105e-05, 'epoch': 19.01}
{'loss': 0.358, 'grad_norm': 1.9432109594345093, 'learning_rate': 2.478326422917452e-05, 'epoch': 19.01}
{'loss': 0.24

  0%|          | 0/1333 [00:00<?, ?it/s]

{'eval_loss': 0.32767337560653687, 'eval_runtime': 429.0304, 'eval_samples_per_second': 49.707, 'eval_steps_per_second': 3.107, 'epoch': 20.0}
{'train_runtime': 52901.455, 'train_samples_per_second': 32.246, 'train_steps_per_second': 2.015, 'train_loss': 0.33238066860623666, 'epoch': 20.0}


  0%|          | 0/1333 [00:00<?, ?it/s]

Saved metrics for role_classification to results/cyberbert
CyberBERT: Acc=0.899, Balanced Acc=0.220, F1=0.881
AUROC Weighted=0.921
                         precision    recall  f1-score   support

                  bully       0.63      0.54      0.58      2036
        bully_assistant       0.00      0.00      0.00        39
      aggressive_victim       0.00      0.00      0.00        74
    aggressive_defender       0.08      0.00      0.01       250
      passive_bystander       0.93      0.98      0.95     18512
  non_aggressive_victim       0.00      0.00      0.00       256
non_aggressive_defender       0.06      0.01      0.02       159

               accuracy                           0.90     21326
              macro avg       0.24      0.22      0.22     21326
           weighted avg       0.86      0.90      0.88     21326



In [None]:
class MultiLabelTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if hasattr(self.texts, 'iloc') else str(self.texts[idx])
        text = text.strip() if text and text != 'nan' else ""
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class MultiLabelTransformer(nn.Module):
    def __init__(self, model_name, num_labels, model_class):
        super().__init__()
        self.transformer = model_class.from_pretrained(
            model_name, 
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
            problem_type="multi_label_classification"
        )
        
        for name, param in self.transformer.named_parameters():
            if 'classifier' not in name and 'pooler' not in name:
                param.requires_grad = False
        
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = self.sigmoid(logits)
        
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())
            return {'loss': loss, 'logits': logits}
        
        return {'logits': logits}

def train_multilabel_transformer(model_name, tokenizer_class, model_class, train_texts, train_labels, test_texts, test_labels, num_labels):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = MultiLabelTransformer(model_name, num_labels, model_class).to(device)
    
    train_dataset = MultiLabelTextDataset(train_texts, train_labels, tokenizer)
    test_dataset = MultiLabelTextDataset(test_texts, test_labels, tokenizer)
    
    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=500,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=False,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=None,
        save_total_limit=1,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    
    print("Training multi-label model...")
    trainer.train()
    
    predictions = trainer.predict(test_dataset)
    pred_probs = torch.sigmoid(torch.tensor(predictions.predictions))
    pred_labels = (pred_probs > 0.5).numpy().astype(int)
    
    del model, trainer
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
    return pred_labels, pred_probs.numpy()

def save_multilabel_metrics(y_true, y_pred, y_proba, topic_names, task_name, model_name):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    metrics = {}
    
    topic_metrics = []
    balanced_accuracies = []
    
    for i, topic_name in enumerate(topic_names):
        p, r, f1, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred[:, i], average='binary', zero_division=0
        )
        support = y_true[:, i].sum()
        
        balanced_acc = balanced_accuracy_score(y_true[:, i], y_pred[:, i])
        balanced_accuracies.append(balanced_acc)
        
        try:
            auroc = roc_auc_score(y_true[:, i], y_proba[:, i])
        except:
            auroc = None
        
        cm = confusion_matrix(y_true[:, i], y_pred[:, i])
        
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
        plt.title(f'{model_name.upper()} - {topic_name} Topic - Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'{results_dir}/{task_name}_{topic_name}_confusion_matrix_{timestamp}_fold_{fold_num}.png', 
                   dpi=300, bbox_inches='tight')
        plt.close()
        
        np.save(f'{results_dir}/{task_name}_{topic_name}_confusion_matrix_{timestamp}_fold_{fold_num}.npy', cm)
            
        topic_metrics.append({
            'topic': topic_name,
            'precision': p,
            'recall': r,
            'f1': f1,
            'balanced_accuracy': balanced_acc,
            'auroc': auroc,
            'support': support
        })
    
    subset_accuracy = np.mean(np.all(y_true == y_pred, axis=1))
    
    precisions = [m['precision'] for m in topic_metrics]
    recalls = [m['recall'] for m in topic_metrics]
    f1s = [m['f1'] for m in topic_metrics]
    aurocs = [m['auroc'] for m in topic_metrics if m['auroc'] is not None]
    
    metrics['subset_accuracy'] = subset_accuracy
    metrics['macro_precision'] = np.mean(precisions)
    metrics['macro_recall'] = np.mean(recalls)
    metrics['macro_f1'] = np.mean(f1s)
    metrics['macro_balanced_accuracy'] = np.mean(balanced_accuracies)
    if aurocs:
        metrics['macro_auroc'] = np.mean(aurocs)
    
    metrics['per_topic_metrics'] = topic_metrics
    
    with open(f'{results_dir}/{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2, default=str)
    
    print(f"Saved multi-label metrics for {task_name} to {results_dir}")
    
    return metrics

In [23]:
print("\nTOPIC CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):")
print("=" * 60)

cb_mask = target == 1
topic_names = ['disability', 'gender', 'intellectual', 'other', 'physical', 
               'political', 'race', 'religious', 'sexual', 'social_status']

train_texts_topic = text_data[train_idx & cb_mask]
test_texts_topic = text_data[test_idx & cb_mask]
train_labels_topic = topic_labels[train_idx & cb_mask].values
test_labels_topic = topic_labels[test_idx & cb_mask].values

if len(train_labels_topic) > 0 and len(test_labels_topic) > 0:
    pred_t, pred_probs_t = train_multilabel_transformer(
        'unitary/toxic-bert',
        BertTokenizer,
        BertForSequenceClassification,
        train_texts_topic,
        train_labels_topic,
        test_texts_topic,
        test_labels_topic,
        num_labels=10
    )
    
    metrics_t = save_multilabel_metrics(
        test_labels_topic, pred_t, pred_probs_t,
        topic_names,
        'topic_classification',
        'cyberbert'
    )
    
    print(f"CyberBERT: Subset Acc={metrics_t['subset_accuracy']:.3f}, Macro F1={metrics_t['macro_f1']:.3f}, Macro Balanced Acc={metrics_t['macro_balanced_accuracy']:.3f}")
    if 'macro_auroc' in metrics_t:
        print(f"Macro AUROC={metrics_t['macro_auroc']:.3f}")
    
    print("\nPer-topic results:")
    for metric in metrics_t['per_topic_metrics']:
        print(f"{metric['topic']}: P={metric['precision']:.3f}, R={metric['recall']:.3f}, F1={metric['f1']:.3f}, Balanced Acc={metric['balanced_accuracy']:.3f}, Support={metric['support']}")
else:
    print("CyberBERT: No data available")


TOPIC CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unitary/toxic-bert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training multi-label model...


  0%|          | 0/11100 [00:00<?, ?it/s]

{'loss': 0.2727, 'grad_norm': 0.5264139175415039, 'learning_rate': 0.0005, 'epoch': 0.9}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.2134772092103958, 'eval_runtime': 48.2959, 'eval_samples_per_second': 49.673, 'eval_steps_per_second': 3.106, 'epoch': 1.0}
{'loss': 0.2037, 'grad_norm': 0.2907367944717407, 'learning_rate': 0.00047641509433962265, 'epoch': 1.8}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19436638057231903, 'eval_runtime': 48.346, 'eval_samples_per_second': 49.621, 'eval_steps_per_second': 3.103, 'epoch': 2.0}
{'loss': 0.1948, 'grad_norm': 0.40883904695510864, 'learning_rate': 0.0004528301886792453, 'epoch': 2.7}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18988564610481262, 'eval_runtime': 48.1234, 'eval_samples_per_second': 49.851, 'eval_steps_per_second': 3.117, 'epoch': 3.0}
{'loss': 0.1888, 'grad_norm': 0.5399391055107117, 'learning_rate': 0.00042924528301886797, 'epoch': 3.6}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18834495544433594, 'eval_runtime': 48.0935, 'eval_samples_per_second': 49.882, 'eval_steps_per_second': 3.119, 'epoch': 4.0}
{'loss': 0.1862, 'grad_norm': 0.6543060541152954, 'learning_rate': 0.0004056603773584906, 'epoch': 4.5}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1833305060863495, 'eval_runtime': 48.0757, 'eval_samples_per_second': 49.9, 'eval_steps_per_second': 3.12, 'epoch': 5.0}
{'loss': 0.183, 'grad_norm': 0.4164969027042389, 'learning_rate': 0.00038207547169811324, 'epoch': 5.41}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1823171228170395, 'eval_runtime': 48.2011, 'eval_samples_per_second': 49.771, 'eval_steps_per_second': 3.112, 'epoch': 6.0}
{'loss': 0.1802, 'grad_norm': 0.7763789892196655, 'learning_rate': 0.0003584905660377358, 'epoch': 6.31}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1852874904870987, 'eval_runtime': 47.9736, 'eval_samples_per_second': 50.007, 'eval_steps_per_second': 3.127, 'epoch': 7.0}
{'loss': 0.1791, 'grad_norm': 0.3979093134403229, 'learning_rate': 0.00033490566037735846, 'epoch': 7.21}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1769798994064331, 'eval_runtime': 47.678, 'eval_samples_per_second': 50.317, 'eval_steps_per_second': 3.146, 'epoch': 8.0}
{'loss': 0.1777, 'grad_norm': 0.6869238615036011, 'learning_rate': 0.00031132075471698115, 'epoch': 8.11}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17619210481643677, 'eval_runtime': 47.9822, 'eval_samples_per_second': 49.998, 'eval_steps_per_second': 3.126, 'epoch': 9.0}
{'loss': 0.1746, 'grad_norm': 0.43296870589256287, 'learning_rate': 0.0002877358490566038, 'epoch': 9.01}
{'loss': 0.1734, 'grad_norm': 0.4715780019760132, 'learning_rate': 0.0002641509433962264, 'epoch': 9.91}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18124078214168549, 'eval_runtime': 48.1806, 'eval_samples_per_second': 49.792, 'eval_steps_per_second': 3.113, 'epoch': 10.0}
{'loss': 0.1722, 'grad_norm': 0.47602930665016174, 'learning_rate': 0.00024056603773584906, 'epoch': 10.81}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17378279566764832, 'eval_runtime': 48.0847, 'eval_samples_per_second': 49.891, 'eval_steps_per_second': 3.119, 'epoch': 11.0}
{'loss': 0.1696, 'grad_norm': 0.5010862946510315, 'learning_rate': 0.00021698113207547172, 'epoch': 11.71}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17848119139671326, 'eval_runtime': 47.8209, 'eval_samples_per_second': 50.166, 'eval_steps_per_second': 3.137, 'epoch': 12.0}
{'loss': 0.1725, 'grad_norm': 0.4429420232772827, 'learning_rate': 0.00019339622641509436, 'epoch': 12.61}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17536672949790955, 'eval_runtime': 48.3399, 'eval_samples_per_second': 49.628, 'eval_steps_per_second': 3.103, 'epoch': 13.0}
{'loss': 0.1662, 'grad_norm': 0.4667406380176544, 'learning_rate': 0.00016981132075471697, 'epoch': 13.51}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17265230417251587, 'eval_runtime': 48.3758, 'eval_samples_per_second': 49.591, 'eval_steps_per_second': 3.101, 'epoch': 14.0}
{'loss': 0.1677, 'grad_norm': 0.44861021637916565, 'learning_rate': 0.00014622641509433963, 'epoch': 14.41}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17249636352062225, 'eval_runtime': 48.134, 'eval_samples_per_second': 49.84, 'eval_steps_per_second': 3.116, 'epoch': 15.0}
{'loss': 0.1659, 'grad_norm': 0.3577490746974945, 'learning_rate': 0.00012264150943396227, 'epoch': 15.32}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.17365993559360504, 'eval_runtime': 48.4836, 'eval_samples_per_second': 49.481, 'eval_steps_per_second': 3.094, 'epoch': 16.0}
{'loss': 0.1648, 'grad_norm': 0.42295268177986145, 'learning_rate': 9.905660377358492e-05, 'epoch': 16.22}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1724846065044403, 'eval_runtime': 48.616, 'eval_samples_per_second': 49.346, 'eval_steps_per_second': 3.085, 'epoch': 17.0}
{'loss': 0.1647, 'grad_norm': 0.4354194104671478, 'learning_rate': 7.547169811320755e-05, 'epoch': 17.12}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1699075996875763, 'eval_runtime': 48.0436, 'eval_samples_per_second': 49.934, 'eval_steps_per_second': 3.122, 'epoch': 18.0}
{'loss': 0.1636, 'grad_norm': 0.43467479944229126, 'learning_rate': 5.1886792452830194e-05, 'epoch': 18.02}
{'loss': 0.1629, 'grad_norm': 0.41756224632263184, 'learning_rate': 2.830188679245283e-05, 'epoch': 18.92}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.16995683312416077, 'eval_runtime': 48.3741, 'eval_samples_per_second': 49.593, 'eval_steps_per_second': 3.101, 'epoch': 19.0}
{'loss': 0.1613, 'grad_norm': 0.41733357310295105, 'learning_rate': 4.716981132075472e-06, 'epoch': 19.82}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1708691567182541, 'eval_runtime': 47.2619, 'eval_samples_per_second': 50.76, 'eval_steps_per_second': 3.174, 'epoch': 20.0}


Could not locate the best model at ./results_temp/checkpoint-9990/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 5543.3252, 'train_samples_per_second': 31.992, 'train_steps_per_second': 2.002, 'train_loss': 0.17916240554672103, 'epoch': 20.0}


FileNotFoundError: [Errno 2] No such file or directory: './results_temp/checkpoint-9990'

In [None]:
print("\nTOPIC CLASSIFICATION (CYBERBERT):")
print("=" * 60)

cb_mask = target == 1
topic_names = ['disability', 'gender', 'intellectual', 'other', 'physical', 
               'political', 'race', 'religious', 'sexual', 'social_status']

train_texts_topic = text_data[train_idx & cb_mask]
test_texts_topic = text_data[test_idx & cb_mask]
train_labels_topic = topic_labels[train_idx & cb_mask].values
test_labels_topic = topic_labels[test_idx & cb_mask].values

if len(train_labels_topic) > 0 and len(test_labels_topic) > 0:
    pred_t, pred_probs_t = train_multilabel_transformer(
        'unitary/toxic-bert',
        BertTokenizer,
        BertForSequenceClassification,
        train_texts_topic,
        train_labels_topic,
        test_texts_topic,
        test_labels_topic,
        num_labels=10
    )

    metrics_t = save_multilabel_metrics(
        test_labels_topic, pred_t, pred_probs_t,
        topic_names,
        'topic_classification',
        'cyberbert'
    )

    print(f"CyberBERT: Subset Acc={metrics_t['subset_accuracy']:.3f}, Macro F1={metrics_t['macro_f1']:.3f}, Macro Balanced Acc={metrics_t['macro_balanced_accuracy']:.3f}")
    if 'macro_auroc' in metrics_t:
        print(f"Macro AUROC={metrics_t['macro_auroc']:.3f}")

    print("\nPer-topic results:")
    for metric in metrics_t['per_topic_metrics']:
        print(f"{metric['topic']}: P={metric['precision']:.3f}, R={metric['recall']:.3f}, F1={metric['f1']:.3f}, Balanced Acc={metric['balanced_accuracy']:.3f}, Support={metric['support']}")
else:
    print("CyberBERT: No data available")
