In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    balanced_accuracy_score, roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results_dir = 'results/roberta'
os.makedirs(results_dir, exist_ok=True)
device

In [None]:
fold_num = 2
train_data = pd.read_csv(f'data_splits/splits/fold_{fold_num}_train.csv')
test_data = pd.read_csv(f'data_splits/splits/fold_{fold_num}_test.csv')

train_data = train_data.replace({'f': 0, 't': 1})
test_data = test_data.replace({'f': 0, 't': 1})

data = pd.concat([train_data, test_data], ignore_index=True)
train_idx = pd.Series([True] * len(train_data) + [False] * len(test_data))
test_idx = pd.Series([False] * len(train_data) + [True] * len(test_data))

print(f"Using fold {fold_num} - Train: {len(train_data)}, Test: {len(test_data)}, Total: {len(data)}")
data.shape

In [None]:
def get_target(data):
    return data['c_cyberbullying_majority'].astype(int)

def create_severity_labels(data):
    severity_cols = ['c_severity_mild_count', 'c_severity_moderate_count', 'c_severity_severe_count']
    severity_counts = data[severity_cols].fillna(0)
    cb_mask = data['c_cyberbullying_majority'] == 1
    severity_labels = np.zeros(len(data))
    
    for idx in data.index:
        if not cb_mask.iloc[idx]:
            severity_labels[idx] = 0
        else:
            row = severity_counts.iloc[idx]
            if row.sum() == 0:
                severity_labels[idx] = 0
            else:
                max_idx = row.argmax()
                severity_labels[idx] = max_idx + 1
    return severity_labels.astype(int)

def create_topic_labels(data):
    topic_cols = ['c_topic_disability_majority', 'c_topic_gender_majority', 'c_topic_intellectual_majority',
                  'c_topic_other_majority', 'c_topic_physical_majority', 'c_topic_political_majority',
                  'c_topic_race_majority', 'c_topic_religious_majority', 'c_topic_sexual_majority',
                  'c_topic_social_status_majority']
    topic_data = data[topic_cols].fillna(0).astype(int)
    cb_mask = data['c_cyberbullying_majority'] == 1
    topic_labels = topic_data.copy()
    topic_labels[~cb_mask] = 0
    return topic_labels, topic_cols

def create_role_labels(data):
    cb_roles = ['c_role_bully_count', 'c_role_cb__bully_assistant_count', 
                'c_role_cb_aggressive_victim_role_count', 'c_role_cb_aggressive_defender_count']
    noncb_roles = ['c_role_noncb_passive_bystander_count', 'c_role_noncb_non_aggressive_victim_count',
                   'c_role_noncb_non_aggressive_defender_count']
    role_labels = []
    cb_mask = data['c_cyberbullying_majority'] == 1
    
    for idx in data.index:
        if cb_mask.iloc[idx]:
            role_counts = data[cb_roles].iloc[idx]
            if role_counts.sum() > 0:
                max_role = role_counts.argmax()
                role_labels.append(max_role)
            else:
                role_labels.append(0)
        else:
            role_counts = data[noncb_roles].iloc[idx]
            if role_counts.sum() > 0:
                max_role = role_counts.argmax()
                role_labels.append(max_role + 4)
            else:
                role_labels.append(4)
    return np.array(role_labels)

def save_metrics(y_true, y_pred, y_proba, class_names, task_name, model_name='roberta'):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    metrics = {}
    
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    
    metrics['precision_macro'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['recall_macro'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    metrics['precision_weighted'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['recall_weighted'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    metrics['precision_per_class'] = precision_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['recall_per_class'] = recall_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['f1_per_class'] = f1_score(y_true, y_pred, average=None, zero_division=0).tolist()
    
    if len(np.unique(y_true)) == 2:
        if y_proba is not None and y_proba.shape[1] >= 2:
            metrics['auroc'] = roc_auc_score(y_true, y_proba[:, 1])
        else:
            metrics['auroc'] = None
    else:
        if y_proba is not None:
            try:
                metrics['auroc_macro'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
                metrics['auroc_weighted'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
            except:
                metrics['auroc_macro'] = None
                metrics['auroc_weighted'] = None
        else:
            metrics['auroc_macro'] = None
            metrics['auroc_weighted'] = None
    
    cm = confusion_matrix(y_true, y_pred)
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f'{model_name.upper()} - {task_name} - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    with open(f'{results_dir}/{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    
    np.save(f'{results_dir}/{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.npy', cm)
    
    print(f"Saved metrics for {task_name} to {results_dir}")
    
    return metrics

In [None]:
target = get_target(data)
severity_labels = create_severity_labels(data)
topic_labels, topic_cols = create_topic_labels(data)
role_labels = create_role_labels(data)

text_data = data['c_comment_content'].fillna('').astype(str)

f"Data: {data.shape}, Train: {train_idx.sum()}, Test: {test_idx.sum()}"

In [None]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if hasattr(self.texts, 'iloc') else str(self.texts[idx])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
def freeze_base_model(model):
    for name, param in model.named_parameters():
        if 'classifier' not in name and 'pooler' not in name:
            param.requires_grad = False
    return model

def train_transformer_model(model_name, tokenizer_class, model_class, train_texts, train_labels, test_texts, test_labels, num_labels, task_name):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(
        model_name, 
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        problem_type="single_label_classification",
        classifier_dropout=0.1
    ).to(device)
    
    model = freeze_base_model(model)
    
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextDataset(test_texts, test_labels, tokenizer)
    
    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=None
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    
    trainer.train()
    
    predictions = trainer.predict(test_dataset)
    pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
    pred_labels = np.argmax(predictions.predictions, axis=1)
    
    del model, trainer
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
    return pred_labels, pred_probs

In [None]:
print("BINARY CYBERBULLYING CLASSIFICATION:")
print("=" * 50)

train_texts = text_data[train_idx]
test_texts = text_data[test_idx]
train_labels = target[train_idx].values
test_labels = target[test_idx].values

pred, pred_probs = train_transformer_model(
    'roberta-base',
    RobertaTokenizer,
    RobertaForSequenceClassification,
    train_texts,
    train_labels,
    test_texts,
    test_labels,
    num_labels=2,
    task_name='binary_classification'
)

metrics = save_metrics(
    test_labels, pred, pred_probs, 
    ['Non-CB', 'CB'], 
    'binary_classification'
)

print(f"RoBERTa: Acc={metrics['accuracy']:.3f}, Balanced Acc={metrics['balanced_accuracy']:.3f}, F1={metrics['f1_weighted']:.3f}, AUROC={metrics['auroc']:.3f}")
print(classification_report(test_labels, pred, target_names=['Non-CB', 'CB']))

In [None]:
print("\nSEVERITY CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):")
print("=" * 60)

cb_mask = target == 1
severity_names = ['mild', 'moderate', 'severe']

train_texts_sev = text_data[train_idx & cb_mask]
test_texts_sev = text_data[test_idx & cb_mask]
train_labels_sev = severity_labels[train_idx & cb_mask] - 1  # Convert 1,2,3 to 0,1,2
test_labels_sev = severity_labels[test_idx & cb_mask] - 1    # Convert 1,2,3 to 0,1,2

# Filter out any remaining 'none' labels (originally 0, now -1)
valid_train_mask = train_labels_sev >= 0
valid_test_mask = test_labels_sev >= 0

train_texts_sev = train_texts_sev[valid_train_mask]
test_texts_sev = test_texts_sev[valid_test_mask]
train_labels_sev = train_labels_sev[valid_train_mask]
test_labels_sev = test_labels_sev[valid_test_mask]

if len(train_labels_sev) > 0 and len(test_labels_sev) > 0:
    pred_s, pred_probs_s = train_transformer_model(
        'roberta-base',
        RobertaTokenizer,
        RobertaForSequenceClassification,
        train_texts_sev,
        train_labels_sev,
        test_texts_sev,
        test_labels_sev,
        num_labels=3,
        task_name='severity_classification'
    )
    
    metrics_s = save_metrics(
        test_labels_sev, pred_s, pred_probs_s,
        severity_names,
        'severity_classification'
    )
    
    print(f"RoBERTa: Acc={metrics_s['accuracy']:.3f}, Balanced Acc={metrics_s['balanced_accuracy']:.3f}, F1={metrics_s['f1_weighted']:.3f}")
    if metrics_s['auroc_weighted']:
        print(f"AUROC Weighted={metrics_s['auroc_weighted']:.3f}")
    print(classification_report(test_labels_sev, pred_s, target_names=severity_names, zero_division=0))
else:
    print("RoBERTa: No data available")

In [None]:
print("\nROLE CLASSIFICATION (ALL COMMENTS):")
print("=" * 40)

role_names = ['bully', 'bully_assistant', 'aggressive_victim', 'aggressive_defender',
              'passive_bystander', 'non_aggressive_victim', 'non_aggressive_defender']

train_texts_role = text_data[train_idx]
test_texts_role = text_data[test_idx]
train_labels_role = role_labels[train_idx]
test_labels_role = role_labels[test_idx]

pred_r, pred_probs_r = train_transformer_model(
    'roberta-base',
    RobertaTokenizer,
    RobertaForSequenceClassification,
    train_texts_role,
    train_labels_role,
    test_texts_role,
    test_labels_role,
    num_labels=7,
    task_name='role_classification'
)

metrics_r = save_metrics(
    test_labels_role, pred_r, pred_probs_r,
    role_names,
    'role_classification'
)

print(f"RoBERTa: Acc={metrics_r['accuracy']:.3f}, Balanced Acc={metrics_r['balanced_accuracy']:.3f}, F1={metrics_r['f1_weighted']:.3f}")
if metrics_r['auroc_weighted']:
    print(f"AUROC Weighted={metrics_r['auroc_weighted']:.3f}")
print(classification_report(test_labels_role, pred_r, target_names=role_names, zero_division=0))

In [13]:
class MultiLabelTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if hasattr(self.texts, 'iloc') else str(self.texts[idx])
        text = text.strip() if text and text != 'nan' else ""
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class MultiLabelTransformer(nn.Module):
    def __init__(self, model_name, num_labels, model_class):
        super().__init__()
        self.transformer = model_class.from_pretrained(
            model_name, 
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
            problem_type="multi_label_classification"
        )
        
        for name, param in self.transformer.named_parameters():
            if 'classifier' not in name and 'pooler' not in name:
                param.requires_grad = False
        
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = self.sigmoid(logits)
        
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())
            return {'loss': loss, 'logits': logits}
        
        return {'logits': logits}

def train_multilabel_transformer(model_name, tokenizer_class, model_class, train_texts, train_labels, test_texts, test_labels, num_labels):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = MultiLabelTransformer(model_name, num_labels, model_class).to(device)
    
    train_dataset = MultiLabelTextDataset(train_texts, train_labels, tokenizer)
    test_dataset = MultiLabelTextDataset(test_texts, test_labels, tokenizer)
    
    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=500,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=None,
        save_total_limit=1,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    
    print("Training multi-label model...")
    trainer.train()
    
    predictions = trainer.predict(test_dataset)
    pred_probs = torch.sigmoid(torch.tensor(predictions.predictions))
    pred_labels = (pred_probs > 0.5).numpy().astype(int)
    
    del model, trainer
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
    return pred_labels, pred_probs.numpy()

def save_multilabel_metrics(y_true, y_pred, y_proba, topic_names, task_name, model_name='roberta'):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    metrics = {}
    
    topic_metrics = []
    balanced_accuracies = []
    
    for i, topic_name in enumerate(topic_names):
        p, r, f1, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred[:, i], average='binary', zero_division=0
        )
        support = y_true[:, i].sum()
        
        balanced_acc = balanced_accuracy_score(y_true[:, i], y_pred[:, i])
        balanced_accuracies.append(balanced_acc)
        
        try:
            auroc = roc_auc_score(y_true[:, i], y_proba[:, i])
        except:
            auroc = None
        
        cm = confusion_matrix(y_true[:, i], y_pred[:, i])
        
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
        plt.title(f'{model_name.upper()} - {topic_name} Topic - Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'{results_dir}/{task_name}_{topic_name}_confusion_matrix_{timestamp}_fold_{fold_num}.png', 
                   dpi=300, bbox_inches='tight')
        plt.close()
        
        np.save(f'{results_dir}/{task_name}_{topic_name}_confusion_matrix_{timestamp}_fold_{fold_num}.npy', cm)
            
        topic_metrics.append({
            'topic': topic_name,
            'precision': p,
            'recall': r,
            'f1': f1,
            'balanced_accuracy': balanced_acc,
            'auroc': auroc,
            'support': support
        })
    
    subset_accuracy = np.mean(np.all(y_true == y_pred, axis=1))
    
    precisions = [m['precision'] for m in topic_metrics]
    recalls = [m['recall'] for m in topic_metrics]
    f1s = [m['f1'] for m in topic_metrics]
    aurocs = [m['auroc'] for m in topic_metrics if m['auroc'] is not None]
    
    metrics['subset_accuracy'] = subset_accuracy
    metrics['macro_precision'] = np.mean(precisions)
    metrics['macro_recall'] = np.mean(recalls)
    metrics['macro_f1'] = np.mean(f1s)
    metrics['macro_balanced_accuracy'] = np.mean(balanced_accuracies)
    if aurocs:
        metrics['macro_auroc'] = np.mean(aurocs)
    
    metrics['per_topic_metrics'] = topic_metrics
    
    with open(f'{results_dir}/{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2, default=str)
    
    print(f"Saved multi-label metrics for {task_name} to {results_dir}")
    
    return metrics

In [14]:
print("\nTOPIC CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):")
print("=" * 60)

cb_mask = target == 1
topic_names = ['disability', 'gender', 'intellectual', 'other', 'physical', 
               'political', 'race', 'religious', 'sexual', 'social_status']

train_texts_topic = text_data[train_idx & cb_mask]
test_texts_topic = text_data[test_idx & cb_mask]
train_labels_topic = topic_labels[train_idx & cb_mask].values
test_labels_topic = topic_labels[test_idx & cb_mask].values

if len(train_labels_topic) > 0 and len(test_labels_topic) > 0:
    pred_t, pred_probs_t = train_multilabel_transformer(
        'roberta-base',
        RobertaTokenizer,
        RobertaForSequenceClassification,
        train_texts_topic,
        train_labels_topic,
        test_texts_topic,
        test_labels_topic,
        num_labels=10
    )
    
    metrics_t = save_multilabel_metrics(
        test_labels_topic, pred_t, pred_probs_t,
        topic_names,
        'topic_classification',
        'roberta'
    )
    
    print(f"RoBERTa: Subset Acc={metrics_t['subset_accuracy']:.3f}, Macro F1={metrics_t['macro_f1']:.3f}, Macro Balanced Acc={metrics_t['macro_balanced_accuracy']:.3f}")
    if 'macro_auroc' in metrics_t:
        print(f"Macro AUROC={metrics_t['macro_auroc']:.3f}")
    
    print("\nPer-topic results:")
    for metric in metrics_t['per_topic_metrics']:
        print(f"{metric['topic']}: P={metric['precision']:.3f}, R={metric['recall']:.3f}, F1={metric['f1']:.3f}, Balanced Acc={metric['balanced_accuracy']:.3f}, Support={metric['support']}")
else:
    print("RoBERTa: No data available")


TOPIC CLASSIFICATION (CYBERBULLYING COMMENTS ONLY):


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training multi-label model...


  0%|          | 0/11100 [00:00<?, ?it/s]

{'loss': 0.3128, 'grad_norm': 0.2806260287761688, 'learning_rate': 0.0005, 'epoch': 0.9}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.2467968910932541, 'eval_runtime': 28.5129, 'eval_samples_per_second': 84.137, 'eval_steps_per_second': 5.261, 'epoch': 1.0}
{'loss': 0.2367, 'grad_norm': 0.20464543998241425, 'learning_rate': 0.00047641509433962265, 'epoch': 1.8}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.2291584461927414, 'eval_runtime': 28.4003, 'eval_samples_per_second': 84.471, 'eval_steps_per_second': 5.282, 'epoch': 2.0}
{'loss': 0.2278, 'grad_norm': 0.3070812523365021, 'learning_rate': 0.0004528301886792453, 'epoch': 2.7}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.21958403289318085, 'eval_runtime': 28.6388, 'eval_samples_per_second': 83.767, 'eval_steps_per_second': 5.238, 'epoch': 3.0}
{'loss': 0.2195, 'grad_norm': 0.3156946897506714, 'learning_rate': 0.00042924528301886797, 'epoch': 3.6}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.21027718484401703, 'eval_runtime': 28.2884, 'eval_samples_per_second': 84.805, 'eval_steps_per_second': 5.303, 'epoch': 4.0}
{'loss': 0.2166, 'grad_norm': 0.5019981265068054, 'learning_rate': 0.0004056603773584906, 'epoch': 4.5}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.2022838145494461, 'eval_runtime': 28.553, 'eval_samples_per_second': 84.019, 'eval_steps_per_second': 5.253, 'epoch': 5.0}
{'loss': 0.2099, 'grad_norm': 0.3687780797481537, 'learning_rate': 0.00038207547169811324, 'epoch': 5.41}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19931450486183167, 'eval_runtime': 28.5062, 'eval_samples_per_second': 84.157, 'eval_steps_per_second': 5.262, 'epoch': 6.0}
{'loss': 0.2056, 'grad_norm': 0.3974687457084656, 'learning_rate': 0.0003584905660377358, 'epoch': 6.31}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1945003867149353, 'eval_runtime': 28.4257, 'eval_samples_per_second': 84.395, 'eval_steps_per_second': 5.277, 'epoch': 7.0}
{'loss': 0.2055, 'grad_norm': 0.3424472510814667, 'learning_rate': 0.00033490566037735846, 'epoch': 7.21}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19173549115657806, 'eval_runtime': 28.3264, 'eval_samples_per_second': 84.691, 'eval_steps_per_second': 5.295, 'epoch': 8.0}
{'loss': 0.2023, 'grad_norm': 0.3521493077278137, 'learning_rate': 0.00031132075471698115, 'epoch': 8.11}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19038289785385132, 'eval_runtime': 28.3036, 'eval_samples_per_second': 84.76, 'eval_steps_per_second': 5.3, 'epoch': 9.0}
{'loss': 0.2011, 'grad_norm': 0.41174817085266113, 'learning_rate': 0.0002877358490566038, 'epoch': 9.01}
{'loss': 0.1994, 'grad_norm': 0.46503838896751404, 'learning_rate': 0.0002641509433962264, 'epoch': 9.91}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19291165471076965, 'eval_runtime': 28.5455, 'eval_samples_per_second': 84.041, 'eval_steps_per_second': 5.255, 'epoch': 10.0}
{'loss': 0.2002, 'grad_norm': 0.35034558176994324, 'learning_rate': 0.00024056603773584906, 'epoch': 10.81}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.19094610214233398, 'eval_runtime': 28.4986, 'eval_samples_per_second': 84.179, 'eval_steps_per_second': 5.263, 'epoch': 11.0}
{'loss': 0.1978, 'grad_norm': 0.5116296410560608, 'learning_rate': 0.00021698113207547172, 'epoch': 11.71}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18959462642669678, 'eval_runtime': 30.8535, 'eval_samples_per_second': 77.755, 'eval_steps_per_second': 4.862, 'epoch': 12.0}
{'loss': 0.2002, 'grad_norm': 0.33327987790107727, 'learning_rate': 0.00019339622641509436, 'epoch': 12.61}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18833553791046143, 'eval_runtime': 30.4169, 'eval_samples_per_second': 78.87, 'eval_steps_per_second': 4.931, 'epoch': 13.0}
{'loss': 0.1942, 'grad_norm': 0.38559725880622864, 'learning_rate': 0.00016981132075471697, 'epoch': 13.51}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18678200244903564, 'eval_runtime': 30.4584, 'eval_samples_per_second': 78.763, 'eval_steps_per_second': 4.925, 'epoch': 14.0}
{'loss': 0.1967, 'grad_norm': 0.29621902108192444, 'learning_rate': 0.00014622641509433963, 'epoch': 14.41}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18613189458847046, 'eval_runtime': 30.4223, 'eval_samples_per_second': 78.857, 'eval_steps_per_second': 4.931, 'epoch': 15.0}
{'loss': 0.195, 'grad_norm': 0.3975958526134491, 'learning_rate': 0.00012264150943396227, 'epoch': 15.32}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18669281899929047, 'eval_runtime': 30.2952, 'eval_samples_per_second': 79.187, 'eval_steps_per_second': 4.951, 'epoch': 16.0}
{'loss': 0.1955, 'grad_norm': 0.45754286646842957, 'learning_rate': 9.905660377358492e-05, 'epoch': 16.22}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18909522891044617, 'eval_runtime': 30.8465, 'eval_samples_per_second': 77.772, 'eval_steps_per_second': 4.863, 'epoch': 17.0}
{'loss': 0.1941, 'grad_norm': 0.3573600649833679, 'learning_rate': 7.547169811320755e-05, 'epoch': 17.12}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.1850784569978714, 'eval_runtime': 30.0582, 'eval_samples_per_second': 79.812, 'eval_steps_per_second': 4.99, 'epoch': 18.0}
{'loss': 0.1941, 'grad_norm': 0.3760596811771393, 'learning_rate': 5.1886792452830194e-05, 'epoch': 18.02}
{'loss': 0.1928, 'grad_norm': 0.5240703821182251, 'learning_rate': 2.830188679245283e-05, 'epoch': 18.92}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18545693159103394, 'eval_runtime': 30.5869, 'eval_samples_per_second': 78.432, 'eval_steps_per_second': 4.904, 'epoch': 19.0}
{'loss': 0.1926, 'grad_norm': 0.3731459677219391, 'learning_rate': 4.716981132075472e-06, 'epoch': 19.82}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.18524326384067535, 'eval_runtime': 20.5195, 'eval_samples_per_second': 116.913, 'eval_steps_per_second': 7.31, 'epoch': 20.0}


Could not locate the best model at ./results_temp/checkpoint-9990/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 3362.9423, 'train_samples_per_second': 52.734, 'train_steps_per_second': 3.301, 'train_loss': 0.20853446754249366, 'epoch': 20.0}


FileNotFoundError: [Errno 2] No such file or directory: './results_temp/checkpoint-9990'