In [1]:
# Short tag used to build RESULTS_DIR below.
# NOTE(review): the tag says 'roberta' while config['model_name'] loads
# 'unitary/toxic-bert' — confirm which is intended, otherwise results are
# filed under a directory name that does not match the model that produced them.
MODEL_NAME = 'roberta'
# Directory that holds the partition CSV files.
DATA_DIR = '5folds'
# Filename template; {part_num} is filled in with the partition number when loading.
FILE_PATTERN = 'ml_data_part0{part_num}.csv'
# Per-fold metric JSON files are written under this directory.
RESULTS_DIR = f'results/{MODEL_NAME}'
# Number of cross-validation folds iterated by the training loop.
NUM_FOLDS = 5

# Model settings. Only 'model_name' is read by the code visible in this notebook;
# 'tokenizer_class' and 'model_class' are informational (the Auto* classes are used directly).
config = {
    'model_name': 'unitary/toxic-bert',
    'tokenizer_class': 'AutoTokenizer',
    'model_class': 'AutoModelForSequenceClassification'
}
# Echo the effective settings so they are recorded in the cell output.
print(f"Using model: {config['model_name']}")
print(f"Results will be saved to: {RESULTS_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"File pattern: {FILE_PATTERN}")


Using model: unitary/toxic-bert
Results will be saved to: results/roberta
Data directory: 5folds
File pattern: ml_data_part0{part_num}.csv


## Task Configuration

In [2]:
# Declarative description of every task run by the cross-validation loop.
# Per task:
#   name              - human-readable title
#   enabled           - skip the task entirely when False
#   num_labels        - size of the classifier output layer
#   class_names /     - label names, in label-index order (single-label tasks use
#   topic_names         'class_names', the multi-label task uses 'topic_names')
#   filter_cb_only    - whether the task is meant to use cyberbullying rows only
#   problem_type      - 'single_label' or 'multi_label'
#   training_function - name of the training routine to dispatch to
TASK_CONFIG = {
    'binary': {
        'name': 'Binary Cyberbullying Classification',
        'enabled': True,
        'num_labels': 2,
        'class_names': ['Non-CB', 'CB'],
        'filter_cb_only': False,
        'problem_type': 'single_label',
        'training_function': 'train_transformer_model'
    },
    'severity': {
        'name': 'Severity Classification',
        'enabled': True,
        'num_labels': 3,
        'class_names': ['mild', 'moderate', 'severe'],
        'filter_cb_only': True,  # CB only, 3 classes
        'problem_type': 'single_label',
        'training_function': 'train_transformer_model'
    },
    'role': {
        'name': 'Role Classification',
        'enabled': True,
        'num_labels': 7,
        'class_names': ['bully', 'bully_assistant', 'aggressive_victim', 'aggressive_defender',
                       'passive_bystander', 'non_aggressive_victim', 'aggressive_defender_noncb'],
        'filter_cb_only': False,
        'problem_type': 'single_label',
        'training_function': 'train_transformer_model'
    },
    'topic': {
        'name': 'Topic Classification (Multi-label)',
        'enabled': True,
        'num_labels': 10,
        'topic_names': ['disability', 'gender', 'intellectual', 'other', 'physical',
                       'political', 'race', 'religious', 'sexual', 'social_status'],
        'filter_cb_only': True,
        'problem_type': 'multi_label',
        'training_function': 'train_multilabel_model'
    }
}
for task_key, task_info in TASK_CONFIG.items():
    # Fail fast on a config typo: the output layer size must match the label list.
    label_names = task_info.get('class_names', task_info.get('topic_names'))
    assert label_names is not None and len(label_names) == task_info['num_labels'], (
        f"{task_key}: num_labels does not match the number of label names"
    )
    status = "ENABLED" if task_info['enabled'] else "DISABLED"
    # Previously `status` was computed and thrown away; report it.
    print(f"{task_key}: {task_info['name']} [{status}]")


## Imports

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    balanced_accuracy_score, precision_score,
    recall_score, f1_score
)
import json
import os
from datetime import datetime
from collections import defaultdict
import warnings
# Install a global filter that silences every Python warning from here on.
# NOTE(review): this is a blanket filter, so library deprecation and
# undefined-metric warnings are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')
# Run on the GPU when PyTorch reports CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Make sure the output directory exists (no error if it is already there).
os.makedirs(RESULTS_DIR, exist_ok=True)


## Helper Functions

In [4]:
def get_target(data):
    """Return the binary target as an int Series: 1 where ``cyberbullying == 't'``, else 0."""
    is_cyberbullying = data['cyberbullying'].eq('t')
    return is_cyberbullying.astype(int)
def create_severity_labels(data):
    """Encode ``bullying_severity`` as integers.

    ``no_severity`` -> 0, ``mild`` -> 1, ``moderate`` -> 2, ``severe`` -> 3.
    Missing or unrecognised values are encoded as 0.

    Returns:
        An int Series sharing the index of ``data``.
    """
    codes = {'no_severity': 0, 'mild': 1, 'moderate': 2, 'severe': 3}
    encoded = data['bullying_severity'].map(codes)
    # Anything the mapping did not cover comes back as NaN; fold it into code 0.
    return encoded.fillna(0).astype(int)
def create_topic_labels_multilabel(data):
    """Build the multi-label topic indicator matrix.

    Each ``has_*`` column is turned into 0/1 (1 where the cell equals ``'t'``).

    Returns:
        A tuple ``(labels, topic_cols)`` where ``labels`` is a 2-D array with one
        column per entry of ``topic_cols``, in the same order.
    """
    topic_cols = [
        'has_disability',
        'has_gender',
        'has_intellectual',
        'has_none',
        'has_physical',
        'has_political',
        'has_race',
        'has_religious',
        'has_sexual',
        'has_social_status',
    ]

    def _to_indicator(column):
        return (column == 't').astype(int)

    indicators = data[topic_cols].apply(_to_indicator)
    return indicators.values, topic_cols
def create_role_labels(data):
    """Encode ``bullying_role`` as an integer array.

    Roles are numbered by their position in ``role_order`` below. Missing or
    unrecognised roles are encoded as 4 (the code of ``passive_bystander``).

    Returns:
        A 1-D NumPy int array, one entry per row of ``data``.
    """
    role_order = [
        'bully',
        'bully_assistant',
        'aggressive_victim',
        'aggressive_defender',
        'passive_bystander',
        'non_aggressive_victim',
        'aggressive_defender_noncb',
    ]
    role_map = {role: code for code, role in enumerate(role_order)}
    encoded = data['bullying_role'].map(role_map)
    encoded = encoded.fillna(4).astype(int)
    return encoded.values

## Metrics Functions

In [5]:
def save_metrics(y_true, y_pred, y_proba, class_names, task_name, fold_num, model_name=MODEL_NAME):
    """Compute single-label classification metrics for one fold and save them as JSON.

    Args:
        y_true: True integer labels (array-like).
        y_pred: Predicted integer labels (array-like).
        y_proba: Class scores; 2-D ``(n_samples, n_classes)`` or, for a binary
            task, optionally 1-D positive-class scores. May be ``None``, in
            which case no AUROC is computed.
        class_names: Label names in label-index order, or ``None``. When given,
            its length defines the task's label space.
        task_name: Prefix of the output filename.
        fold_num: Fold number, used in the output filename.
        model_name: Unused; kept for interface compatibility.

    Returns:
        dict of metrics. The same dict is written to
        ``{RESULTS_DIR}/{task_name}_metrics_fold_{fold_num}.json``.

    Notes:
        * Whether the task is binary is decided from the label space
          (``class_names`` / ``y_proba`` width), not from how many classes
          happen to occur in ``y_true``. A multi-class fold that only contains
          two classes therefore no longer hits ``average='binary'``.
        * Per-class lists are aligned to the label space, so index ``i`` always
          refers to class ``i`` even if that class is absent from the fold.
        * AUROC failures are recorded under ``'auroc_error'`` instead of being
          silently dropped.
    """
    from sklearn.metrics import roc_auc_score

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_proba = None if y_proba is None else np.asarray(y_proba)

    observed_classes = np.unique(y_true)
    proba_is_2d = y_proba is not None and y_proba.ndim > 1
    if class_names is not None:
        num_total_classes = len(class_names)
    elif proba_is_2d:
        num_total_classes = y_proba.shape[1]
    else:
        num_total_classes = len(observed_classes)
    # Only pin the label list when the label space is actually known.
    label_space_known = class_names is not None or proba_is_2d
    per_class_labels = list(range(num_total_classes)) if label_space_known else None
    is_binary_task = num_total_classes == 2

    metrics = {}
    metrics['accuracy'] = float(accuracy_score(y_true, y_pred))
    metrics['balanced_accuracy'] = float(balanced_accuracy_score(y_true, y_pred))
    metrics['f1_macro'] = float(f1_score(y_true, y_pred, average='macro', zero_division=0))
    metrics['f1_micro'] = float(f1_score(y_true, y_pred, average='micro', zero_division=0))
    metrics['f1_weighted'] = float(f1_score(y_true, y_pred, average='weighted', zero_division=0))
    if is_binary_task:
        metrics['f1_binary'] = float(f1_score(y_true, y_pred, average='binary', zero_division=0))
    metrics['f1_per_class'] = f1_score(
        y_true, y_pred, labels=per_class_labels, average=None, zero_division=0
    ).tolist()
    metrics['precision_macro'] = float(precision_score(y_true, y_pred, average='macro', zero_division=0))
    metrics['precision_weighted'] = float(precision_score(y_true, y_pred, average='weighted', zero_division=0))
    metrics['recall_macro'] = float(recall_score(y_true, y_pred, average='macro', zero_division=0))
    metrics['recall_weighted'] = float(recall_score(y_true, y_pred, average='weighted', zero_division=0))
    metrics['precision_per_class'] = precision_score(
        y_true, y_pred, labels=per_class_labels, average=None, zero_division=0
    ).tolist()
    metrics['recall_per_class'] = recall_score(
        y_true, y_pred, labels=per_class_labels, average=None, zero_division=0
    ).tolist()

    if y_proba is not None and is_binary_task:
        # AUROC is undefined unless both classes occur in y_true.
        if observed_classes.size == 2:
            positive_scores = y_proba[:, 1] if (proba_is_2d and y_proba.shape[1] == 2) else y_proba
            try:
                metrics['auroc'] = float(roc_auc_score(y_true, positive_scores))
            except ValueError as exc:
                metrics['auroc_error'] = str(exc)
    elif proba_is_2d:
        # One-vs-rest AUROC per class; classes that are absent (or the only
        # class present) in this fold are left as None.
        per_class_auroc = [None] * num_total_classes
        aucs = []
        supports = []
        for c in range(min(num_total_classes, y_proba.shape[1])):
            y_bin = (y_true == c).astype(int)
            if np.unique(y_bin).size < 2:
                continue
            try:
                auc_c = float(roc_auc_score(y_bin, y_proba[:, c]))
            except ValueError:
                continue
            per_class_auroc[c] = auc_c
            aucs.append(auc_c)
            supports.append(int(y_bin.sum()))
        metrics['auroc_per_class'] = per_class_auroc
        if aucs:
            metrics['auroc_macro'] = float(np.mean(aucs))
            # Every counted class has at least one positive, so weights are > 0.
            metrics['auroc_weighted'] = float(np.average(aucs, weights=supports))

    with open(f'{RESULTS_DIR}/{task_name}_metrics_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2, default=str)
    return metrics
def save_multilabel_metrics(y_true, y_pred, y_proba, topic_names, task_name, fold_num, model_name=MODEL_NAME):
    """Compute multi-label metrics for one fold and save them as JSON.

    Args:
        y_true: 2-D 0/1 indicator array ``(n_samples, n_topics)``.
        y_pred: 2-D 0/1 predictions with the same shape as ``y_true``.
        y_proba: 2-D scores with the same shape, used for AUROC.
        topic_names: Topic names in column order.
        task_name: Prefix of the output filename.
        fold_num: Fold number, used in the output filename.
        model_name: Unused; kept for interface compatibility.

    Returns:
        dict of metrics. The same dict is written to
        ``{RESULTS_DIR}/{task_name}_metrics_fold_{fold_num}.json``.

    Notes:
        Topics with no positive example in ``y_true`` are skipped and do not
        contribute to the macro averages. Per-topic values are stored as native
        Python numbers so the JSON contains numbers (the NumPy integer support
        was previously serialised as a string by ``default=str``).
    """
    from sklearn.metrics import roc_auc_score

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_proba = np.asarray(y_proba)

    def _mean_or_nan(values):
        # np.mean([]) also yields NaN, but emits a RuntimeWarning on the way.
        return float(np.mean(values)) if len(values) else float('nan')

    metrics = {}
    topic_metrics = []
    balanced_accuracies = []
    aurocs = []
    balanced_accuracy_per_topic = {}
    auroc_per_topic = {}
    f1_per_topic = {}
    for i, topic_name in enumerate(topic_names):
        topic_true = y_true[:, i]
        support = int(topic_true.sum())
        if support == 0:
            continue
        p, r, f1, _ = precision_recall_fscore_support(
            topic_true, y_pred[:, i], average='binary', zero_division=0
        )
        balanced_acc = float(balanced_accuracy_score(topic_true, y_pred[:, i]))
        balanced_accuracies.append(balanced_acc)
        balanced_accuracy_per_topic[topic_name] = balanced_acc
        f1_per_topic[topic_name] = float(f1)

        auroc = float('nan')
        if y_proba.ndim > 1 and y_proba.shape[1] > i:
            try:
                auroc = float(roc_auc_score(topic_true, y_proba[:, i]))
                aurocs.append(auroc)
            except ValueError:
                # AUROC is undefined when the fold holds a single class for this topic.
                auroc = float('nan')
        auroc_per_topic[topic_name] = None if np.isnan(auroc) else auroc

        topic_metrics.append({
            'topic': topic_name,
            'precision': float(p),
            'recall': float(r),
            'f1': float(f1),
            # Kept for backward compatibility: both are the per-topic binary F1.
            'f1_macro': float(f1),
            'f1_micro': float(f1),
            'balanced_accuracy': balanced_acc,
            'auroc': auroc,
            'support': support
        })

    metrics['subset_accuracy'] = float(np.mean(np.all(y_true == y_pred, axis=1)))
    metrics['subset_balanced_accuracy'] = _mean_or_nan(balanced_accuracies)
    metrics['macro_precision'] = _mean_or_nan([m['precision'] for m in topic_metrics])
    metrics['macro_recall'] = _mean_or_nan([m['recall'] for m in topic_metrics])
    metrics['macro_f1'] = _mean_or_nan([m['f1'] for m in topic_metrics])
    metrics['macro_balanced_accuracy'] = _mean_or_nan(balanced_accuracies)
    metrics['balanced_accuracy_per_topic'] = balanced_accuracy_per_topic
    metrics['f1_per_topic'] = f1_per_topic
    metrics['auroc_per_topic'] = auroc_per_topic
    if aurocs:
        # `aurocs` only ever receives successfully computed (non-NaN) values.
        metrics['macro_auroc'] = float(np.mean(aurocs))
    metrics['micro_f1'] = float(f1_score(y_true, y_pred, average='micro', zero_division=0))
    metrics['weighted_f1'] = float(f1_score(y_true, y_pred, average='weighted', zero_division=0))
    metrics['per_topic_metrics'] = topic_metrics
    with open(f'{RESULTS_DIR}/{task_name}_metrics_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2, default=str)
    return metrics
def aggregate_metrics(all_fold_metrics):
    """Summarise scalar metrics across folds as mean and standard deviation.

    Args:
        all_fold_metrics: List of per-fold metric dicts.

    Returns:
        dict with ``<key>_mean`` and ``<key>_std`` for every scalar metric that
        has at least one finite-or-infinite (non-NaN) numeric value in any fold.
        Returns ``{}`` for an empty input.

    Notes:
        Keys are collected from *all* folds, in first-seen order. Previously
        only the first fold's keys were used, so a metric missing from fold 1
        (e.g. AUROC that could not be computed there) was dropped entirely even
        when later folds reported it. Non-numeric and NaN entries are ignored.
    """
    if not all_fold_metrics:
        return {}

    # Per-class / per-topic lists are not scalars and are never aggregated.
    excluded = {'per_topic_metrics', 'f1_per_class', 'precision_per_class', 'recall_per_class'}
    numeric_types = (int, float, np.integer, np.floating)

    # dict.fromkeys keeps insertion order while removing duplicates.
    metric_keys = list(dict.fromkeys(
        key
        for fold_metrics in all_fold_metrics
        for key in fold_metrics
        if key not in excluded
    ))

    agg = {}
    for key in metric_keys:
        values = [
            fold_metrics[key]
            for fold_metrics in all_fold_metrics
            if key in fold_metrics
            and isinstance(fold_metrics[key], numeric_types)
            and not np.isnan(fold_metrics[key])
        ]
        if values:
            agg[f'{key}_mean'] = float(np.mean(values))
            agg[f'{key}_std'] = float(np.std(values))
    return agg

## Dataset Classes

In [14]:
class TextDataset(Dataset):
    """Single-label text dataset that tokenises one example per lookup.

    ``texts`` and ``labels`` may be pandas objects (read positionally through
    ``.iloc``) or plain indexable sequences. Each item is a dict holding
    flattened ``input_ids`` and ``attention_mask`` tensors, padded/truncated to
    ``max_length``, plus an integer (``torch.long``) ``labels`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # pandas containers are addressed by position, everything else by plain indexing
        if hasattr(self.texts, 'iloc'):
            raw_text = self.texts.iloc[idx]
        else:
            raw_text = self.texts[idx]

        encoded = self.tokenizer(
            str(raw_text),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        if hasattr(self.labels, 'iloc'):
            label_value = self.labels.iloc[idx]
        else:
            label_value = self.labels[idx]

        item = {}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        item['labels'] = torch.tensor(label_value, dtype=torch.long)
        return item
class MultiLabelTextDataset(Dataset):
    """Multi-label text dataset that tokenises one example per lookup.

    ``texts`` and ``labels`` may be pandas objects (read positionally through
    ``.iloc``) or plain indexable sequences/arrays. Text that is empty or the
    literal string ``'nan'`` is replaced by an empty string; other text is
    whitespace-stripped. Each item carries a float ``labels`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        if hasattr(self.texts, 'iloc'):
            raw_text = str(self.texts.iloc[idx])
        else:
            raw_text = str(self.texts[idx])

        # The 'nan' check runs on the unstripped string.
        if not raw_text or raw_text == 'nan':
            cleaned_text = ""
        else:
            cleaned_text = raw_text.strip()

        encoded = self.tokenizer(
            cleaned_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        if hasattr(self.labels, 'iloc'):
            label_row = self.labels.iloc[idx]
        else:
            label_row = self.labels[idx]

        item = {}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        item['labels'] = torch.tensor(label_row, dtype=torch.float)
        return item

## Model Training Functions

In [15]:
def freeze_base_model(model):
    """Freeze every parameter except those of the classifier and pooler.

    A parameter is left untouched when its name contains ``'classifier'`` or
    ``'pooler'``; all others get ``requires_grad = False``. The model is
    modified in place and also returned.
    """
    kept_trainable = ('classifier', 'pooler')
    for param_name, parameter in model.named_parameters():
        if any(marker in param_name for marker in kept_trainable):
            continue
        parameter.requires_grad = False
    return model

def get_tokenizer_and_model(config, num_labels, problem_type="single_label_classification"):
    """Load the tokenizer and a sequence-classification model for ``config['model_name']``.

    Args:
        config: Mapping with a ``'model_name'`` entry naming the checkpoint.
        num_labels: Size of the classification head.
        problem_type: Problem type passed through to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``, with the model already moved to ``device``.
    """
    checkpoint = config['model_name']
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model_options = dict(
        num_labels=num_labels,
        # allow a freshly sized head when the checkpoint's head has another shape
        ignore_mismatched_sizes=True,
        problem_type=problem_type,
        classifier_dropout=0.1,
    )
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **model_options)
    return tokenizer, model.to(device)

def train_transformer_model(train_texts, train_labels, test_texts, test_labels, num_labels, task_name,
                            val_size=0.1, seed=42):
    """Train a single-label classifier head and predict on the test fold.

    Args:
        train_texts, train_labels: Training examples and integer labels.
        test_texts, test_labels: Held-out fold, used only for the final prediction.
        num_labels: Number of output classes.
        task_name: Task identifier (currently informational).
        val_size: Fraction of the training data held out for early stopping and
            best-checkpoint selection. Pass ``0``/``None`` to restore the old
            behaviour of evaluating on the test fold during training.
        seed: Seed for the validation split and for the Trainer.

    Returns:
        ``(pred_labels, pred_probs)``: argmax class per test example and the
        softmax probabilities, both as NumPy arrays.

    Notes:
        Early stopping and ``load_best_model_at_end`` previously used the test
        fold as ``eval_dataset``, so the checkpoint was chosen on the same data
        the metrics are reported on (optimistically biased). Model selection now
        uses a validation split carved out of the training data.
    """
    tokenizer, model = get_tokenizer_and_model(config, num_labels, "single_label_classification")
    model = freeze_base_model(model)

    test_dataset = TextDataset(test_texts, test_labels, tokenizer)
    if val_size and len(train_texts) > 1:
        fit_texts, val_texts, fit_labels, val_labels = train_test_split(
            train_texts, train_labels, test_size=val_size, random_state=seed
        )
        eval_dataset = TextDataset(val_texts, val_labels, tokenizer)
    else:
        fit_texts, fit_labels = train_texts, train_labels
        eval_dataset = test_dataset
    train_dataset = TextDataset(fit_texts, fit_labels, tokenizer)

    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep disk usage bounded; the best checkpoint is always retained
        # when load_best_model_at_end is on.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=seed,
        # The string "none" disables logging integrations; None does not.
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    trainer.train()

    predictions = trainer.predict(test_dataset)
    pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
    pred_labels = np.argmax(predictions.predictions, axis=1)

    # Release the model before the next fold/task is loaded.
    del model, trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return pred_labels, pred_probs

class MultiLabelTransformer(nn.Module):
    """Thin ``nn.Module`` wrapper around a multi-label sequence-classification model.

    The wrapped model is loaded with ``problem_type="multi_label_classification"``
    and is available as ``self.model``.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        load_options = {
            'num_labels': num_labels,
            'problem_type': "multi_label_classification",
            'ignore_mismatched_sizes': True,
        }
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, **load_options)

    def forward(self, input_ids, attention_mask, labels=None):
        """Delegate to the wrapped model and return its output unchanged."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

def train_multilabel_model(train_texts, train_labels, test_texts, test_labels, num_labels, task_name,
                           val_size=0.1, seed=42):
    """Train a multi-label classifier head and predict on the test fold.

    Args:
        train_texts, train_labels: Training examples and 0/1 indicator rows.
        test_texts, test_labels: Held-out fold, used only for the final prediction.
        num_labels: Number of output labels.
        task_name: Task identifier (currently informational).
        val_size: Fraction of the training data held out for early stopping and
            best-checkpoint selection. Pass ``0``/``None`` to restore the old
            behaviour of evaluating on the test fold during training.
        seed: Seed for the validation split and for the Trainer.

    Returns:
        ``(pred_labels, pred_probs)``: 0/1 predictions (probability > 0.5) and
        the sigmoid probabilities, both as NumPy arrays.

    Notes:
        * The tokenizer is loaded on its own. Previously
          ``get_tokenizer_and_model`` was called just for the tokenizer, which
          also loaded a full second model onto the device and discarded it.
        * Model selection uses a validation split of the training data instead
          of the test fold, so reported test metrics are not biased by
          checkpoint selection.
    """
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    model = MultiLabelTransformer(
        config['model_name'],
        num_labels
    ).to(device)
    model = freeze_base_model(model)

    test_dataset = MultiLabelTextDataset(test_texts, test_labels, tokenizer)
    if val_size and len(train_texts) > 1:
        fit_texts, val_texts, fit_labels, val_labels = train_test_split(
            train_texts, train_labels, test_size=val_size, random_state=seed
        )
        eval_dataset = MultiLabelTextDataset(val_texts, val_labels, tokenizer)
    else:
        fit_texts, fit_labels = train_texts, train_labels
        eval_dataset = test_dataset
    train_dataset = MultiLabelTextDataset(fit_texts, fit_labels, tokenizer)

    training_args = TrainingArguments(
        output_dir='./results_temp',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=5e-4,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep disk usage bounded; the best checkpoint is always retained
        # when load_best_model_at_end is on.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=seed,
        # The string "none" disables logging integrations; None does not.
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    trainer.train()

    predictions = trainer.predict(test_dataset)
    pred_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
    pred_labels = (pred_probs > 0.5).astype(int)

    # Release the model before the next fold/task is loaded.
    del model, trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return pred_labels, pred_probs

In [None]:
# Per-task list of per-fold metric dicts, filled by the loop below.
all_results = {task_key: [] for task_key in TASK_CONFIG}

# Read every partition exactly once; the fold loop only re-combines them.
# (The files were previously re-read from disk on every fold, and the number
# of partitions was hard-coded instead of following NUM_FOLDS.)
all_partitions = [
    pd.read_csv(f"{DATA_DIR}/{FILE_PATTERN.format(part_num=part_num)}")
    for part_num in range(1, NUM_FOLDS + 1)
]

for fold_num in range(1, NUM_FOLDS + 1):
    # Partition `fold_num` is the test fold; the remaining partitions form the training set.
    test_data = all_partitions[fold_num - 1]
    train_data = pd.concat(
        [part for i, part in enumerate(all_partitions) if i != fold_num - 1],
        ignore_index=True
    )
    # Train rows first, then test rows, so both can be addressed with boolean masks.
    data = pd.concat([train_data, test_data], ignore_index=True)
    train_idx = np.arange(len(data)) < len(train_data)
    test_idx = ~train_idx

    text_data = data['comment_content'].fillna('')
    target = get_target(data)
    severity_labels = create_severity_labels(data)
    role_labels = create_role_labels(data)
    topic_labels, topic_cols = create_topic_labels_multilabel(data)
    cb_mask = (target == 1).values  # rows labelled as cyberbullying

    for task_key, task_config in TASK_CONFIG.items():
        if not task_config['enabled']:
            continue

        num_labels = task_config['num_labels']
        class_names = task_config.get('class_names')

        if task_key == 'binary':
            train_texts = text_data[train_idx].reset_index(drop=True)
            test_texts = text_data[test_idx].reset_index(drop=True)
            train_labels = target[train_idx].values
            test_labels = target[test_idx].values
        elif task_key == 'severity':
            cb_train_idx = train_idx & cb_mask
            cb_test_idx = test_idx & cb_mask
            if cb_train_idx.sum() == 0 or cb_test_idx.sum() == 0:
                continue
            # Shift 1..3 -> 0..2; rows with code 0 (no/unknown severity) become -1 and are dropped.
            sev_train = severity_labels[cb_train_idx].values - 1
            sev_test = severity_labels[cb_test_idx].values - 1
            valid_train_mask = sev_train >= 0
            valid_test_mask = sev_test >= 0
            if valid_train_mask.sum() == 0 or valid_test_mask.sum() == 0:
                continue
            train_texts = text_data[cb_train_idx][valid_train_mask].reset_index(drop=True)
            test_texts = text_data[cb_test_idx][valid_test_mask].reset_index(drop=True)
            train_labels = sev_train[valid_train_mask]
            test_labels = sev_test[valid_test_mask]
        elif task_key == 'role':
            train_texts = text_data[train_idx].reset_index(drop=True)
            test_texts = text_data[test_idx].reset_index(drop=True)
            train_labels = role_labels[train_idx]
            test_labels = role_labels[test_idx]
        elif task_key == 'topic':
            cb_train_idx = train_idx & cb_mask
            cb_test_idx = test_idx & cb_mask
            if cb_train_idx.sum() == 0 or cb_test_idx.sum() == 0:
                continue
            train_texts = text_data[cb_train_idx].reset_index(drop=True)
            test_texts = text_data[cb_test_idx].reset_index(drop=True)
            train_labels = topic_labels[cb_train_idx]
            test_labels = topic_labels[cb_test_idx]
        else:
            # Without this, an unknown task would silently reuse the previous task's data.
            raise ValueError(f"No data preparation defined for task '{task_key}'")

        run_name = f"{task_key}_classification"
        if task_config['training_function'] == 'train_transformer_model':
            pred, pred_probs = train_transformer_model(
                train_texts, train_labels,
                test_texts, test_labels,
                num_labels, run_name
            )
            metrics = save_metrics(
                test_labels, pred, pred_probs,
                class_names,
                run_name,
                fold_num
            )
        elif task_config['training_function'] == 'train_multilabel_model':
            pred, pred_probs = train_multilabel_model(
                train_texts, train_labels,
                test_texts, test_labels,
                num_labels, run_name
            )
            metrics = save_multilabel_metrics(
                test_labels, pred, pred_probs,
                task_config['topic_names'],
                run_name,
                fold_num
            )
        else:
            # Without this, the previous task's `metrics` would be appended again.
            raise ValueError(
                f"Unknown training_function '{task_config['training_function']}' for task '{task_key}'"
            )

        all_results[task_key].append(metrics)
        print(f"Fold {fold_num}/{NUM_FOLDS} - {task_config['name']}: metrics saved")



In [None]:
# Cross-fold summary per task. The aggregates used to be computed and then
# discarded (every branch was an empty `pass`); they are now kept in
# `aggregated_results`, printed, and saved next to the per-fold JSON files.
aggregated_results = {}
for task_key, task_config in TASK_CONFIG.items():
    if not task_config['enabled'] or not all_results[task_key]:
        continue
    agg = aggregate_metrics(all_results[task_key])
    aggregated_results[task_key] = agg

    print(f"\n{task_config['name']} ({len(all_results[task_key])} folds)")
    for mean_key in (k for k in agg if k.endswith('_mean')):
        metric_name = mean_key[:-len('_mean')]
        print(f"  {metric_name}: {agg[mean_key]:.4f} ± {agg[f'{metric_name}_std']:.4f}")

with open(f'{RESULTS_DIR}/aggregated_metrics.json', 'w') as f:
    json.dump(aggregated_results, f, indent=2)
