In [1]:
# Directory that holds the CSV partitions; joined with FILE_PATTERN when loading.
DATA_DIR = '5folds'
# File name template for one partition; `part_num` is filled in via str.format.
FILE_PATTERN = 'ml_data_part0{part_num}.csv'
# Output directory for the per-fold and aggregated metric JSON files.
RESULTS_DIR = 'results/basicml_5fold'
# Number of cross-validation folds iterated by the training loop.
NUM_FOLDS = 5

## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
import json
import os
from datetime import datetime
from collections import defaultdict
# Create the output directory up front so later cells can write metric JSON files into it.
os.makedirs(RESULTS_DIR, exist_ok=True)

## Helper Functions

In [3]:
def get_target(data):
    """Return the binary cyberbullying target as an integer Series.

    Rows whose 'cyberbullying' column equals the string 't' become 1;
    every other value becomes 0.
    """
    is_cyberbullying = data['cyberbullying'].eq('t')
    return is_cyberbullying.astype(int)
    
def get_text_features(data, vectorizer=None):
    """Turn the 'comment_content' column into TF-IDF features.

    Parameters
    ----------
    data : DataFrame with a 'comment_content' column. Missing comments are
        treated as empty strings and every value is cast to str.
    vectorizer : optional, already-fitted vectorizer. When given, the comments
        are only transformed with it (no refit), which lets a caller fit on the
        training split and reuse the same vocabulary/IDF weights for held-out
        data. When omitted, a new TfidfVectorizer (1000 features, English stop
        words) is fitted on `data`, exactly as before.

    Returns
    -------
    (features, vectorizer) : the feature matrix and the vectorizer that
        produced it.
    """
    comments = data['comment_content'].fillna('').astype(str)
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        features = vectorizer.fit_transform(comments)
    else:
        # Reuse the supplied vocabulary so no statistics are learned from `data`.
        features = vectorizer.transform(comments)
    return features, vectorizer

def get_severity_labels(data):
    """Encode 'bullying_severity' as integers.

    'mild' -> 1, 'moderate' -> 2, 'severe' -> 3; any missing or unmapped
    value becomes 0. Returns an integer Series.
    """
    severity_codes = {'mild': 1, 'moderate': 2, 'severe': 3}
    encoded = data['bullying_severity'].map(severity_codes)
    return encoded.fillna(0).astype(int)
    
def get_topic_labels_multilabel(data):
    """Build the multi-label topic indicator matrix.

    Each `has_*` column is encoded as 1 for 't', 0 for 'f' and 0 for missing
    values. Rows whose 'cyberbullying' column is not 't' have every topic
    forced to 0.

    Returns
    -------
    (labels, topic_cols) : a 2-D integer array with one column per topic, and
        the list of topic column names in the same order.
    """
    topic_cols = [
        'has_race', 'has_political', 'has_intellectual', 'has_physical',
        'has_social_status', 'has_gender', 'has_none', 'has_religious',
        'has_disability', 'has_sexual'
    ]
    encoded = data[topic_cols].replace({'t': 1, 'f': 0}).fillna(0).astype(int)
    is_cb = (data['cyberbullying'] == 't').values
    # Keep the encoded flags only on cyberbullying rows; blank out all others.
    labels = np.where(is_cb[:, None], encoded.values, 0)
    return labels, topic_cols
    
def get_role_labels(data, unknown_label=0):
    """Encode 'bullying_role' as integer class ids.

    Parameters
    ----------
    data : DataFrame with a 'bullying_role' column.
    unknown_label : integer assigned to rows whose role is missing or not in
        the mapping below. The default of 0 keeps the historical behaviour,
        but note that 0 is also the id of 'bully', so such rows become
        indistinguishable from real 'bully' rows; pass a different value
        (e.g. -1) to be able to filter them out.

    Returns
    -------
    numpy integer array with one class id per row.
    """
    role_map = {
        'bully': 0,
        'bully_assistant': 1,
        'non_aggressive_defender': 2,
        'non_aggressive_victim': 3,
        'passive_bystander': 4,
        'aggressive_defender': 5,
        'aggressive_victim': 6
    }
    role_labels = data['bullying_role'].map(role_map).fillna(unknown_label).astype(int)
    return role_labels.values
    
def save_basicml_metrics(y_true, y_pred, class_names, task_name, model_name, fold_num, auroc=None):
    """Compute single-label metrics, write them to a JSON file and return them.

    The file is written under RESULTS_DIR and named after the model, task and
    fold. `class_names` is accepted but not used by this function. `auroc` is
    stored under the 'auroc' key only when it is not None.
    """
    def _averaged(score_fn, average):
        # All averaged/per-class scores share the same zero-division policy.
        return score_fn(y_true, y_pred, average=average, zero_division=0)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'f1_macro': _averaged(f1_score, 'macro'),
        'f1_micro': _averaged(f1_score, 'micro'),
        'f1_weighted': _averaged(f1_score, 'weighted'),
        'f1_per_class': _averaged(f1_score, None).tolist(),
        'precision_weighted': _averaged(precision_score, 'weighted'),
        'recall_weighted': _averaged(recall_score, 'weighted'),
        'precision_per_class': _averaged(precision_score, None).tolist(),
        'recall_per_class': _averaged(recall_score, None).tolist(),
    }
    if auroc is not None:
        metrics['auroc'] = auroc

    out_path = f'{RESULTS_DIR}/{model_name}_{task_name}_metrics_fold_{fold_num}.json'
    with open(out_path, 'w') as handle:
        json.dump(metrics, handle, indent=2)
    return metrics
    
def save_multilabel_basicml_metrics(y_true, y_pred, topic_names, task_name, model_name, fold_num,
                                   f1_macro_labels=None, f1_micro=None,
                                   f1_weighted=None, auroc_macro=None, topic_metrics_list=None, overall_balanced_accuracy=None):
    """Bundle precomputed multi-label metrics, write them to JSON and return them.

    Nothing is computed here: the caller supplies every score. `y_true`,
    `y_pred` and `topic_names` are accepted but not used. 'auroc_macro' is
    included only when it is not None and 'per_topic_metrics' only when
    `topic_metrics_list` is non-empty. Values json cannot serialise natively
    are written via str().
    """
    metrics = {}
    metrics['overall_balanced_accuracy'] = overall_balanced_accuracy
    metrics['f1_macro_labels'] = f1_macro_labels
    metrics['f1_micro'] = f1_micro
    metrics['f1_weighted'] = f1_weighted
    if auroc_macro is not None:
        metrics['auroc_macro'] = auroc_macro
    if topic_metrics_list:
        metrics['per_topic_metrics'] = topic_metrics_list

    out_path = f'{RESULTS_DIR}/{model_name}_{task_name}_metrics_fold_{fold_num}.json'
    with open(out_path, 'w') as handle:
        json.dump(metrics, handle, indent=2, default=str)
    return metrics
    
def aggregate_fold_results(fold_results):
    """Summarise per-fold metric dicts as `<metric>_mean` / `<metric>_std`.

    Parameters
    ----------
    fold_results : list of dicts, one per fold, mapping metric name to value.

    Returns
    -------
    dict with a mean and a (population) standard deviation for every scalar
    metric, as plain Python floats. Returns {} for an empty input.

    Notes
    -----
    * Metric names are collected from every fold, not just the first one, so a
      metric that is missing from fold 1 (e.g. 'auroc' when it could not be
      computed there) is still aggregated over the folds that have it.
    * Per-class and per-topic entries are list-valued and are skipped.
    * Folds where a metric is absent or None are left out of that metric's
      mean/std.
    """
    if not fold_results:
        return {}

    non_scalar = {'per_topic_metrics', 'precision_per_class', 'recall_per_class', 'f1_per_class'}

    # Union of metric names across folds, in first-seen order.
    metric_names = []
    for fold in fold_results:
        for name in fold:
            if name not in non_scalar and name not in metric_names:
                metric_names.append(name)

    aggregated = {}
    for name in metric_names:
        values = [fold[name] for fold in fold_results if fold.get(name) is not None]
        if values:
            aggregated[f'{name}_mean'] = float(np.mean(values))
            aggregated[f'{name}_std'] = float(np.std(values))
    return aggregated

## Task and Model Configuration

In [4]:
# Registry of the classifiers to evaluate. Each entry carries:
#   classifier - the estimator object,
#   name       - label used in result keys and (lower-cased) in output file names,
#   enabled    - set to False to skip the model.
MODEL_CONFIG = {
    'LR': {
        'classifier': LogisticRegression(random_state=42, max_iter=1000),
        'name': 'LR',
        'enabled': True
    },
    'RF': {
        'classifier': RandomForestClassifier(n_estimators=100, random_state=42),
        'name': 'RF',
        'enabled': True
    },
    'SVM': {
        # Linear kernel, matching the SVC that the training loop actually
        # instantiates for the 'SVM' entry; previously this entry declared a
        # default-kernel SVC that was never the one being trained.
        'classifier': SVC(random_state=42, kernel='linear', probability=True),
        'name': 'SVM',
        'enabled': True
    },
    'NB': {
        'classifier': MultinomialNB(),
        'name': 'NB',
        'enabled': True
    }
}

# One entry per prediction task. `enabled` toggles the task, `metric_type`
# selects the single-label or multi-label evaluation path, and
# `class_names` / `topic_names` list the labels for that task.
TASK_CONFIG = {
    'binary': {
        'name': 'Binary Cyberbullying Classification',
        'task_key': 'binary',
        'enabled': True,
        'filter_cb_only': False,
        'class_names': [
            'Non-CB',
            'CB',
        ],
        'metric_type': 'single_label',
    },
    'severity': {
        'name': 'Severity Classification',
        'task_key': 'severity',
        'enabled': True,
        'filter_cb_only': True,
        'class_names': [
            'mild',
            'moderate',
            'severe',
        ],
        'metric_type': 'single_label',
    },
    'topic': {
        'name': 'Topic Classification (Multi-label)',
        'task_key': 'topic',
        'enabled': True,
        'filter_cb_only': True,
        'topic_names': [
            'has_race',
            'has_political',
            'has_intellectual',
            'has_physical',
            'has_social_status',
            'has_gender',
            'has_none',
            'has_religious',
            'has_disability',
            'has_sexual',
        ],
        'metric_type': 'multi_label',
        # Wrap the base estimator in MultiOutputClassifier for this task.
        'use_multioutput': True,
    },
    'role': {
        'name': 'Role Classification',
        'task_key': 'role',
        'enabled': True,
        'filter_cb_only': False,
        'class_names': [
            'bully',
            'bully_assistant',
            'non_aggressive_defender',
            'non_aggressive_victim',
            'passive_bystander',
            'aggressive_defender',
            'aggressive_victim',
        ],
        'metric_type': 'single_label',
    },
}

# Per-task store of fold metrics: all_results[task][model] is a list with one
# metrics dict per fold.
all_results = {key: defaultdict(list) for key in TASK_CONFIG}

In [None]:
# Cross-validated training and evaluation of every enabled model on every
# enabled task. Fold k uses partition k as the test set and the remaining
# partitions as the training set.


def _load_partitions():
    """Read each CSV partition once (one file per fold) and return them in order."""
    return [
        pd.read_csv(os.path.join(DATA_DIR, FILE_PATTERN.format(part_num=part_num)))
        for part_num in range(1, NUM_FOLDS + 1)
    ]


def _build_task_split(task_key, train_data, test_data,
                      X_train_base, X_test_base, y_train_base, y_test_base):
    """Return (X_train, X_test, y_train, y_test) for a task, or None to skip it.

    None is returned when the task has no usable rows in either split, or when
    `task_key` is not one of the known tasks.
    """
    if task_key == 'binary':
        return X_train_base, X_test_base, y_train_base, y_test_base

    if task_key == 'severity':
        severity_train = get_severity_labels(train_data).values
        severity_test = get_severity_labels(test_data).values
        # Keep cyberbullying rows that carry a mapped severity (label > 0).
        keep_train = (y_train_base == 1) & (severity_train > 0)
        keep_test = (y_test_base == 1) & (severity_test > 0)
        if not keep_train.any() or not keep_test.any():
            return None
        return (X_train_base[keep_train], X_test_base[keep_test],
                severity_train[keep_train], severity_test[keep_test])

    if task_key == 'topic':
        topic_train, _ = get_topic_labels_multilabel(train_data)
        topic_test, _ = get_topic_labels_multilabel(test_data)
        # Topics are only modelled on cyberbullying rows.
        cb_train = y_train_base == 1
        cb_test = y_test_base == 1
        if not cb_train.any() or not cb_test.any():
            return None
        return (X_train_base[cb_train], X_test_base[cb_test],
                topic_train[cb_train], topic_test[cb_test])

    if task_key == 'role':
        return (X_train_base, X_test_base,
                get_role_labels(train_data), get_role_labels(test_data))

    return None


def _single_label_auroc(clf, X_test, y_test):
    """Best-effort AUROC for a single-label model; None when it cannot be computed."""
    if not hasattr(clf, "predict_proba"):
        return None
    try:
        y_proba = clf.predict_proba(X_test)
        if len(np.unique(y_test)) == 2:
            return roc_auc_score(y_test, y_proba[:, 1])
        return roc_auc_score(y_test, y_proba, average='macro', multi_class='ovr')
    except Exception as exc:
        # AUROC is optional: report why it is missing instead of failing the fold.
        print(f"  AUROC unavailable: {exc}")
        return None


def _evaluate_multilabel(clf, X_test, y_test, y_pred, topic_names):
    """Compute per-topic and summary metrics for a multi-label model.

    Returns a dict whose keys match the keyword arguments of
    save_multilabel_basicml_metrics.
    """
    balanced_accuracies = []
    f1_scores_per_topic = []
    auroc_per_topic = []
    topic_metrics_list = []
    # Only a MultiOutputClassifier exposes one fitted estimator per topic.
    per_topic_estimators = isinstance(clf, MultiOutputClassifier)

    for i, topic_name in enumerate(topic_names):
        truth = y_test[:, i]
        predicted = y_pred[:, i]
        ba = balanced_accuracy_score(truth, predicted)
        f1 = f1_score(truth, predicted, average='binary', zero_division=0)
        p = precision_score(truth, predicted, average='binary', zero_division=0)
        r = recall_score(truth, predicted, average='binary', zero_division=0)
        balanced_accuracies.append(ba)
        f1_scores_per_topic.append(f1)

        auroc_topic = None
        try:
            scorer = clf.estimators_[i] if per_topic_estimators else clf
            if hasattr(scorer, "predict_proba"):
                if per_topic_estimators:
                    y_proba_topic = scorer.predict_proba(X_test)[:, 1]
                else:
                    y_proba_topic = clf.predict_proba(X_test)[:, i]
                auroc_topic = roc_auc_score(truth, y_proba_topic)
                auroc_per_topic.append(auroc_topic)
        except Exception as exc:
            # Catch Exception (not a bare except) so a kernel interrupt still
            # stops the run; the topic simply gets no AUROC.
            print(f"  AUROC unavailable for topic {topic_name}: {exc}")

        topic_metrics_list.append({
            'topic': topic_name,
            'f1': f1,
            'precision': p,
            'recall': r,
            'auroc': auroc_topic,
            'balanced_accuracy': ba
        })

    return {
        'f1_macro_labels': np.mean(f1_scores_per_topic),
        'f1_micro': f1_score(y_test, y_pred, average='micro', zero_division=0),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'auroc_macro': np.mean(auroc_per_topic) if len(auroc_per_topic) > 0 else None,
        'topic_metrics_list': topic_metrics_list,
        'overall_balanced_accuracy': np.mean(balanced_accuracies),
    }


# Start from a clean slate so re-running this cell does not append a second
# copy of every fold's metrics.
for per_model_results in all_results.values():
    per_model_results.clear()

# The partitions do not change between folds, so read them once.
partitions = _load_partitions()

for fold_num in range(1, NUM_FOLDS + 1):
    held_out = fold_num - 1
    test_data = partitions[held_out]
    train_data = pd.concat(
        [part for idx, part in enumerate(partitions) if idx != held_out],
        ignore_index=True,
    )

    # Fit TF-IDF on the training split only; the test split is transformed with
    # the fitted vocabulary so no test-set statistics leak into the features.
    X_train_base, vectorizer = get_text_features(train_data)
    test_comments = test_data['comment_content'].fillna('').astype(str)
    X_test_base = vectorizer.transform(test_comments)
    y_train_base = get_target(train_data).values
    y_test_base = get_target(test_data).values

    for task_key, task_config in TASK_CONFIG.items():
        if not task_config['enabled']:
            continue

        split = _build_task_split(task_key, train_data, test_data,
                                  X_train_base, X_test_base, y_train_base, y_test_base)
        if split is None:
            print(f"Fold {fold_num}: skipping task '{task_key}' (no usable train/test rows)")
            continue
        X_train, X_test, y_train, y_test = split
        metric_type = task_config['metric_type']

        for model_key, model_config in MODEL_CONFIG.items():
            if not model_config['enabled']:
                continue
            model_name = model_config['name']
            if model_key == 'SVM':
                # The SVM is always trained with a linear kernel, regardless of
                # the estimator stored in MODEL_CONFIG.
                clf = SVC(random_state=42, kernel='linear', probability=True)
            else:
                clf = model_config['classifier']
            if metric_type == 'multi_label' and task_config.get('use_multioutput', False):
                clf = MultiOutputClassifier(clf)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            if metric_type == 'single_label':
                metrics = save_basicml_metrics(
                    y_test, y_pred, task_config['class_names'],
                    f"{task_key}_classification", model_name.lower(), fold_num,
                    auroc=_single_label_auroc(clf, X_test, y_test)
                )
            elif metric_type == 'multi_label':
                summary = _evaluate_multilabel(
                    clf, X_test, y_test, y_pred, task_config['topic_names']
                )
                metrics = save_multilabel_basicml_metrics(
                    y_test, y_pred, task_config['topic_names'],
                    f"{task_key}_classification", model_name.lower(), fold_num,
                    **summary
                )
            else:
                continue
            all_results[task_key][model_name].append(metrics)
print("ALL FOLDS COMPLETED")

## Save Final Results

In [None]:
# Aggregate the per-fold metrics of every enabled task/model pair into
# mean/std summaries and write them to a single JSON file.

# `timestamp` was referenced here without ever being assigned, which raised a
# NameError on a fresh kernel; record the time the summary is produced.
timestamp = datetime.now().isoformat(timespec='seconds')

final_results = {}
for task_key, task_config in TASK_CONFIG.items():
    if not task_config['enabled']:
        continue
    task_summary = {}
    for model_key, model_config in MODEL_CONFIG.items():
        if not model_config['enabled']:
            continue
        model_name = model_config['name']
        fold_metrics = all_results[task_key][model_name]
        # Models with no recorded folds (e.g. the task was skipped) are omitted.
        if fold_metrics:
            task_summary[model_name] = aggregate_fold_results(fold_metrics)
    final_results[f"{task_key}_classification"] = task_summary

final_results['num_folds'] = NUM_FOLDS
final_results['timestamp'] = timestamp

output_path = f'{RESULTS_DIR}/aggregated_results.json'
with open(output_path, 'w') as f:
    json.dump(final_results, f, indent=2)
print(f"Results saved to: {output_path}")