In [None]:
import json
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import hamming_loss, jaccard_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

# Output directory for the metrics JSON, confusion-matrix PNG and .npy files
# written by the save_* helpers; created up front if it does not exist yet.
results_dir = 'results/basicml'
os.makedirs(results_dir, exist_ok=True)

In [None]:
fold_num = 2

# 't'/'f' flags are mapped to 1/0 in every column EXCEPT the free-text comment
# column. A blanket DataFrame.replace would also rewrite a comment whose whole
# text is "t" or "f", silently changing the input to the TF-IDF vectorizer.
_FLAG_MAP = {'f': 0, 't': 1}
_TEXT_COLUMN = 'c_comment_content'


def _load_split(path):
    """Read one split CSV and convert its 't'/'f' flags to 1/0, leaving comment text untouched."""
    frame = pd.read_csv(path)
    flag_columns = [col for col in frame.columns if col != _TEXT_COLUMN]
    # Nested-dict form of replace: apply the mapping column by column.
    return frame.replace({col: _FLAG_MAP for col in flag_columns})


train_data = _load_split(f'data_splits/splits/fold_{fold_num}_train.csv')
test_data = _load_split(f'data_splits/splits/fold_{fold_num}_test.csv')

# Train rows come first, test rows second; the two boolean masks below encode
# that layout so later cells can slice the combined frame and feature matrix.
data = pd.concat([train_data, test_data], ignore_index=True)
train_idx = pd.Series([True] * len(train_data) + [False] * len(test_data))
test_idx = pd.Series([False] * len(train_data) + [True] * len(test_data))

print(f"Using fold {fold_num} - Train: {len(train_data)}, Test: {len(test_data)}, Total: {len(data)}")
data.shape

In [None]:
def get_target(data):
    """Return the `c_cyberbullying_majority` column of `data` cast to integers."""
    label_column = 'c_cyberbullying_majority'
    labels = data[label_column]
    return labels.astype(int)

def get_text_features(data):
    """Fit a TF-IDF vectorizer on the comment text in `data`.

    Missing comments are treated as empty strings. The vectorizer keeps at most
    1000 terms and drops English stop words.

    Returns:
        (features, vectorizer): the TF-IDF matrix for the rows of `data` and the
        fitted vectorizer that produced it.
    """
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    cleaned_text = data['c_comment_content'].fillna('').astype(str)
    return vectorizer.fit_transform(cleaned_text), vectorizer

def create_severity_labels(data):
    """Derive one severity label per row from the severity vote counts.

    Labels: 0 = none, 1 = mild, 2 = moderate, 3 = severe. A row gets 0 when it is
    not marked as cyberbullying (`c_cyberbullying_majority != 1`) or when its
    severity counts sum to zero; otherwise it gets the severity with the highest
    count (ties resolve to the milder level, i.e. the first column).

    The computation is purely positional, so it works for any DataFrame index,
    not only a default RangeIndex.

    Args:
        data: DataFrame containing `c_cyberbullying_majority` and the three
            `c_severity_*_count` columns.

    Returns:
        Integer numpy array of length `len(data)`, in row order.
    """
    severity_cols = ['c_severity_mild_count', 'c_severity_moderate_count', 'c_severity_severe_count']

    # Missing counts are treated as zero votes.
    counts = data[severity_cols].fillna(0).to_numpy(dtype=float)
    is_cb = (data['c_cyberbullying_majority'] == 1).to_numpy(dtype=bool)

    # Only cyberbullying rows with a non-zero vote total receive a severity level.
    rated = is_cb & (counts.sum(axis=1) != 0)

    severity_labels = np.zeros(len(data), dtype=int)
    severity_labels[rated] = counts[rated].argmax(axis=1) + 1
    return severity_labels

def save_basicml_metrics(y_true, y_pred, y_proba, class_names, task_name, model_name):
    """Compute single-label classification metrics and write them to `results_dir`.

    Three files are written, all named from `model_name`, `task_name`, a
    timestamp and the module-level `fold_num`: a metrics JSON, a
    confusion-matrix heatmap (PNG) and the raw confusion matrix (`.npy`).

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_proba: 2-D array of per-class probabilities, or None to skip AUROC.
        class_names: Display names of the classes. Every caller in this notebook
            passes them in integer-label order (class_names[i] names label i);
            the confusion matrix relies on that to keep one row/column per
            class even when a class is absent from this fold.
        task_name: Task identifier used in file names and the plot title.
        model_name: Model identifier used in file names and the plot title.

    Returns:
        dict with accuracy, balanced accuracy, macro/weighted/per-class
        precision, recall and F1, plus `auroc` (two-class tasks) or
        `auroc_macro`/`auroc_weighted` (other tasks). AUROC entries are None
        when they cannot be computed.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    metrics = {}

    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)

    metrics['precision_macro'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['recall_macro'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)

    metrics['precision_weighted'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['recall_weighted'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    metrics['precision_per_class'] = precision_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['recall_per_class'] = recall_score(y_true, y_pred, average=None, zero_division=0).tolist()
    metrics['f1_per_class'] = f1_score(y_true, y_pred, average=None, zero_division=0).tolist()

    has_proba = y_proba is not None and len(y_proba.shape) > 1

    # Decide binary vs. multiclass from the task definition, not from which labels
    # happen to occur in this fold: a multiclass task with only two observed
    # classes must not be scored against probability column 1.
    if len(class_names) == 2:
        metrics['auroc'] = None
        if has_proba and y_proba.shape[1] >= 2:
            try:
                metrics['auroc'] = roc_auc_score(y_true, y_proba[:, 1])
            except Exception as err:
                # Best effort: keep the run going, but say why AUROC is missing.
                print(f"AUROC unavailable for {model_name} {task_name}: {err}")
    else:
        metrics['auroc_macro'] = None
        metrics['auroc_weighted'] = None
        if has_proba:
            try:
                auroc_macro = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
                auroc_weighted = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
            except Exception as err:
                # Typically a class missing from y_true or a column-count mismatch.
                print(f"AUROC unavailable for {model_name} {task_name}: {err}")
            else:
                metrics['auroc_macro'] = auroc_macro
                metrics['auroc_weighted'] = auroc_weighted

    # Pin the matrix to one row/column per named class so it stays aligned with
    # class_names when a class is absent from both y_true and y_pred. Fall back
    # to sklearn's inferred labels if the data holds labels outside 0..n-1.
    expected_labels = np.arange(len(class_names))
    observed_labels = np.union1d(y_true, y_pred)
    cm_labels = expected_labels if np.isin(observed_labels, expected_labels).all() else None
    cm = confusion_matrix(y_true, y_pred, labels=cm_labels)

    # Only attach the names when they match the matrix size; otherwise let
    # seaborn pick tick labels instead of failing on a length mismatch.
    tick_labels = class_names if cm.shape[0] == len(class_names) else 'auto'

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_title(f'{model_name.upper()} - {task_name} - Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig(f'{results_dir}/{model_name}_{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    with open(f'{results_dir}/{model_name}_{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
        json.dump(metrics, f, indent=2)

    np.save(f'{results_dir}/{model_name}_{task_name}_confusion_matrix_{timestamp}_fold_{fold_num}.npy', cm)

    return metrics

def save_multilabel_basicml_metrics(y_true, y_pred, y_proba, topic_names, task_name, model_name):
    """Compute per-topic and macro metrics for a multi-label task and save them as JSON.

    Topics with no positive example in `y_true` are skipped. Macro scores are
    plain means over the remaining topics. Nothing is written, and an empty
    dict is returned, when every topic is skipped.

    Args:
        y_true: 2-D indicator array, one column per entry of `topic_names`.
        y_pred: 2-D indicator array of predictions, same layout as `y_true`.
        y_proba: 2-D array of positive-class scores per topic, or None to skip AUROC.
        topic_names: Names of the label columns, in column order.
        task_name: Task identifier used in the output file name.
        model_name: Model identifier used in the output file name.

    Returns:
        dict with `subset_accuracy`, `macro_precision`, `macro_recall`,
        `macro_f1`, optionally `macro_auroc`, and `per_topic_metrics`.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    metrics = {}
    has_proba = y_proba is not None and len(y_proba.shape) > 1

    topic_metrics = []
    for i, topic_name in enumerate(topic_names):
        positives = y_true[:, i].sum()
        if not positives > 0:
            continue

        p, r, f1, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred[:, i], average='binary', zero_division=0
        )

        auroc = None
        if has_proba:
            try:
                auroc = float(roc_auc_score(y_true[:, i], y_proba[:, i]))
            except Exception as err:
                # Best effort: e.g. every test row is positive for this topic.
                print(f"AUROC unavailable for {model_name} {task_name} topic '{topic_name}': {err}")

        # Store plain Python numbers so the JSON holds real numbers; a numpy
        # integer would otherwise go through `default=str` and be written as text.
        topic_metrics.append({
            'topic': topic_name,
            'precision': float(p),
            'recall': float(r),
            'f1': float(f1),
            'auroc': auroc,
            'support': int(positives)
        })

    subset_accuracy = float(np.mean(np.all(y_true == y_pred, axis=1)))

    if topic_metrics:
        precisions = [m['precision'] for m in topic_metrics]
        recalls = [m['recall'] for m in topic_metrics]
        f1s = [m['f1'] for m in topic_metrics]
        aurocs = [m['auroc'] for m in topic_metrics if m['auroc'] is not None]

        metrics['subset_accuracy'] = subset_accuracy
        metrics['macro_precision'] = float(np.mean(precisions))
        metrics['macro_recall'] = float(np.mean(recalls))
        metrics['macro_f1'] = float(np.mean(f1s))
        if aurocs:
            metrics['macro_auroc'] = float(np.mean(aurocs))

        metrics['per_topic_metrics'] = topic_metrics

        with open(f'{results_dir}/{model_name}_{task_name}_metrics_{timestamp}_fold_{fold_num}.json', 'w') as f:
            json.dump(metrics, f, indent=2, default=str)

    return metrics

In [None]:
target = get_target(data)

# Fit the TF-IDF vocabulary and IDF weights on the training rows only, then
# apply that fitted vectorizer to every row. Fitting on train + test together
# would leak test-set term statistics into the features used for evaluation.
train_text_features, vec = get_text_features(data[train_idx.values])

# Same text cleaning as get_text_features; row i of text_features is row i of `data`,
# so the boolean masks built from train_idx / test_idx keep working downstream.
all_comments = data['c_comment_content'].fillna('').astype(str)
text_features = vec.transform(all_comments)

X_train = text_features[train_idx.values]
X_test = text_features[test_idx.values]
y_train = target[train_idx].values
y_test = target[test_idx].values

f"Data: {data.shape}, Train: {train_idx.sum()}, Test: {test_idx.sum()}"

In [None]:
# Baseline models shared by the task cells; later cells re-fit these same
# instances on their own labels (except SVM, which they replace with a linear kernel).
classifiers = dict(
    LR=LogisticRegression(random_state=42, max_iter=1000),
    RF=RandomForestClassifier(n_estimators=100, random_state=42),
    SVM=SVC(random_state=42, probability=True),
    NB=MultinomialNB(),
)

print("BINARY CYBERBULLYING CLASSIFICATION:")
print("=" * 50)

binary_class_names = ['Non-CB', 'CB']
binary_results = []
for name, clf in classifiers.items():
    pred = clf.fit(X_train, y_train).predict(X_test)

    # Probabilities are only needed for AUROC; models without them skip it.
    pred_proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None

    acc = accuracy_score(y_test, pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary')

    binary_results.append(dict(Model=name, Acc=acc, F1=f1))

    metrics = save_basicml_metrics(
        y_test,
        pred,
        pred_proba,
        binary_class_names,
        'binary_classification',
        name.lower(),
    )

    print(f"\n{name}: Acc={acc:.3f}, F1={f1:.3f}")
    if metrics.get('auroc'):
        print(f"      AUROC={metrics['auroc']:.3f}")
    print(classification_report(y_test, pred, target_names=binary_class_names))

In [None]:
# Build severity labels for every row and print how many rows fall in each level.
severity_labels = create_severity_labels(data)

severity_names = dict(enumerate(['none', 'mild', 'moderate', 'severe']))
for level in severity_names:
    name = severity_names[level]
    count = np.count_nonzero(severity_labels == level)
    print(f"  {level} ({name}): {count}")

In [None]:
# Severity classification (mild / moderate / severe) on cyberbullying comments only.
cb_mask = (target == 1).values
cb_train_idx = train_idx.values & cb_mask
cb_test_idx = test_idx.values & cb_mask

if not (cb_train_idx.sum() > 0 and cb_test_idx.sum() > 0):
    # No cyberbullying comments in one of the splits: nothing to train or evaluate.
    severity_results = []
else:
    # Shift labels so mild/moderate/severe become 0/1/2; level 0 ("none")
    # becomes -1 and is filtered out by the masks below.
    sev_train_cb = severity_labels[cb_train_idx] - 1
    sev_test_cb = severity_labels[cb_test_idx] - 1

    valid_train_mask = sev_train_cb >= 0
    valid_test_mask = sev_test_cb >= 0

    X_train_sev = text_features[cb_train_idx][valid_train_mask]
    y_train_sev = sev_train_cb[valid_train_mask]
    X_test_sev = text_features[cb_test_idx][valid_test_mask]
    y_test_sev = sev_test_cb[valid_test_mask]

    severity_names_fixed = ['mild', 'moderate', 'severe']

    for i, name in enumerate(severity_names_fixed):
        train_count = (y_train_sev == i).sum()
        test_count = (y_test_sev == i).sum()
        print(f"  {i} ({name}): Train={train_count}, Test={test_count}")

    severity_results = []
    for name in classifiers:
        # This task uses a linear-kernel SVC rather than the instance stored in `classifiers`.
        clf = SVC(random_state=42, kernel='linear', probability=True) if name == 'SVM' else classifiers[name]

        pred_s = clf.fit(X_train_sev, y_train_sev).predict(X_test_sev)
        pred_proba_s = clf.predict_proba(X_test_sev) if hasattr(clf, 'predict_proba') else None

        acc_s = accuracy_score(y_test_sev, pred_s)
        p_s, r_s, f1_s, _ = precision_recall_fscore_support(y_test_sev, pred_s, average='weighted')

        severity_results.append(dict(Model=name, Acc=acc_s, F1=f1_s))

        metrics_s = save_basicml_metrics(
            y_test_sev,
            pred_s,
            pred_proba_s,
            severity_names_fixed,
            'severity_classification',
            name.lower(),
        )

        # One output line per model; the AUROC suffix is added only when available.
        print(f"{name}: Acc={acc_s:.3f}, F1={f1_s:.3f}", end="")
        if not metrics_s.get('auroc_weighted'):
            print()
        else:
            print(f", AUROC={metrics_s['auroc_weighted']:.3f}")

In [None]:
def create_topic_labels_multilabel(data):
    """Build the multi-label topic indicator matrix.

    Missing topic flags count as 0, and every row that is not marked as
    cyberbullying (`c_cyberbullying_majority != 1`) has all its topics zeroed.

    Returns:
        (labels, topic_cols): an integer array with one row per row of `data`
        and one column per topic, plus the list of source column names in
        column order.
    """
    topic_cols = [
        'c_topic_disability_majority',
        'c_topic_gender_majority',
        'c_topic_intellectual_majority',
        'c_topic_other_majority',
        'c_topic_physical_majority',
        'c_topic_political_majority',
        'c_topic_race_majority',
        'c_topic_religious_majority',
        'c_topic_sexual_majority',
        'c_topic_social_status_majority',
    ]

    # Work on a private array copy so the caller's frame is never modified.
    flags = data[topic_cols].fillna(0).astype(int).to_numpy().copy()

    not_cb = ~(data['c_cyberbullying_majority'] == 1).to_numpy()
    flags[not_cb] = 0

    return flags, topic_cols

# Build the topic matrix and print per-topic counts among cyberbullying comments.
topic_labels_ml, topic_cols = create_topic_labels_multilabel(data)
topic_names = [
    'disability', 'gender', 'intellectual', 'other', 'physical',
    'political', 'race', 'religious', 'sexual', 'social_status',
]

cb_mask = (target == 1).values
cb_topic_labels = topic_labels_ml[cb_mask]

i = 0
while i < len(topic_names):
    name = topic_names[i]
    count = cb_topic_labels[:, i].sum()
    print(f"  {name}: {count}")
    i += 1

print(f"Total CB: {cb_mask.sum()}, With topics: {(cb_topic_labels.sum(axis=1) > 0).sum()}, Multi-topic: {(cb_topic_labels.sum(axis=1) > 1).sum()}")

In [None]:
# Multi-label topic classification, restricted to comments labelled as cyberbullying.
# MultiOutputClassifier, hamming_loss and jaccard_score come from the import cell.
cb_mask = (target == 1).values
cb_train_idx = train_idx.values & cb_mask
cb_test_idx = test_idx.values & cb_mask

# Stays empty when either split has no cyberbullying comments; the summary cell
# skips the topic line in that case.
topic_results = []

if cb_train_idx.sum() > 0 and cb_test_idx.sum() > 0:
    X_train_topic = text_features[cb_train_idx]
    X_test_topic = text_features[cb_test_idx]
    y_train_topic = topic_labels_ml[cb_train_idx]
    y_test_topic = topic_labels_ml[cb_test_idx]

    print(f"Training: {X_train_topic.shape[0]} CB comments, Testing: {X_test_topic.shape[0]} CB comments")

    for name in classifiers:
        # This task uses a linear-kernel SVC rather than the instance stored in `classifiers`.
        if name == 'SVM':
            base_clf = SVC(random_state=42, kernel='linear', probability=True)
        else:
            base_clf = classifiers[name]

        # One independent binary classifier per topic column.
        clf = MultiOutputClassifier(base_clf)
        clf.fit(X_train_topic, y_train_topic)
        pred_t = clf.predict(X_test_topic)

        # Collect the positive-class probability of every topic into one
        # (n_samples, n_topics) array; AUROC is skipped if that is not possible.
        pred_proba_t = None
        if hasattr(base_clf, 'predict_proba'):
            try:
                per_topic_proba = clf.predict_proba(X_test_topic)
                pred_proba_t = np.column_stack([proba[:, 1] for proba in per_topic_proba])
            except Exception as err:
                # Best effort: report the reason instead of silently dropping AUROC.
                print(f"{name}: topic probabilities unavailable ({err})")
                pred_proba_t = None

        subset_accuracy = (pred_t == y_test_topic).all(axis=1).mean()
        hamming_loss_score = hamming_loss(y_test_topic, pred_t)
        jaccard_avg = jaccard_score(y_test_topic, pred_t, average='samples', zero_division=0)

        # Per-topic and macro scores are computed (and saved) in one place; the
        # helper skips topics without positive test examples and returns an
        # empty dict when no topic qualifies, hence the 0.0 default.
        metrics_t = save_multilabel_basicml_metrics(
            y_test_topic, pred_t, pred_proba_t,
            topic_names,
            'topic_classification',
            name.lower()
        )
        macro_f1 = metrics_t.get('macro_f1', 0.0)

        topic_results.append({
            'Model': name,
            'Subset_Acc': subset_accuracy,
            'Macro_F1': macro_f1,
            'Jaccard': jaccard_avg,
            'Hamming_Loss': hamming_loss_score
        })

        print(f"{name}: SubsetAcc={subset_accuracy:.3f}, MacroF1={macro_f1:.3f}, Jaccard={jaccard_avg:.3f}")

In [None]:
def _dominant_role(counts, offset):
    """Return, per row, `offset` + position of the largest count (`offset` alone when the row sums to <= 0)."""
    values = counts.fillna(0).to_numpy(dtype=float)
    labels = np.full(len(values), offset, dtype=int)
    voted = values.sum(axis=1) > 0
    labels[voted] = values[voted].argmax(axis=1) + offset
    return labels


def create_role_labels(data):
    """Assign one participant-role label per row from the role vote counts.

    Cyberbullying rows (`c_cyberbullying_majority == 1`) get a label in 0-3,
    the position of the largest count among the four CB role columns
    (bully, bully assistant, aggressive victim, aggressive defender).
    All other rows get a label in 4-6, from the three non-CB role columns
    (passive bystander, non-aggressive victim, non-aggressive defender).
    Rows without any votes in their group fall back to the group's first role
    (0 or 4); ties resolve to the first column. Missing counts are treated as 0.

    The computation is positional, so it works for any DataFrame index.

    Returns:
        Integer numpy array of length `len(data)`, in row order.
    """
    cb_roles = ['c_role_bully_count', 'c_role_cb__bully_assistant_count',
                'c_role_cb_aggressive_victim_role_count', 'c_role_cb_aggressive_defender_count']
    noncb_roles = ['c_role_noncb_passive_bystander_count', 'c_role_noncb_non_aggressive_victim_count',
                   'c_role_noncb_non_aggressive_defender_count']

    is_cb = (data['c_cyberbullying_majority'] == 1).to_numpy(dtype=bool)

    # Non-CB labels start right after the CB ones (offset 4).
    cb_labels = _dominant_role(data[cb_roles], offset=0)
    noncb_labels = _dominant_role(data[noncb_roles], offset=len(cb_roles))

    return np.where(is_cb, cb_labels, noncb_labels)

# Build role labels for every row and print how many rows fall in each role.
role_labels = create_role_labels(data)
role_names = dict(enumerate([
    'bully', 'bully_assistant', 'aggressive_victim', 'aggressive_defender',
    'passive_bystander', 'non_aggressive_victim', 'non_aggressive_defender',
]))

for role_id in role_names:
    name = role_names[role_id]
    count = np.count_nonzero(role_labels == role_id)
    print(f"  {role_id} ({name}): {count}")

In [None]:
# Seven-way role classification on all comments, using the full train/test split.
role_train = role_labels[train_idx.values]
role_test = role_labels[test_idx.values]

role_results = []
for name in classifiers:
    # This task uses a linear-kernel SVC rather than the instance stored in `classifiers`.
    clf = SVC(random_state=42, kernel='linear', probability=True) if name == 'SVM' else classifiers[name]

    pred_r = clf.fit(X_train, role_train).predict(X_test)
    pred_proba_r = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None

    acc_r = accuracy_score(role_test, pred_r)
    p_r, r_r, f1_r, _ = precision_recall_fscore_support(role_test, pred_r, average='weighted', zero_division=0)

    role_results.append(dict(Model=name, Acc=acc_r, F1=f1_r))

    # Display names in label order 0..6, used for the confusion-matrix axes.
    role_names_list = [
        'bully', 'bully_assistant', 'aggressive_victim', 'aggressive_defender',
        'passive_bystander', 'non_aggressive_victim', 'non_aggressive_defender',
    ]

    metrics_r = save_basicml_metrics(
        role_test,
        pred_r,
        pred_proba_r,
        role_names_list,
        'role_classification',
        name.lower(),
    )

    # One output line per model; the AUROC suffix is added only when available.
    print(f"{name}: Acc={acc_r:.3f}, F1={f1_r:.3f}", end="")
    if not metrics_r.get('auroc_weighted'):
        print()
    else:
        print(f", AUROC={metrics_r['auroc_weighted']:.3f}")

In [None]:
# Rank TF-IDF terms by random-forest importance for the binary task.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

importance_scores = rf_model.feature_importances_
feature_names = vec.get_feature_names_out()

# argsort is ascending, so the last 20 positions are the most important terms.
top_indices = np.argsort(importance_scores)[-20:]
top_features = []
for idx in top_indices:
    top_features.append((feature_names[idx], importance_scores[idx]))

print("Top 20 Features for Cyberbullying Detection:")
# Walk the list backwards to print from most to least important.
for i, (feature, importance) in enumerate(top_features[::-1], start=1):
    print(f"{i:2d}. {feature:20s}: {importance:.4f}")

In [None]:
def _best_by(results, score_column):
    """Turn a list of result dicts into a frame and return (frame, row with the highest `score_column`)."""
    frame = pd.DataFrame(results)
    best_position = frame[score_column].idxmax()
    return frame, frame.loc[best_position]


# Best model per task; severity and topic are skipped when their cells produced no results.
results_df, best_binary = _best_by(binary_results, 'F1')
print(f"Binary Best: {best_binary['Model']} (F1={best_binary['F1']:.3f}, Acc={best_binary['Acc']:.3f})")

if severity_results:
    severity_df, best_severity = _best_by(severity_results, 'F1')
    print(f"Severity Best: {best_severity['Model']} (F1={best_severity['F1']:.3f}, Acc={best_severity['Acc']:.3f})")

if topic_results:
    topic_df, best_topic = _best_by(topic_results, 'Macro_F1')
    print(f"Topic Best: {best_topic['Model']} (MacroF1={best_topic['Macro_F1']:.3f}, SubsetAcc={best_topic['Subset_Acc']:.3f})")

role_df, best_role = _best_by(role_results, 'F1')
print(f"Role Best: {best_role['Model']} (F1={best_role['F1']:.3f}, Acc={best_role['Acc']:.3f})")

# Dataset overview.
print(f"Dataset: {len(data)} samples, Train/Test: {train_idx.sum()}/{test_idx.sum()}")
print(f"CB prevalence: {(target==1).mean():.1%}, Sessions: {data['s_unit_id'].nunique()}")

cb_mask = (target == 1).values
cb_train = np.count_nonzero(train_idx.values & cb_mask)
cb_test = np.count_nonzero(test_idx.values & cb_mask)
print(f"CB Train/Test: {cb_train}/{cb_test}")