In [64]:
import os
import numpy as np
import pandas as pd
import random
import ast 

In [79]:
# Load the annotation export that holds the tasks and our annotated harm labels
def load_dataset(annotations_file_path=None):
    """Read the annotation export into a DataFrame and drop bookkeeping columns.

    Parameters
    ----------
    annotations_file_path : str or path-like, optional
        JSON file to read. Defaults to
        ``../WildGuard_Dataset/our_annotations.json``.

    Returns
    -------
    pandas.DataFrame
        The parsed file without the bookkeeping columns listed below.
    """
    if annotations_file_path is None:
        annotations_file_path = os.path.join('..', 'WildGuard_Dataset', 'our_annotations.json')

    unused_columns = [
        'file_upload', 'drafts', 'predictions', 'meta', 'created_at',
        'updated_at', 'inner_id', 'cancelled_annotations', 'total_predictions',
        'comment_count', 'unresolved_comment_count', 'last_comment_updated_at',
        'project', 'updated_by', 'comment_authors',
    ]

    data = pd.read_json(annotations_file_path)
    # errors='ignore': an export that lacks some of these columns must not
    # abort the load with a KeyError.
    data = data.drop(columns=unused_columns, errors='ignore')

    return data

In [81]:
def get_harm_labels(data):
    """Flatten the per-task annotation lists into one row per annotation.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``id`` column and an ``annotations`` column holding a
        list of annotation dicts per task.

    Returns
    -------
    pandas.DataFrame
        Columns ``task_id``, ``annotation_id``, ``completed_by``,
        ``harm_label``, ``ground_truth`` and ``prediction``. ``harm_label`` is
        the first choice of the ``harmful_class`` result, or ``None`` when the
        annotation has no such result. The columns are present even when
        there are no annotations at all.
    """
    columns = ['task_id', 'annotation_id', 'completed_by',
               'harm_label', 'ground_truth', 'prediction']

    rows = []
    for _, task in data.iterrows():
        task_id = task['id']
        for ann in task['annotations']:
            harm_label = None
            for r in ann.get('result') or []:
                # Result items without a 'from_name' key are skipped rather
                # than raising a KeyError.
                if r.get('from_name') != 'harmful_class':
                    continue
                choices = (r.get('value') or {}).get('choices') or []
                if choices:
                    harm_label = choices[0]  # get the selected choice
            rows.append({
                'task_id': task_id,
                'annotation_id': ann['id'],
                'completed_by': ann['completed_by'],
                'harm_label': harm_label,
                'ground_truth': ann.get('ground_truth', None),
                'prediction': ann.get('prediction', None)
            })

    # Explicit columns keep the schema stable when `rows` is empty.
    df_harm = pd.DataFrame(rows, columns=columns)

    return df_harm

In [112]:
def _ground_truth_to_int(value):
    """Map a boolean-like value to 0/1; return None when it is missing or not boolean-like."""
    if isinstance(value, (bool, np.bool_)):
        return int(value)
    # 0/1 stored as numbers compare equal to False/True, so keep accepting them.
    if isinstance(value, (int, float, np.integer, np.floating)) and value in (0, 1):
        return int(value)
    return None


def _harm_label_to_int(label):
    """Map a label containing "Safe" to 0 and "Harmful" to 1; anything else to -1."""
    if not isinstance(label, str):
        # None or NaN (missing label): a substring test would raise TypeError.
        return -1
    if "Safe" in label:
        return 0
    if "Harmful" in label:
        return 1
    return -1


def convert_harm_to_ints(data):
    """Convert ground-truth flags and annotated harm labels to integer pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have ``ground_truth`` and ``harm_label`` columns.
        NOTE(review): confirm that ``ground_truth`` really carries the
        reference harm label for each annotation.

    Returns
    -------
    list of tuple(int, int)
        ``(true_label, pred_label)`` pairs with both values in {0, 1}. Rows
        are skipped when the harm label is missing or unrecognised, or when
        ``ground_truth`` is missing. Previously such a row silently reused
        the label of the preceding row, or raised on the first row.
    """
    harm_ints = []

    for _, row in data.iterrows():
        true_label = _ground_truth_to_int(row['ground_truth'])
        pred_label = _harm_label_to_int(row['harm_label'])

        if true_label is None or pred_label < 0:
            continue
        harm_ints.append((true_label, pred_label))

    return harm_ints

In [104]:
def get_phi_correlation(harms_list):
    """Compute, print and return the phi coefficient of binary label pairs.

    For two binary variables the phi coefficient equals the Pearson
    correlation, which is what is computed here.

    Parameters
    ----------
    harms_list : list of tuple(int, int)
        ``(true_harm, annotated_harm)`` pairs.

    Returns
    -------
    float
        The coefficient, or ``nan`` when it is undefined because either
        column has fewer than two distinct values (this includes empty input).
    """
    df = pd.DataFrame(harms_list, columns=['true_harm', 'annotated_harm'])

    # Pearson correlation divides by each column's standard deviation, so a
    # constant (or empty) column gives an undefined result. Say so explicitly
    # instead of only printing a bare nan.
    constant_cols = [col for col in df.columns if df[col].nunique() < 2]
    if constant_cols:
        phi_auto = float('nan')
        print("Phi correlation is undefined: no variation in", ", ".join(constant_cols))
    else:
        phi_auto = float(df.corr(method='pearson').iloc[0, 1])

    print("Phi correlation (via pandas):", phi_auto)
    return phi_auto

In [116]:
# Load the annotation export and flatten it to one row per annotation.
data = load_dataset()
data_harm = get_harm_labels(data)

# Build (true_label, pred_label) integer pairs; rows without a usable harm label are left out.
only_harm = convert_harm_to_ints(data_harm)
print(len(only_harm))

# NOTE(review): the recorded output of this cell is `nan`. Presumably one of the two
# label columns is constant across all pairs (zero variance) — verify what
# `ground_truth` contains before interpreting this result.
get_phi_correlation(only_harm)

1570
Phi correlation (via pandas): nan
