In [2]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix

In [3]:
# Load both annotators' exports of the first labeled sample. The files are
# JSON Lines: one JSON document per line.
# The encoding is pinned to UTF-8 so any non-ASCII text decodes the same way
# on every platform instead of depending on the locale default, and blank
# lines (e.g. a trailing empty line) are skipped rather than crashing
# json.loads.
with open("../data/output/labeled_sample_adam.jsonl", "r", encoding="utf-8") as file:
    adas = [json.loads(line) for line in file if line.strip()]

with open("../data/output/labeled_sample_jan.jsonl", "r", encoding="utf-8") as file:
    janeczek = [json.loads(line) for line in file if line.strip()]

In [20]:
# Normalises category names to one canonical English name. Polish names are
# translated; the English names map to themselves so that labels which are
# already canonical pass through unchanged (which also makes applying the
# mapping twice harmless).
# NOTE(review): "Strenghtening" is a misspelling of "Strengthening". It is a
# runtime string, so it is left untouched here; correct it only together with
# every place that compares against it.
mapping = {
    "Mowa nienawiści": "Hate",
    "Neutralny": "Neutral",
    "Wzmacnianie": "Strenghtening",
    "Osłabianie": "Weakening",
    "Neutral": "Neutral",
    "Hate": "Hate",
    "Strenghtening": "Strenghtening",
    "Weakening": "Weakening",
    "Odwracanie": "Reversing",
    "Reversing": "Reversing",
}

In [5]:

def cohen_kappa(confusion_matrix):
    """
    Calculate Cohen's Kappa from a square agreement (confusion) matrix.

    Parameters:
    confusion_matrix (array-like): A k x k table of counts where entry
        [i, j] is the number of items rater 1 put in category i and rater 2
        put in category j. Numpy arrays and nested lists are both accepted.
        In the binary case this is the usual layout:
        confusion_matrix[0, 0] = a (both raters said Yes)
        confusion_matrix[0, 1] = b (rater 1 said Yes, rater 2 said No)
        confusion_matrix[1, 0] = c (rater 1 said No, rater 2 said Yes)
        confusion_matrix[1, 1] = d (both raters said No)

    Returns:
    float: Cohen's Kappa value. NaN is returned when kappa is undefined:
        the table holds no observations, or chance agreement is exactly 1
        (both raters used one and the same category for every item).

    Raises:
    ValueError: If the input is not a two-dimensional square table.
    """
    # Float conversion keeps the arithmetic exact for realistic counts and
    # lets plain nested lists be used as well as numpy arrays.
    table = np.asarray(confusion_matrix, dtype=float)
    if table.ndim != 2 or table.shape[0] != table.shape[1]:
        raise ValueError(
            "confusion_matrix must be a square 2-D table, got shape "
            f"{table.shape}"
        )

    total = table.sum()
    if total == 0:
        # No rated items: observed agreement would be 0/0.
        return float("nan")

    # Observed agreement: share of items on the diagonal.
    p_o = np.trace(table) / total
    # Chance agreement: per category, (rater-1 marginal) * (rater-2 marginal).
    # For a 2x2 table this is ((a+b)(a+c) + (c+d)(b+d)) / N^2.
    p_e = (table.sum(axis=1) * table.sum(axis=0)).sum() / (total ** 2)
    if p_e == 1:
        # Both raters used a single shared category; kappa would be 0/0.
        return float("nan")

    return float((p_o - p_e) / (1 - p_e))


In [6]:
def fix_labels_adam(data, label_mapping=None):
    """
    Normalise and merge the label spans of every annotation record, in place.

    Each record's ``'label'`` value is treated as a list of three-element
    lists. Index 0 is the sort key and index 1 is extended with ``max`` when
    spans are merged (presumably span start and end offsets - confirm against
    the export format); index 2 is the category name.

    For every record the spans are sorted by index 0, each category name is
    translated through ``label_mapping``, and every run of consecutive spans
    with the same translated category is collapsed into the first span of the
    run. Note that merging looks only at the category, not at whether the
    spans touch or overlap.

    Categories are translated *before* merging so that two spelling variants
    of one category (e.g. a Polish and an English name) are merged as well.

    Parameters:
    data (list of dict): Annotation records; modified in place.
    label_mapping (dict, optional): Raw category name -> canonical name.
        Defaults to the notebook-level ``mapping``.

    Returns:
    list of dict: The same ``data`` object that was passed in.

    Raises:
    KeyError: If a span's category name is missing from the mapping.
    """
    if label_mapping is None:
        # Resolved at call time, so the notebook-level dict only has to exist
        # before the first call, not before this definition.
        label_mapping = mapping

    for anno in data:
        anno['label'].sort(key=lambda span: span[0])

        merged_labels = []
        for label in anno['label']:
            label[2] = label_mapping[label[2]]
            if merged_labels and merged_labels[-1][2] == label[2]:
                # Same category as the previous span: extend it, don't add.
                previous = merged_labels[-1]
                previous[1] = max(previous[1], label[1])
            else:
                merged_labels.append(label)

        anno['label'] = merged_labels

    return data

In [7]:
# Merge and normalise Adam's label spans. fix_labels_adam edits the records in
# place and returns the same list, so the re-binding keeps the name unchanged.
adas = fix_labels_adam(adas)

In [8]:
# One category per record for each annotator: the category of the first span
# in Adam's records and of the first entity in Jan's records (the two exports
# use different layouts). A record without any span/entity raises IndexError.
adam_labels = [
    record["label"][0][2]
    for record in adas
]
janek_labels = [
    record["entities"][0]["label"]
    for record in janeczek
]

In [9]:
# Agreement counts keyed by (Adam's label, Jan's label). Only the four
# Neutral/Hate combinations exist, so any other category raises KeyError in
# the counting loop below.
results = {
    (adam, janek): 0
    for janek in ("Neutral", "Hate")
    for adam in ("Neutral", "Hate")
}

# Records are paired by position; zip stops at the shorter list.
for adam, janek in zip(adam_labels, janek_labels):
    results[adam, janek] += 1

In [10]:
# Display the agreement counts; keys are (Adam's label, Jan's label).
results

{('Neutral', 'Neutral'): 89,
 ('Hate', 'Neutral'): 4,
 ('Neutral', 'Hate'): 1,
 ('Hate', 'Hate'): 6}

In [11]:
# Build the 2x2 agreement table from the counts in `results` instead of
# retyping the displayed numbers, so the matrix cannot go stale when the input
# files change. The cell order is the same as in the hand-written literal:
# [[Neutral/Neutral, Hate/Neutral], [Neutral/Hate, Hate/Hate]], with each key
# read as (Adam's label, Jan's label).
matrix = np.array([
    [results[("Neutral", "Neutral")], results[("Hate", "Neutral")]],
    [results[("Neutral", "Hate")], results[("Hate", "Hate")]],
])

In [12]:
# Inter-annotator agreement (Cohen's kappa) for the 2x2 table in `matrix`.
cohen_kappa(matrix)

0.6794871794871793

In [24]:
# Load both annotators' exports of the second labeled sample (JSON Lines: one
# JSON document per line). This re-binds `adas` and `janeczek`.
# The encoding is pinned to UTF-8 so any non-ASCII text decodes the same way
# on every platform instead of depending on the locale default, and blank
# lines (e.g. a trailing empty line) are skipped rather than crashing
# json.loads.
with open("../data/output/labeled_sample_adam_2.jsonl", "r", encoding="utf-8") as file:
    adas = [json.loads(line) for line in file if line.strip()]

with open("../data/output/labeled_sample_jan_2_fix.jsonl", "r", encoding="utf-8") as file:
    janeczek = [json.loads(line) for line in file if line.strip()]

In [40]:
# Second sample: normalise Adam's records, then take the category of the
# first span from each annotator's records (both exports use the 'label'
# layout here).
adas = fix_labels_adam(adas)
adam_labels = [record["label"][0][2] for record in adas]
janek_labels = [record["label"][0][2] for record in janeczek]

# NOTE(review): manual override of Jan's label at index 27 with no recorded
# reason - document why, or correct the source file instead.
janek_labels[27] = "Neutralny"

# Agreement counts keyed by (Adam's label, Jan's label); only the four
# Neutral/Hate combinations exist, anything else raises KeyError below.
results = {
    (adam, janek): 0
    for janek in ("Neutral", "Hate")
    for adam in ("Neutral", "Hate")
}

# Jan's labels were not passed through fix_labels_adam, so they are
# translated with `mapping` at count time.
for adam, janek in zip(adam_labels, janek_labels):
    results[adam, mapping[janek]] += 1
results

{('Neutral', 'Neutral'): 86,
 ('Hate', 'Neutral'): 1,
 ('Neutral', 'Hate'): 3,
 ('Hate', 'Hate'): 10}

In [43]:
# Build the 2x2 agreement table from the counts in `results` instead of
# retyping the displayed numbers, so the matrix cannot go stale when the input
# files change. The cell order is the same as in the hand-written literal:
# [[Neutral/Neutral, Hate/Neutral], [Neutral/Hate, Hate/Hate]], with each key
# read as (Adam's label, Jan's label).
matrix = np.array([
    [results[("Neutral", "Neutral")], results[("Hate", "Neutral")]],
    [results[("Neutral", "Hate")], results[("Hate", "Hate")]],
])
cohen_kappa(matrix)

0.8107852412488172