In [1]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix

In [2]:
with open("../data/output/labeled_sample_adam.jsonl", "r") as file:
    adas = [json.loads(line) for line in file]

with open("../data/output/labeled_sample_jan.jsonl", "r") as file:
    janeczek = [json.loads(line) for line in file]

In [3]:
mapping = {
    "Mowa nienawiści": "Hate",
    "Neutralny": "Neutral",
    "Wzmacnianie": "Strenghtening",
    "Osłabianie": "Weakening",
    "Neutral": "Neutral",
    "Hate": "Hate",
    "Strenghtening": "Strenghtening",
    "Weakening": "Weakening",
    "Odwracanie": "Reversing",
    "Reversing": "Reversing",
}

In [4]:

def cohen_kappa(confusion_matrix):
    """
    Calculate Cohen's Kappa based on a confusion matrix.

    Parameters:
    confusion_matrix (numpy array): A 2x2 confusion matrix where:
        confusion_matrix[0, 0] = a (both raters said Yes)
        confusion_matrix[0, 1] = b (rater 1 said Yes, rater 2 said No)
        confusion_matrix[1, 0] = c (rater 1 said No, rater 2 said Yes)
        confusion_matrix[1, 1] = d (both raters said No)

    Returns:
    float: Cohen's Kappa value
    """

    a = confusion_matrix[0, 0]
    b = confusion_matrix[0, 1]
    c = confusion_matrix[1, 0]
    d = confusion_matrix[1, 1]
    N = a + b + c + d

    p_o = (a + d) / N
    p_e = ((a + b) * (a + c) + (b + d) * (c + d)) / (N ** 2)
    kappa = (p_o - p_e) / (1 - p_e)

    return kappa


In [5]:
def fix_labels_adam(data):
    for anno in data:
        merged_labels = []
        anno['label'].sort(key=lambda x: x[0]) 
        
        for label in anno['label']:
            if not merged_labels:
                merged_labels.append(label)
            else:
                last_label = merged_labels[-1]
                if last_label[2] == label[2]:
                    last_label[1] = max(last_label[1], label[1])
                else:
                    merged_labels.append(label)

        anno['label'] = merged_labels
        for label in anno['label']:
            label[2] = mapping[label[2]]

    return data

In [6]:
adas = fix_labels_adam(adas)

In [23]:
adas_words = []
for line in adas:
    text = line["text"]
    for label in line["label"]:
        if label[2] not in ["Neutral", "Hate"]:
            start, stop, type_ = label
            adas_words.append({"word": text[start: stop], "type": type_})

In [30]:
janeczek_words = []
for line in janeczek:
    text = line["text"]
    for label in line["entities"]:
        if label["label"] not in ["Neutral", "Hate"]:
            start, stop, type_ = label["start_offset"], label["end_offset"], label["label"]
            janeczek_words.append({"word": text[start: stop], "type": type_})

In [24]:
adas_words

[{'word': 'na głowę już zbyt późno', 'type': 'Strenghtening'},
 {'word': 'debilu', 'type': 'Strenghtening'},
 {'word': 'debile', 'type': 'Strenghtening'},
 {'word': 'pajacu', 'type': 'Strenghtening'},
 {'word': 'zakłamanym oszustem', 'type': 'Strenghtening'},
 {'word': 'Jebana menda.', 'type': 'Strenghtening'},
 {'word': 'wkurwia', 'type': 'Strenghtening'},
 {'word': 'kłamco', 'type': 'Strenghtening'},
 {'word': 'swastykę na ryju?', 'type': 'Strenghtening'},
 {'word': 'ale jakaś kultura obowiązuje', 'type': 'Weakening'},
 {'word': 'To proszę', 'type': 'Weakening'},
 {'word': 'trzeba mieć niepokoleji w głowie', 'type': 'Strenghtening'},
 {'word': '😁😀😀😀', 'type': 'Reversing'}]

In [31]:
janeczek_words

[{'word': ' Lecz się', 'type': 'Strenghtening'},
 {'word': 'na głowę już zbyt późno', 'type': 'Strenghtening'},
 {'word': 'debilu', 'type': 'Strenghtening'},
 {'word': 'debile', 'type': 'Strenghtening'},
 {'word': 'pajacu', 'type': 'Strenghtening'},
 {'word': 'Jebana menda.', 'type': 'Strenghtening'},
 {'word': 'Zdrajca i faszysta', 'type': 'Strenghtening'},
 {'word': 'wkurwia', 'type': 'Strenghtening'},
 {'word': 'ryju?', 'type': 'Strenghtening'},
 {'word': 'ZAJEBISCIE', 'type': 'Strenghtening'}]

In [33]:
import pandas as pd

In [52]:
df = pd.DataFrame(
    {
     "Phrase": ["na głowę już zbyt późno", "debilu", "debilu", "pajacu", "Jebana menda", "zakłamanym oszustem", "Zdrajca i fasztysta", "wkurwia", "ryju", "To proszę", "trzeba mieć niepokoleji w głowie", "😁😀😀😀", "ZAJEBISCIE", "kłamco", "ale jakaś kultura obowiązuje"],
     "Adas": ["Strenghtening", "Strenghtening", "Strenghtening", "Strenghtening", "Strenghtening", "Strenghtening", None, None, "Strenghtening", "Weakening", "Strenghtening", "Reversing", "Strenghtening", "Strenghtening", "Weakening"],
     "Janek": ["Strenghtening", "Strenghtening", "Strenghtening", "Strenghtening", "Strenghtening", None, "Strenghtening", "Strenghtening", "Strenghtening", None, None, None, "Strenghtening", None, None]
     }
)

df

Unnamed: 0,Phrase,Adas,Janek
0,na głowę już zbyt późno,Strenghtening,Strenghtening
1,debilu,Strenghtening,Strenghtening
2,debilu,Strenghtening,Strenghtening
3,pajacu,Strenghtening,Strenghtening
4,Jebana menda,Strenghtening,Strenghtening
5,zakłamanym oszustem,Strenghtening,
6,Zdrajca i fasztysta,,Strenghtening
7,wkurwia,,Strenghtening
8,ryju,Strenghtening,Strenghtening
9,To proszę,Weakening,


In [127]:
matrix = np.array([
    [7, 3],
    [3, 0]
])

In [128]:
cohen_kappa(matrix)

-0.29999999999999993

In [92]:
with open("../data/output/labeled_sample_adam_2.jsonl", "r") as file:
    adas = [json.loads(line) for line in file]

with open("../data/output/labeled_sample_jan_2.jsonl", "r") as file:
    janeczek = [json.loads(line) for line in file]

In [93]:
adas_words = []
for line in adas:
    text = line["text"]
    for label in line["label"]:
        if label[2] not in ["Neutralny", "Mowa nienawiści"]:
            start, stop, type_ = label
            adas_words.append({"word": text[start: stop], "type": type_})

In [94]:
janeczek_words = []
for line in janeczek:
    text = line["text"]
    for label in line["entities"]:
        if label["label"] not in ["Neutralny", "Mowa nienawiści"]:
            start, stop, type_ = label["start_offset"], label["end_offset"], label["label"]
            janeczek_words.append({"word": text[start: stop], "type": type_})

In [120]:
adas_words

[{'word': 'głupia cipo.', 'type': 'Wzmacnianie'},
 {'word': 'Kurwa', 'type': 'Wzmacnianie'},
 {'word': 'jebany', 'type': 'Wzmacnianie'},
 {'word': 'kurwa', 'type': 'Wzmacnianie'},
 {'word': 'gnoju', 'type': 'Wzmacnianie'},
 {'word': 'A taki był porządny program', 'type': 'Osłabianie'},
 {'word': 'ja też spierdalam', 'type': 'Odwracanie'},
 {'word': 'polaku złoty ptaku', 'type': 'Osłabianie'},
 {'word': 'Tak jak Ty i Tobie podobni.', 'type': 'Wzmacnianie'},
 {'word': 'pajacu', 'type': 'Wzmacnianie'},
 {'word': 'Taki pospolity', 'type': 'Wzmacnianie'},
 {'word': 'ćwierćmózg', 'type': 'Wzmacnianie'},
 {'word': 'Kolesiostwo pierdolone.', 'type': 'Wzmacnianie'},
 {'word': 'największych debili ', 'type': 'Wzmacnianie'},
 {'word': 'zjeby', 'type': 'Wzmacnianie'}]

In [121]:
len(adas_words)

15

In [122]:
janeczek_words

[{'word': 'głupia cipo.', 'type': 'Wzmacnianie'},
 {'word': 'Kurwa ', 'type': 'Wzmacnianie'},
 {'word': 'jebany', 'type': 'Wzmacnianie'},
 {'word': 'kurwa', 'type': 'Wzmacnianie'},
 {'word': 'gnoju?', 'type': 'Wzmacnianie'},
 {'word': 'ja też spierdalam', 'type': 'Odwracanie'},
 {'word': 'pisdzielstwo', 'type': 'Wzmacnianie'},
 {'word': 'parchu', 'type': 'Wzmacnianie'},
 {'word': 'pajacu', 'type': 'Wzmacnianie'},
 {'word': 'ćwierćmózg', 'type': 'Wzmacnianie'},
 {'word': 'Parchu', 'type': 'Wzmacnianie'},
 {'word': 'Kolesiostwo pierdolone.', 'type': 'Wzmacnianie'},
 {'word': ' Lecz się', 'type': 'Wzmacnianie'},
 {'word': 'debili', 'type': 'Wzmacnianie'},
 {'word': 'zjeby', 'type': 'Wzmacnianie'},
 {'word': 'szczyj na nich', 'type': 'Wzmacnianie'}]

In [123]:
len(janeczek_words)

16

In [124]:
df = pd.DataFrame(
    {
     "Phrase": ['głupia cipo', 'Kurwa', 'jebany', 'kurwa', 'gnoju', 'A taki był porządny program', 'ja też spierdalam', 'polaku złoty ptaku', 'Tak jak Ty i Tobie podobni.', 'pajacu', 'ćwierćmózg', 'Kolesiostwo pierdolone.', 'debili ', 'zjeby', 'pisdzielstwo', 'parchu', 'szczyj na nich', 'Parchu', 'Lecz się'],
     "Adas": ['Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Osłabianie', 'Odwracanie', 'Osłabianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', None, None, None, None, None],
     "Janek": ['Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', None, 'Odwracanie', None, None, 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie', 'Wzmacnianie'],
     
     }
)

df

Unnamed: 0,Phrase,Adas,Janek
0,głupia cipo,Wzmacnianie,Wzmacnianie
1,Kurwa,Wzmacnianie,Wzmacnianie
2,jebany,Wzmacnianie,Wzmacnianie
3,kurwa,Wzmacnianie,Wzmacnianie
4,gnoju,Wzmacnianie,Wzmacnianie
5,A taki był porządny program,Osłabianie,
6,ja też spierdalam,Odwracanie,Odwracanie
7,polaku złoty ptaku,Osłabianie,
8,Tak jak Ty i Tobie podobni.,Wzmacnianie,
9,pajacu,Wzmacnianie,Wzmacnianie
