In [1]:
import json
import numpy as np

def load_annotations(file_path):
    """Load annotation records from a JSON Lines file.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are not
        # records; skip them instead of letting json.loads fail on them.
        return [json.loads(line) for line in file if line.strip()]

# Locations of the per-annotator JSONL exports (relative to this notebook).
lukasz_file = r"../data/output/labeled_sample_lukasz_3.jsonl"
adam_file = r"../data/output/labeled_sample_adam_3.jsonl"
jan_file = r"../annotations_jan/labeled_sample_jan_3.jsonl"

# Load every annotator's records, in the same order as the paths above.
lukasz_annotations, adam_annotations, jan_annotations = [
    load_annotations(path) for path in (lukasz_file, adam_file, jan_file)
]

def compute_statistics(annotations):
    """Summarise text lengths and label distribution for a set of annotations.

    Args:
        annotations: Sequence of dicts. Each must have a ``'text'`` string and
            may have an ``'entities'`` list whose items are dicts with a
            ``'label'`` key. A missing or ``None`` ``'entities'`` value is
            treated as "no labels".

    Returns:
        A dict of plain Python numbers with the keys:

        * ``num_entries`` - number of annotation records.
        * ``mean_length``, ``std_length``, ``min_length``, ``max_length``,
          ``median_length`` - statistics of the whitespace-separated token
          count of ``'text'`` (population standard deviation). All are 0 when
          ``annotations`` is empty.
        * ``neutral_count`` / ``hate_count`` - occurrences of the labels
          ``"Neutralny"`` and ``"Mowa nienawiści"``.
        * ``neutral_percentage`` / ``hate_percentage`` - those counts as a
          percentage of *all* labels found, so they need not sum to 100 when
          other labels are present; 0 when there are no labels.
        * ``multilabel_count`` - number of records with more than one entity.
    """
    num_entries = len(annotations)
    lengths = [len(annotation['text'].split()) for annotation in annotations]

    # Normalise a missing or None 'entities' field to an empty list once, so
    # the label extraction and the multi-label count agree with each other.
    entity_lists = [annotation.get('entities') or [] for annotation in annotations]
    labels = [entity['label'] for entities in entity_lists for entity in entities]

    neutral_count = labels.count("Neutralny")
    hate_count = labels.count("Mowa nienawiści")

    if lengths:
        # Convert NumPy scalars to built-in numbers so the result prints
        # cleanly and can be serialised (e.g. with json.dumps).
        mean_length = float(np.mean(lengths))
        std_length = float(np.std(lengths))
        min_length = int(np.min(lengths))
        max_length = int(np.max(lengths))
        median_length = float(np.median(lengths))
    else:
        # np.min / np.max raise ValueError on an empty sequence and np.mean
        # yields NaN; report zeros for an empty data set instead.
        mean_length = std_length = median_length = 0.0
        min_length = max_length = 0

    total_labels = len(labels)
    if total_labels > 0:
        neutral_percentage = (neutral_count / total_labels) * 100
        hate_percentage = (hate_count / total_labels) * 100
    else:
        neutral_percentage = 0.0
        hate_percentage = 0.0

    multilabel_count = sum(1 for entities in entity_lists if len(entities) > 1)

    return {
        "num_entries": num_entries,
        "mean_length": mean_length,
        "std_length": std_length,
        "min_length": min_length,
        "max_length": max_length,
        "median_length": median_length,
        "neutral_count": neutral_count,
        "hate_count": hate_count,
        "neutral_percentage": neutral_percentage,
        "hate_percentage": hate_percentage,
        "multilabel_count": multilabel_count,
    }

# Compute the summary for each annotator.
lukasz_stats = compute_statistics(lukasz_annotations)
adam_stats = compute_statistics(adam_annotations)
jan_stats = compute_statistics(jan_annotations)

# Print each summary under its (Polish) heading; the leading "\n" on the
# second and third headings separates the sections with a blank line.
for heading, stats in (
    ("Statystyki dla anotacji Łukasza:", lukasz_stats),
    ("\nStatystyki dla anotacji Adama:", adam_stats),
    ("\nStatystyki dla anotacji Jana:", jan_stats),
):
    print(heading)
    print(stats)

Statystyki dla anotacji Łukasza:
{'num_entries': 100, 'mean_length': 12.66, 'std_length': 4.373145321161875, 'min_length': 4, 'max_length': 22, 'median_length': 12.5, 'neutral_count': 0, 'hate_count': 0, 'neutral_percentage': 0, 'hate_percentage': 0, 'multilabel_count': 0}

Statystyki dla anotacji Adama:
{'num_entries': 49, 'mean_length': 12.387755102040817, 'std_length': 4.443827760446481, 'min_length': 6, 'max_length': 23, 'median_length': 12.0, 'neutral_count': 0, 'hate_count': 0, 'neutral_percentage': 0, 'hate_percentage': 0, 'multilabel_count': 0}

Statystyki dla anotacji Jana:
{'num_entries': 50, 'mean_length': 12.44, 'std_length': 5.575517913162866, 'min_length': 6, 'max_length': 25, 'median_length': 11.0, 'neutral_count': 45, 'hate_count': 5, 'neutral_percentage': 83.33333333333334, 'hate_percentage': 9.25925925925926, 'multilabel_count': 3}
