In [None]:
import collections
import json

import krippendorff
import numpy as np
from pydantic import BaseModel
from rapidfuzz import fuzz
from sklearn.metrics import cohen_kappa_score

from evaluation.utils import get_annotations, suggestions_are_same
from evaluation.utils import get_experiment_infos

In [None]:
class StopwatchSuggestion(BaseModel):
    """One gold "stopwatch" annotation, validated from a JSON record.

    Elsewhere in this notebook an instance is attached to a generated
    suggestion when every string in ``matches`` occurs (case-insensitively)
    in the suggestion's string form and the suggestion's ``entry.end`` lies in
    a window around ``time``.
    """

    # Timestamp of the annotation. It is compared against ``entry.end`` with a
    # -30 / +300 window; presumably seconds on the same clock - TODO confirm.
    time: float  # the time the annotation happened
    # Every one of these substrings has to be present for a suggestion to match.
    matches: list[str]  # the list of strings the suggestion must match to satisfy this label
    # Substrings that should rule a suggestion out. Optional; defaults to None.
    # NOTE(review): the matching code visible in this notebook never reads it.
    antimatches: list[str] | None = None  # a list of strings that cannot match


# Load the gold stopwatch annotations. The JSON file maps an experiment id to a
# list of raw annotation records; each record is validated into a
# StopwatchSuggestion. The encoding is pinned to UTF-8 so that parsing does not
# depend on the platform's locale default.
with open("gold-stopwatch.json", encoding="utf-8") as f:
    stopwatch_annotation_data = json.load(f)

# experiment id -> list[StopwatchSuggestion]
all_stopwatch_annotations = {
    experiment_id: [StopwatchSuggestion.model_validate(a) for a in stopwatch_annotations]
    for experiment_id, stopwatch_annotations in stopwatch_annotation_data.items()
}

# Experiment descriptors; every later cell iterates over these and reads `.id`.
experiments = get_experiment_infos()

## Human annotations

In [None]:
def get_annotation_index(experiment_id):
    """Group an experiment's human annotations by the suggestion they rate.

    Suggestions for which *every* annotation's ``why`` contains the substring
    ``"copied"`` are dropped from the result.

    Returns a ``collections.defaultdict(list)`` mapping ``suggestion_id`` to
    the list of its annotations, in the order ``get_annotations`` yielded them.
    Because it is a defaultdict, looking up an unknown id inserts an empty list.
    """
    index = collections.defaultdict(list)
    for human_annotation in list(get_annotations(experiment_id)):
        index[human_annotation.suggestion_id].append(human_annotation)

    # Collect the ids first so the mapping is not resized while it is iterated.
    copy_only_ids = [
        sid for sid, group in index.items() if all("copied" in entry.why for entry in group)
    ]
    for sid in copy_only_ids:
        del index[sid]

    return index


# annotations_by_suggestion_id

### Inter-annotator agreement (IAA)

In [None]:
# Cohen's kappa over the suggestions that received exactly two annotations.
# Each score is binarised to "positive" (score > 0) versus "not positive";
# y1 holds the first annotation's label and y2 the second's.
y1, y2 = [], []
for experiment in experiments:
    annotation_index = get_annotation_index(experiment.id)
    for rated_twice in annotation_index.values():
        if len(rated_twice) != 2:
            continue
        first_rating, second_rating = rated_twice
        y1.append(first_rating.score > 0)
        y2.append(second_rating.score > 0)
cohen_kappa_score(y1, y2)

In [None]:
# How many doubly-annotated suggestions the two binarised labels disagree on.
len([1 for first_label, second_label in zip(y1, y2) if first_label != second_label])

In [None]:
# Krippendorff's alpha: build the reliability matrix.
# Rows are annotators and columns are suggestions; a cell holds that
# annotator's score for that suggestion, or NaN where they did not rate it.
annotators = ["osowande", "akeri13", "jludan", "osgoodev", "feshbach", "pdotsamp", "andrz"]

# The total number of annotations is an upper bound on the number of distinct
# suggestions, so trailing columns may stay entirely NaN.
annotation_count = sum(len(list(get_annotations(experiment.id))) for experiment in experiments)
reliability_data = np.full((len(annotators), annotation_count), np.nan)

# Dict lookups replace the repeated list.index() scans, which were quadratic
# in the number of suggestions.
annotator_row = {who: row for row, who in enumerate(annotators)}
suggestion_column = {}  # suggestion_id -> column, in first-seen order

for experiment in experiments:
    for annotation in get_annotations(experiment.id):
        row = annotator_row.get(annotation.who)
        if row is None:
            # annotations from anyone outside the fixed annotator list are ignored
            continue
        column = suggestion_column.setdefault(annotation.suggestion_id, len(suggestion_column))
        reliability_data[row, column] = annotation.score

# Kept as a list, in first-seen order, matching the column order of reliability_data.
seen_suggestion_ids = list(suggestion_column)

In [None]:
# Ordinal alpha over the annotator x suggestion matrix; the value domain lists
# the four score levels passed to the metric (there is no 0 level).
krippendorff.alpha(
    reliability_data=reliability_data,
    value_domain=[-2.0, -1.0, 1.0, 2.0],
    level_of_measurement="ordinal",
)

## Merge contiguous speech from the same NPC for partial ratio

In [None]:
from rapidfuzz.utils import default_process
import itertools


def myratio(s1, s2, **kwargs):
    """Token-set similarity of two strings, damped by their length mismatch.

    The score is ``fuzz.token_set_ratio(s1, s2, **kwargs)`` divided by
    ``1 + (r - 1) / 5``, where ``r >= 1`` is the length of the longer string
    over the length of the shorter one. Equal-length strings are therefore not
    penalised, and a 6x length difference halves the score.

    Args:
        s1, s2: the strings to compare. Lengths are measured on the raw
            strings, before any ``processor`` passed in ``kwargs`` is applied.
        **kwargs: forwarded unchanged to ``fuzz.token_set_ratio``.

    Returns:
        The damped score as a float. An empty (or ``None``) input yields 0.0
        rather than a ``ZeroDivisionError`` from the length ratio.
    """
    # Nothing to compare, and the length ratio below would divide by zero.
    if not s1 or not s2:
        return 0.0

    longer = max(len(s1), len(s2))
    shorter = min(len(s1), len(s2))
    len_ratio = longer / shorter
    # Soften the penalty: only a fifth of the excess ratio counts.
    len_ratio = ((len_ratio - 1) / 5) + 1
    return fuzz.token_set_ratio(s1, s2, **kwargs) / len_ratio


# For every experiment, find NPC-speech suggestions that `suggestions_are_same`
# treats as equivalent and collect them into disjoint groups of suggestion ids.
# Result: experiment id -> list of sets of suggestion ids.
same_suggestion_ids = {}
for experiment in experiments:
    experiment_id = experiment.id
    same_pairs = []
    annotations_by_suggestion_id = get_annotation_index(experiment_id)

    # Ordered pairs: only the first member is required to be NPC speech, so both
    # orderings of each pair are tried.
    for (id1, an1), (id2, an2) in itertools.permutations(annotations_by_suggestion_id.items(), 2):
        # the first annotation of each suggestion stands in for the suggestion
        an1 = an1[0]
        an2 = an2[0]
        if not (
            an1.entry.suggestion["suggest_type"] == "foundry"
            and an1.entry.suggestion["action"]["type"] == "send_npc_speech"
        ):
            continue
        if suggestions_are_same(
            an1.entry,
            an2.entry,
            # wider tolerance when either suggestion was rated positively
            tolerance=300 if an1.score > 0 or an2.score > 0 else 30,
            npc_speech_similarity_ratio=myratio,
            npc_speech_similarity_threshold=80,
        ):
            # print(an1, an2)
            print(an1.entry.suggestion["action"]["text"], an1.score)
            print(an2.entry.suggestion["action"]["text"], an2.score)
            print(
                myratio(
                    an1.entry.suggestion["action"]["text"],
                    an2.entry.suggestion["action"]["text"],
                    processor=default_process,
                )
            )
            print()
            same_pairs.append((an1.suggestion_id, an2.suggestion_id))

    # Build equivalence groups as the transitive closure of the pairs. A pair
    # may bridge two groups that already exist (pairs (a, b), (c, d), then
    # (b, c)); every group touched by the pair is folded into one set, so the
    # groups stay disjoint and no id can appear in more than one of them.
    equivalence_groups = []
    for a, b in same_pairs:
        merged = {a, b}
        untouched = []
        for grp in equivalence_groups:
            if a in grp or b in grp:
                merged |= grp
            else:
                untouched.append(grp)
        untouched.append(merged)
        equivalence_groups = untouched
    same_suggestion_ids[experiment_id] = equivalence_groups

In [None]:
# One line per experiment: its id and how many equivalence groups it has.
for session_id in same_suggestion_ids:
    group_count = len(same_suggestion_ids[session_id])
    print(session_id, group_count)

## Write for manual deduplication

In [None]:
# For each experiment, write ../annotations/to-dedup-<id>.jsonl: blank-line
# separated groups of JSON records, ordered by time, for manual deduplication.
for experiment in experiments:
    experiment_id = experiment.id

    annotations_by_suggestion_id = get_annotation_index(experiment_id)

    # When all annotations of a suggestion agree in sign (all positive or all
    # negative), keep only the first one. Replacing the value of an existing
    # key does not resize the dict, so this is safe during iteration.
    for suggestion_id, suggestion_annotations in annotations_by_suggestion_id.items():
        if all(a.score > 0 for a in suggestion_annotations) or all(a.score < 0 for a in suggestion_annotations):
            annotations_by_suggestion_id[suggestion_id] = [suggestion_annotations[0]]

    # Keep a shallow copy of the full index, then reduce the working index to
    # suggestions with at least one positive annotation. The copy shares its
    # list objects with the working index, so stopwatch annotations appended
    # below are visible through both.
    annotations_by_suggestion_id_all = annotations_by_suggestion_id.copy()
    for suggestion_id, suggestion_annotations in annotations_by_suggestion_id.copy().items():
        if not any(a.score > 0 for a in suggestion_annotations):
            annotations_by_suggestion_id.pop(suggestion_id)

    # Lowercased string form and end time of each remaining suggestion. They
    # come from the first annotation, which stays first because the loop below
    # only appends, so they are computed once per suggestion.
    match_candidates = [
        (str(group[0].entry.suggestion).lower(), group[0].entry.end, group)
        for group in annotations_by_suggestion_id.values()
    ]

    # Attach each stopwatch annotation to the first suggestion it matches.
    # NOTE(review): `antimatches` is not consulted here - confirm that is intended.
    stopwatch_annotations = all_stopwatch_annotations[experiment_id]
    unmatched_stopwatch_annotations = []
    for stopwatch_annotation in stopwatch_annotations:
        # fix: make sure SG doesn't match SGK
        # NOTE(review): this rewrites the shared gold annotation in place, so
        # the altered string is also what ends up in the output file.
        if "Ser Gordon" in stopwatch_annotation.matches:
            stopwatch_annotation.matches[stopwatch_annotation.matches.index("Ser Gordon")] = 'Ser Gordon"'

        required = [s.lower() for s in stopwatch_annotation.matches]
        # The suggestion must end between 30 before and 300 after the
        # stopwatch time (presumably seconds - TODO confirm).
        window_start = stopwatch_annotation.time - 30
        window_end = stopwatch_annotation.time + 300

        for suggestion_text, suggestion_end, suggestion_annotations in match_candidates:
            if all(s in suggestion_text for s in required) and window_start <= suggestion_end <= window_end:
                suggestion_annotations.append(stopwatch_annotation)
                break
        else:
            # not matched
            unmatched_stopwatch_annotations.append(stopwatch_annotation)

    # get the final dedup groups
    dedup_groups = []  # (time, annotations[])
    processed = set()
    for suggestion_id, suggestion_annotations in annotations_by_suggestion_id.items():
        if suggestion_id in processed:
            # already folded into an earlier suggestion's group
            continue

        # work on a copy so the index lists are not extended
        suggestion_annotations = suggestion_annotations.copy()

        # merge equivalent annotations into the same dedup group; the
        # equivalents come from the full index, so they may be negative-only
        for equivalence_group in same_suggestion_ids[experiment_id]:
            if suggestion_id in equivalence_group:
                processed.add(suggestion_id)
                for equiv_id in equivalence_group:
                    if equiv_id == suggestion_id:
                        continue
                    processed.add(equiv_id)
                    suggestion_annotations.extend(annotations_by_suggestion_id_all[equiv_id])

        # a group is placed in time by the end of its first (human) annotation
        dedup_groups.append((suggestion_annotations[0].entry.end, suggestion_annotations))
    for stopwatch_annotation in unmatched_stopwatch_annotations:
        dedup_groups.append((stopwatch_annotation.time, [stopwatch_annotation]))

    # The records contain free text, so the encoding is pinned to UTF-8 rather
    # than left to the platform's locale default.
    with open(f"../annotations/to-dedup-{experiment_id}.jsonl", "w", encoding="utf-8") as f:
        for ts, group in sorted(dedup_groups, key=lambda grp: grp[0]):
            for item in group:
                f.write(item.model_dump_json())
                f.write("\n")
            f.write("\n")