In [None]:
import collections
import dataclasses
import typing

import matplotlib.pyplot as plt
import numpy as np
import tqdm

import qut01

# project-provided logging setup, called once before anything else runs in this notebook
# NOTE(review): presumably configures log handlers/levels for interactive analysis — confirm in qut01.utils.logging
qut01.utils.logging.setup_logging_for_analysis_script()


def _new_count_list_field():
    """Returns a new dataclass field whose default is a fresh, empty list for every instance."""
    return dataclasses.field(default_factory=list)


@dataclasses.dataclass
class AnnotationAgreementStats:
    """Per-statement sentence counts used to measure the agreement between two annotators.

    Every attribute is a list of integers. The lists are meant to be filled in parallel: the
    extraction loop of this notebook appends exactly one value to each of them for every statement
    it retains for a given annotation class, so index `i` refers to the same statement in all lists.
    Statements that the extraction loop skips contribute no entry at all.
    """

    # sentences extracted (pos) / not extracted (neg) by each annotator, counted independently
    annot1_pos_counts: typing.List[int] = _new_count_list_field()
    annot1_neg_counts: typing.List[int] = _new_count_list_field()
    annot2_pos_counts: typing.List[int] = _new_count_list_field()
    annot2_neg_counts: typing.List[int] = _new_count_list_field()

    # sentences extracted by both annotators / by neither annotator / by exactly one annotator
    agreed_positive_counts: typing.List[int] = _new_count_list_field()
    agreed_negative_counts: typing.List[int] = _new_count_list_field()
    disagreed_counts: typing.List[int] = _new_count_list_field()

In [None]:
# open the deeplake dataset itself, on the branch that holds the annotations
dataset_path = qut01.data.dataset_parser.get_default_deeplake_dataset_path()
dataset = qut01.data.dataset_parser.get_deeplake_dataset(
    checkout_branch=qut01.data.dataset_parser.dataset_annotated_branch_name,  # NOTE: not validated data!
    dataset_path=dataset_path,
)

# wrap the dataset in the project's parser, which is what the rest of the notebook talks to
data_parser = qut01.data.dataset_parser.DataParser(
    use_processed_data_cache=False,  # we will iterate over the entire dataset below, caching might go out of memory
    dataset_path_or_object=dataset,
)

# merge the statement ids of every group returned by the parser into a single set (duplicates collapse)...
potentially_annotated_statement_ids = data_parser.get_potentially_annotated_statement_ids()
all_annot_sids = {
    sid
    for annot_sids in potentially_annotated_statement_ids.values()
    for sid in annot_sids
}

# ...and translate each of those ids into its position inside the parser's list of statement ids
target_sidxs = list(map(data_parser.statement_ids.index, all_annot_sids))

# annotation classes for which agreement is measured in the cells below
target_annots = qut01.data.classif_utils.ANNOT_C2C3C4C5C6_CLASS_NAMES

In [None]:
# one independent accumulator of per-statement counts for each annotation class of interest
agreement_stats = {annot_name: AnnotationAgreementStats() for annot_name in target_annots}

for target_sidx in tqdm.tqdm(target_sidxs, desc="extracting annotations from statements"):
    processed_data = data_parser.get_processed_data(target_sidx)
    annot_counts = processed_data.annotation_counts
    target_annot_counts = [annot_counts.get(annot, 0) for annot in target_annots]
    # should never have more than double-annotated statements
    assert not any([count > 2 for count in target_annot_counts])
    if not any(target_annot_counts):
        # no target annotation found at all for this statement: nothing to compare
        continue

    for target_annot in target_annots:
        if annot_counts.get(target_annot, 0) < 2:
            # the annotated data of (at least) one of the two annotators is missing for this class,
            # so there is no pair to compare; skip this agreement analysis
            continue
        stats = agreement_stats[target_annot]

        # grab the two annotations of this class; the statement should be double-annotated, always?
        found_annots = [annot for annot in processed_data.annotations if annot.name == target_annot]
        assert len(found_annots) == 2
        first_annot, second_annot = found_annots

        # for chance agreement stats, log the positive/negative counts for each annotator independently
        # NOTE(review): a sentence matched by several chunks of the same annotation is counted once per
        # chunk here — confirm that chunks never share sentences
        total_sentences = len(processed_data.sentences)
        for annot, pos_counts, neg_counts in (
            (first_annot, stats.annot1_pos_counts, stats.annot1_neg_counts),
            (second_annot, stats.annot2_pos_counts, stats.annot2_neg_counts),
        ):
            pos_sentences = sum(len(chunk.matched_sentences_orig_idxs) for chunk in annot.chunks)
            pos_counts.append(pos_sentences)
            neg_counts.append(total_sentences - pos_sentences)

        # now, go sentence by sentence and tally how many annotators extracted it for this class:
        # 0 -> both left it out (agreement), 1 -> only one took it (disagreement), 2 -> both took it (agreement)
        extracted_by_tally = {0: 0, 1: 0, 2: 0}
        for sentence_annot_counts in processed_data.sentence_annotation_counts:
            extracted_by = sentence_annot_counts[target_annot]
            if extracted_by not in extracted_by_tally:
                raise AssertionError("sentence was annotated by more than two annotators?")
            extracted_by_tally[extracted_by] += 1
        stats.agreed_positive_counts.append(extracted_by_tally[2])
        stats.agreed_negative_counts.append(extracted_by_tally[0])
        stats.disagreed_counts.append(extracted_by_tally[1])

In [None]:
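# Optional: uncomment to cache the extracted `agreement_stats` on disk, so that a later session can
# reload them without re-running the extraction loop over the dataset.
# import pickle
#
# with open("/tmp/agreement_stats.pkl", "wb") as fd:
#     pickle.dump(agreement_stats, fd)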

In [None]:
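# Optional: uncomment to reload `agreement_stats` previously cached by the cell above instead of
# re-running the extraction. Only unpickle files you created yourself: pickle can execute arbitrary code.
# import pickle
#
# with open("/tmp/agreement_stats.pkl", "rb") as fd:
#     agreement_stats = pickle.load(fd)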

In [None]:
# Intersection-over-union style agreement: for each double-annotated statement and annotation class,
#   IoU = (#sentences extracted by both annotators) / (#sentences extracted by at least one annotator)
# with IoU defined as 1.0 when neither annotator extracted anything. The per-class means are printed,
# and the mean IoU is plotted against the statement length (binned total sentence count).
print("IAA (as proposed in the paper, but computed after real sentence matching):")

fig, ax = plt.subplots(figsize=(10, 6))
sentence_count_bin_size = 5  # width (in sentences) of the statement-length bins used for the plot
plot_only_statements_with_at_least_one_positive = True  # drop all-negative statements from the plot only

target_annot_ious = {}  # mean IoU per annotation class (input of the macro average)
total_ious = []  # IoU of every (class, statement) pair (input of the micro average)

for target_annot in target_annots:
    stats = agreement_stats[target_annot]
    # build one record per statement *before* dropping empty statements: this way, the sentence count,
    # the IoU inputs and the "has a positive" flag of a statement always stay aligned with each other,
    # no matter how many statements get filtered out
    statement_records = [
        (apos + aneg + disag, apos, disag, bool(pos1 or pos2))
        for apos, aneg, disag, pos1, pos2 in zip(
            stats.agreed_positive_counts,
            stats.agreed_negative_counts,
            stats.disagreed_counts,
            stats.annot1_pos_counts,
            stats.annot2_pos_counts,
        )
        if (apos + aneg + disag) > 0  # statements without any sentence cannot be scored
    ]
    if not statement_records:
        # nothing to average or plot for this class (avoids NaN means and unpacking an empty zip)
        print(f"\t{target_annot}: n/a  (no double-annotated statements found)")
        continue

    iou_scores = np.asarray(
        [
            (agreed / (agreed + disagreed)) if (agreed + disagreed) > 0 else 1.0
            for _, agreed, disagreed, _ in statement_records
        ]
    )
    print(f"\t{target_annot}: {np.mean(iou_scores):.2f}  (stddev={np.std(iou_scores):.2f})")
    target_annot_ious[target_annot] = np.mean(iou_scores)
    total_ious.extend(iou_scores)

    # group the IoU scores by statement length, using bins of `sentence_count_bin_size` sentences
    ious_per_binned_sentence_count = collections.defaultdict(list)
    for (sentence_count, _, _, has_positive), iou in zip(statement_records, iou_scores):
        if plot_only_statements_with_at_least_one_positive and not has_positive:
            continue
        ious_per_binned_sentence_count[sentence_count // sentence_count_bin_size].append(iou)
    if not ious_per_binned_sentence_count:
        continue  # every statement was filtered out: there is no curve to draw for this class

    # each bin is drawn at its lower bound, in increasing order of statement length
    sorted_pairs = sorted(
        (bin_idx * sentence_count_bin_size, np.mean(ious))
        for bin_idx, ious in ious_per_binned_sentence_count.items()
    )
    x_vals, y_vals = zip(*sorted_pairs)
    ax.plot(x_vals, y_vals, label=target_annot)

# macro: every class weighs the same; micro: every (class, statement) pair weighs the same
print(f"overall (macro): {np.mean([v for v in target_annot_ious.values()]):.2f}")
print(f"overall (micro): {np.mean(total_ious):.2f}")

ax.set_xlim(0, 300)
ax.set_xlabel("Total sentence count", fontsize=12)
ax.set_ylabel("Average IAA (true-only)", fontsize=12)
ax.legend()
fig.tight_layout()
fig.savefig("iaa_vs_statement_length.pdf")
plt.show()

In [None]:
# Observed agreement: fraction of a statement's sentences on which both annotators made the same
# decision (both extracted it, or both left it out), averaged over the statements of each class.
print("IAA (classic 'observed agreement' definition, which includes negatives):")
for target_annot in target_annots:
    stats = agreement_stats[target_annot]
    per_statement_agreement = []
    for n_both, n_neither, n_single in zip(
        stats.agreed_positive_counts,
        stats.agreed_negative_counts,
        stats.disagreed_counts,
    ):
        n_sentences = n_both + n_neither + n_single
        if n_sentences > 0:  # statements without any sentence cannot be scored
            per_statement_agreement.append((n_both + n_neither) / n_sentences)
    iou_scores = np.asarray(per_statement_agreement)
    print(f"\t{target_annot}: {np.mean(iou_scores):.3f}  (stddev={np.std(iou_scores):.3f})")

In [None]:
# Cohen's kappa per annotation class: kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed
# agreement and p_e the agreement expected by chance given how often each annotator says yes/no.
print("Cohen's Kappa:")
for target_annot in target_annots:
    stats = agreement_stats[target_annot]
    # the 'chance agreement' probability is defined based on: https://en.wikipedia.org/wiki/Cohen%27s_kappa
    # note: here, we total across all statements; summing the plain lists with the built-in `sum` keeps
    # the totals as arbitrary-precision python ints, so the products below cannot overflow
    total_annot1_pos = sum(stats.annot1_pos_counts)
    total_annot1_neg = sum(stats.annot1_neg_counts)
    total_annot2_pos = sum(stats.annot2_pos_counts)
    total_annot2_neg = sum(stats.annot2_neg_counts)
    total_sentences = total_annot1_pos + total_annot1_neg

    # note: same 'observed agreement' as with the classic IAA approach above
    per_statement_agreement = [
        (agreed_pos_count + agreed_neg_count) / (agreed_pos_count + agreed_neg_count + disagreed_count)
        for agreed_pos_count, agreed_neg_count, disagreed_count in zip(
            stats.agreed_positive_counts,
            stats.agreed_negative_counts,
            stats.disagreed_counts,
        )
        if (agreed_pos_count + agreed_neg_count + disagreed_count) > 0
    ]

    if total_sentences == 0 or not per_statement_agreement:
        # without any double-annotated sentence, both probabilities are undefined (0 / 0)
        print(f"\t{target_annot}: n/a  (no double-annotated sentences found)")
        continue

    chance_agreement = ((total_annot1_pos * total_annot2_pos) + (total_annot1_neg * total_annot2_neg)) / (
        total_sentences**2
    )
    # NOTE(review): the observed agreement is an average of per-statement ratios, whereas the chance
    # agreement above is pooled over all sentences of all statements — confirm that mixing the two
    # averaging schemes is intended
    observed_agreement = np.mean(per_statement_agreement)

    if chance_agreement < 1:
        kappa = (observed_agreement - chance_agreement) / (1 - chance_agreement)
    else:
        # both annotators always gave the same single answer: kappa is undefined (division by zero)
        kappa = float("nan")
    print(f"\t{target_annot}: {kappa:.2f}  (chance agreement={chance_agreement:.2f})")