# MUC-4 Template Count Reannotation Analysis

This notebook computes annotator and model agreement statistics on total reannotated/predicted template counts with respect to the gold template counts on subsets of documents from the MUC-4 corpus. More specifically, it computes:

- (Nominal and Ratio) Krippendorff's $\alpha$ on template count agreement between (1) all three expert human annotators, (2) all three expert human annotators *plus* the gold counts &mdash; both on a subset of 42 documents taken from the MUC-4 train split.
- (Nominal and Ratio) Krippendorff's $\alpha$ on template count agreement between the template count predictions of each model studied in the paper (IterX, GTT, and TempGen) and the gold counts on the MUC-4 test split.

In [1]:
import json
import pandas as pd
import simpledorff

from collections import defaultdict
from typing import *

In [17]:
# Model Predictions
PREDICTIONS_DIR = "../model_predictions/"
ITERX_MUC_PREDICTIONS = PREDICTIONS_DIR + "iterx_preds_converted.json"
GTT_MUC_PREDICTIONS = PREDICTIONS_DIR + "gtt_preds.json"
TEMPGEN_MUC_PREDICTIONS = PREDICTIONS_DIR + "tempgen_preds_converted.json"

# Human annotations
ANNOTATIONS_DIR = "./annotations"
MUC_ANNOTATIONS_DIR = ANNOTATIONS_DIR + "/muc"
A1_MUC_ANNOTATIONS = MUC_ANNOTATIONS_DIR + "/muc_sample_a1.json"
A2_MUC_ANNOTATIONS = MUC_ANNOTATIONS_DIR + "/muc_sample_a2.json"
A3_MUC_ANNOTATIONS = MUC_ANNOTATIONS_DIR + "/muc_sample_a3.json"

# Gold data
MUC_TRAIN_DATA = "../data/train.jsonl"
MUC_TEST_DATA = "../data/test.jsonl"

# Annotation metadata
NUM_MUC_DOCS = 42
NUM_MUC_ANNOTATORS = 3
MUC_TEMPLATE_TYPES = ["arson", "attack", "bombing", "kidnapping", "robbery"]

# Curiously, the simpledorff package does not include the
# appropriate metric function for ratio data, so we define it ourselves
EPS = 1e-5
RATIO_METRIC_FN = lambda x,y: ((x - y) / max((x + y), EPS)) ** 2

In [19]:
def validate_muc_annotation(annotations: Dict[str, Union[str, int]]) -> None:
    """validate JSON MUC-4 annotations
    
    Verifies that an annotator's template count annotations
      (1) cover all documents
      (2) have non-zero template counts for each document
      (3) (ideally, though not necessarily) include comments
    
    :param annotations: MUC-4 template count annotations from a single annotator
    """
    assert len(annotations) == NUM_MUC_DOCS
    for doc_id, annotation in annotations.items():
        if not annotation["comments"]:
            print(f"No comments provided for document {doc_id}!")
        for template_type in MUC_TEMPLATE_TYPES:
            assert annotation[template_type] >= 0
            
            
def annotations_to_df(
    annotations: Dict[str, Union[str, int]],
    annotator_id: str,
    dataset: str = "muc"
) -> pd.DataFrame:
    """Convert template count annotations to a DataFrame for IAA calculation
    
    :param annotations: MUC-4 template count annotations from a single annotator
    :param annotator_id: A unique identifier for this annotator
    :param dataset: A string indicating which dataset these annotations are for
        ("muc" is the only valid option for this public version of the notebook)
    :returns: A Pandas DataFrame object with columns for document ID, annotator ID,
        total template count, and template counts for each template type
    """
    if dataset == "muc":
        template_types = MUC_TEMPLATE_TYPES
    else:
        raise ValueError(f"Unrecognized dataset {dataset}! Choices are: {', '.join(template_types)}")
    # sort the annotations by doc ID for consistent ordering
    # across annotators
    sorted_annotations = list(sorted(annotations.values(), key=lambda x: x["docid"]))
    
    # create the DataFrame for the agreement calculation
    df = pd.DataFrame()
    df["doc_id"] = [ex["docid"] for ex in sorted_annotations]
    df["annotator_id"] = annotator_id
    for template_type in template_types:
        df[template_type] = [ex[template_type] for ex in sorted_annotations]
    df["total"] = df[template_types].sum(axis=1)
    
    return df


def muc_data_to_df(data: Dict[str, Any], subset_doc_ids: Set[str] = set()) -> pd.DataFrame:
    """Convert gold annotations to a DataFrame for IAA calculation
    
    :param data: the gold data to be converted to a DataFrame
    :param subset_doc_ids: if specified, only annotations for the specified documents
        will be returned
        
    :returns: a DataFrame with documents along the rows and with: (1) two columns per
        template type, one denoting how many templates of that type occur in
        each document and another denoting whether any templates of that type occur
        in the document *at all*; (2) a column giving the total number of templates
        annotated for a document; (3) a column for the annotator ID; (4) a column for
        the doc ID
    """
    # We treat the gold data as "a(nnotator) 0"
    # (the human experts are annotators 1-3)
    DATA_ANNOTATOR_ID = "a0"
    
    # Sort gold annotations by doc ID, same
    # as we do for the human annotators
    sorted_data = sorted(data.items())
    
    # Collect gold template counts for each document
    all_template_counts = defaultdict(list)
    doc_ids = []
    for doc_id, annotation in sorted_data:
        if subset_doc_ids and doc_id not in subset_doc_ids:
            continue
        doc_ids.append(doc_id)
        doc_template_counts = Counter([t["incident_type"] for t in annotation["templates"]])
        total = 0
        for template_type in MUC_TEMPLATE_TYPES:
            type_count = doc_template_counts.get(template_type, 0)
            total += type_count
            all_template_counts[template_type].append(type_count)
        all_template_counts["total"].append(total)
    
    return pd.DataFrame({"doc_id": doc_ids, "annotator_id": DATA_ANNOTATOR_ID} | all_template_counts)


def muc_predictions_to_df(preds: Dict[str, Any], model_name: str) -> pd.DataFrame:
    """Convert template filling model predictions to a DataFrame for IAA calculation
    
    Converts predictions for each of the three models studied
    in the paper (IterX, TempGen, and GTT)
    
    :param preds: the model predictions
    :param model_name: the name of the model whose predictions these are
    :returns: a Pandas DataFrame to be used to compute template count agreement,
        formatted the same way as the output of muc_data_to_df (see above).
    """
    all_template_counts = defaultdict(list)
    doc_ids = []
    
    # Collect predicted template counts for each document
    sorted_preds = sorted(preds.items())
    for doc_id, entry in sorted_preds:
        predicted_templates = entry["pred_templates"]
        formatted_doc_id = f"TST{doc_id[0]}-MUC4-{doc_id[1:]}"
        doc_ids.append(formatted_doc_id)
        doc_template_counts = Counter([t["incident_type"] for t in predicted_templates])
        total = 0
        for template_type in MUC_TEMPLATE_TYPES:
            type_count = doc_template_counts.get(template_type, 0)
            total += type_count
            all_template_counts[template_type].append(type_count)
        all_template_counts["total"].append(total)
        
    return pd.DataFrame({"doc_id": doc_ids, "annotator_id": model_name} | all_template_counts)

In [20]:
# Load the gold MUC-4 train and test data
with open(MUC_TRAIN_DATA) as f:
    gold_muc_data = {}
    for line in f:
        example = json.loads(line)
        gold_muc_data[example['docid']] = example
with open(MUC_TEST_DATA) as f:
    gold_muc_test_data = {}
    for line in f:
        example = json.loads(line)
        gold_muc_test_data[example['docid']] = example
        
# Load the human-annotated template counts
with open(A1_MUC_ANNOTATIONS) as f:
    a1_muc_data = {ex['docid']: ex for ex in json.load(f)}
    validate_muc_annotation(a1_muc_data)
with open(A2_MUC_ANNOTATIONS) as f:
    a2_muc_data = {ex['docid']: ex for ex in json.load(f)}
    validate_muc_annotation(a2_muc_data)
with open(A3_MUC_ANNOTATIONS) as f:
    a3_muc_data = {ex['docid']: ex for ex in json.load(f)}
    validate_muc_annotation(a3_muc_data)
    
# Load the template predictions for each model
with open(ITERX_MUC_PREDICTIONS) as f:
    iterx_muc_preds = json.load(f)
with open(GTT_MUC_PREDICTIONS) as f:
    gtt_muc_preds = json.load(f)
with open(TEMPGEN_MUC_PREDICTIONS) as f:
    tempgen_muc_preds = json.load(f)

No comments provided for document DEV-MUC3-0434!
No comments provided for document DEV-MUC3-0461!
No comments provided for document DEV-MUC3-0618!
No comments provided for document DEV-MUC3-0635!
No comments provided for document DEV-MUC3-0663!
No comments provided for document DEV-MUC3-1153!


In [21]:
# Construct annotation DataFrames for each annotator
a1_muc_df = annotations_to_df(a1_muc_data, "a1", "muc")
a2_muc_df = annotations_to_df(a2_muc_data, "a2", "muc")
a3_muc_df = annotations_to_df(a2_muc_data, "a3", "muc")

# Convert the gold MUC-4 data to a DataFrame, formatted
# the same way as the annotator DataFrames
muc_data_df = muc_data_to_df(gold_muc_data, set(a1_muc_data.keys()))

# Two version of the final DataFrames for the IAA calculation:
# one *with* the gold data and one without it. (We are interested
# in comparing how agreement changes between these two settings.)
muc_iaa_df = pd.concat([muc_data_df, a1_muc_df, a2_muc_df, a3_muc_df]).reset_index()
muc_annotator_only_df = pd.concat([a1_muc_df, a2_muc_df, a3_muc_df]).reset_index()
assert len(muc_iaa_df) == NUM_MUC_DOCS * (NUM_MUC_ANNOTATORS + 1)
assert len(muc_annotator_only_df) == NUM_MUC_DOCS * (NUM_MUC_ANNOTATORS)

Below is the actual agreement calculation for the reannotation study. In all cases, we're considering only a subset of documents on which *at least one* instance of a particular template type is annotated. We manipulate whether agreement is computed (1) *with* the gold data included (vs. among annotators alone), or (2) on *all* documents with at least one template of a given type (vs. only documents with *multiple* instances of that type &mdash; figures not reported in the paper). These results are the basis for the MUC-4 results in Tables 1 and 5 in the paper. The results are written to `muc_agreement.csv` in this directory.

In [14]:
template_types = []
settings = []
nominal_alphas = []
ratio_alphas = []
num_documents = []
num_documents_with_template_type = []
for template_type in MUC_TEMPLATE_TYPES + ["total"]:
    # Identify all documents whose gold annotations have *at least one* template of this type
    gold_docs_with_template = list(muc_iaa_df[(muc_iaa_df[template_type] > 0) & (muc_iaa_df["annotator_id"] == "a0")]["doc_id"])
    muc_iaa_df_with_template = muc_iaa_df[muc_iaa_df["doc_id"].isin(gold_docs_with_template)]
    muc_annotator_only_df_with_template = muc_annotator_only_df[muc_annotator_only_df["doc_id"].isin(gold_docs_with_template)]
    
    # Identify all documents whose gold annotations have multiple templates
    gold_docs_with_multiple_templates = list(muc_iaa_df[(muc_iaa_df[template_type] > 1) & (muc_iaa_df["annotator_id"] == "a0")]["doc_id"])
    muc_iaa_df_with_multiple_templates = muc_iaa_df[muc_iaa_df["doc_id"].isin(gold_docs_with_multiple_templates)]
    muc_annotator_only_df_with_multiple_templates = muc_annotator_only_df[muc_annotator_only_df["doc_id"].isin(gold_docs_with_multiple_templates)]
    
    # The four settings in which we compute agreement
    all_settings = ["at least one template (annotator only)",
                    "at least one template (+gold)",
                    "multiple templates (annotator only)",
                    "multiple templates (+gold)"]
    
    dfs = [muc_annotator_only_df_with_template,
           muc_iaa_df_with_template,
           muc_annotator_only_df_with_multiple_templates,
           muc_iaa_df_with_multiple_templates]
    num_docs_with_template_type = len(gold_docs_with_template)
    
    print(f"Agreement for {template_type} templates")
    print(f"=============={'=' * len(template_type)}==========")
    for setting, df in zip(all_settings, dfs):
        if "+" in setting:
            doc_count = len(df) // (NUM_MUC_ANNOTATORS + 1)
        else:
            doc_count = len(df) // NUM_MUC_ANNOTATORS
        template_types.append(template_type)
        settings.append(setting)
        num_documents.append(doc_count)
        num_documents_with_template_type.append(num_docs_with_template_type)
            
        template_count = df[template_type].sum()
        
        # Ratio alpha for template counts (all documents)
        ratio_alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
            df,
            experiment_col="doc_id",
            annotator_col="annotator_id",
            class_col=template_type,
            metric_fn=RATIO_METRIC_FN
        )
        ratio_alphas.append(ratio_alpha)

        # Nominal alpha for template presence (i.e. is there at least one template of a given type?)
        nominal_alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
            df,
            experiment_col="doc_id",
            annotator_col="annotator_id",
            class_col=template_type,
            metric_fn=simpledorff.metrics.nominal_metric
        )
        nominal_alphas.append(nominal_alpha)
        
        print(f"Setting: {setting}")
        print(f"---------{'-' * len(setting)}")
        print(f"Nominal alpha ({num_docs_with_template_type} documents, {template_count} annotated templates): {round(nominal_alpha, 3)}")
        print(f"Ratio alpha  ({num_docs_with_template_type} documents, {template_count} annotated templates): {round(ratio_alpha, 3)}\n")
    print()

output = pd.DataFrame(
    {"template_type": template_types,
     "setting": settings,
     "total documents": num_documents,
     "total documents with template type": num_documents_with_template_type,
     "nominal alpha": nominal_alphas,
     "ratio alpha": ratio_alphas,
    }
)
with open("./muc_agreement.csv", "w") as f:
    output.to_csv(f)

Agreement for arson templates
Setting: at least one template (annotator only)
-----------------------------------------------
Nominal alpha (9 documents, 36 annotated templates): 0.581
Ratio alpha  (9 documents, 36 annotated templates): 0.484

Setting: at least one template (+gold)
--------------------------------------
Nominal alpha (9 documents, 46 annotated templates): 0.352
Ratio alpha  (9 documents, 46 annotated templates): 0.354

Setting: multiple templates (annotator only)
--------------------------------------------
Nominal alpha (9 documents, 9 annotated templates): 0.0
Ratio alpha  (9 documents, 9 annotated templates): 0.0

Setting: multiple templates (+gold)
-----------------------------------
Nominal alpha (9 documents, 11 annotated templates): 0.0
Ratio alpha  (9 documents, 11 annotated templates): 0.0


Agreement for attack templates
Setting: at least one template (annotator only)
-----------------------------------------------
Nominal alpha (10 documents, 51 annotated te

The remaining cells compute template count agreement between the predictions of three template filling models (IterX, GTT, and TempGen) and the gold data. These results are the basis for the MUC-4 results in Table 6 in the paper.

In [15]:
muc_test_data_df = muc_data_to_df(gold_muc_test_data)

muc_test_iterx_df = muc_predictions_to_df(iterx_muc_preds, "iterx")
muc_test_gtt_df = muc_predictions_to_df(gtt_muc_preds, "gtt")
muc_test_tempgen_df = muc_predictions_to_df(tempgen_muc_preds, "tempgen")

iterx_muc_iaa_df = pd.concat([muc_test_iterx_df, muc_test_data_df])
gtt_muc_iaa_df = pd.concat([muc_test_gtt_df, muc_test_data_df])
tempgen_muc_iaa_df = pd.concat([muc_test_tempgen_df, muc_test_data_df])

In [16]:
models = ["iterx", "gtt", "tempgen"]
prediction_dfs = [iterx_muc_iaa_df, gtt_muc_iaa_df, tempgen_muc_iaa_df]
for model, prediction_df in zip(models, prediction_dfs):
    print(model)
    print("=" * 80)
    for template_type in MUC_TEMPLATE_TYPES + ["total"]:
        # Identify all documents whose gold annotations have at least one template of this type
        gold_docs_with_template = list(prediction_df[(prediction_df[template_type] > 0) & (prediction_df["annotator_id"] == "a0")]["doc_id"])
        prediction_df_with_template = prediction_df[prediction_df["doc_id"].isin(gold_docs_with_template)]
        # Identify all documents whose gold annotations have multiple templates
        gold_docs_with_multiple_templates = list(prediction_df[(prediction_df[template_type] > 1) & (prediction_df["annotator_id"] == "a0")]["doc_id"])
        prediction_df_with_multiple_templates = prediction_df[prediction_df["doc_id"].isin(gold_docs_with_multiple_templates)]
        all_settings = ["at least one template",
                        "multiple templates"]
        dfs = [prediction_df_with_template, prediction_df_with_multiple_templates]
        num_docs_with_template_type = len(gold_docs_with_template)

        print(f"Agreement for {template_type} templates")
        print(f"=============={'=' * len(template_type)}==========")
        for setting, df in zip(all_settings, dfs):
            doc_count = len(df) // 2

            template_count = df[template_type].sum()
            # Alpha for template counts (all documents)
            ratio_alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
                df,
                experiment_col="doc_id",
                annotator_col="annotator_id",
                class_col=template_type,
                metric_fn=RATIO_METRIC_FN
            )
#             ratio_alphas.append(ratio_alpha)

            # Alpha for template presence (i.e. is there at least one
            # template of a given type?)
            nominal_alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
                df,
                experiment_col="doc_id",
                annotator_col="annotator_id",
                class_col=template_type,
                metric_fn=simpledorff.metrics.nominal_metric
            )
#             nominal_alphas.append(nominal_alpha)

            print(f"Setting: {setting}")
            print(f"---------{'-' * len(setting)}")
            print(f"Nominal alpha ({num_docs_with_template_type} documents, {template_count} annotated templates): {round(nominal_alpha, 3)}")
            print(f"Ratio alpha  ({num_docs_with_template_type} documents, {template_count} annotated templates): {round(ratio_alpha, 3)}\n")
        print()

iterx
Agreement for arson templates
Setting: at least one template
------------------------------
Nominal alpha (3 documents, 4 annotated templates): -0.25
Ratio alpha  (3 documents, 4 annotated templates): -0.25

Setting: multiple templates
---------------------------
Nominal alpha (3 documents, 0 annotated templates): 1.0
Ratio alpha  (3 documents, 0 annotated templates): 1.0


Agreement for attack templates
Setting: at least one template
------------------------------
Nominal alpha (87 documents, 202 annotated templates): 0.082
Ratio alpha  (87 documents, 202 annotated templates): -0.092

Setting: multiple templates
---------------------------
Nominal alpha (87 documents, 99 annotated templates): -0.172
Ratio alpha  (87 documents, 99 annotated templates): -0.144


Agreement for bombing templates
Setting: at least one template
------------------------------
Nominal alpha (45 documents, 98 annotated templates): 0.031
Ratio alpha  (45 documents, 98 annotated templates): -0.055

Setting