# MUC-4 Document Selection and Analysis

This notebook contains the code used to select documents for the MUC-4 template reannotation study (see `reannotation_study/`), as well as:

1. Some exploratory analysis on the MUC-4 data
2. Code for generating the "collapsed" template predictions for IterX, GTT, and TempGen (included in `model_predictions/`)
3. Some analysis of the template predictions for IterX, GTT, and TempGen, which is featured in Table 2 of the paper.

In [14]:
from collections import Counter, defaultdict
from copy import deepcopy
from typing import Any, Dict, List, Union

import json
import numpy as np
import random

# The data directory (NOTE: you must unzip the muc.zip
# archive in this directory before running the rest of
# this notebook.)
DATA_DIR = "data/"

# Holds information about dates the articles were written,
# which is important information for annotation
MUC_TRAIN_DATE_FILE = DATA_DIR + "doc_train"

# Train, dev, and test splits
MUC_TRAIN = DATA_DIR + "train.jsonl"
MUC_DEV = DATA_DIR + "dev.jsonl"
MUC_TEST = DATA_DIR + "test.jsonl"

# MUC-4 ontology info
MUC_TEMPLATE_TYPES = {"arson", "attack", "bombing", "forced work stoppage", "kidnapping", "robbery"}
MUC_SLOT_TYPES = {"PerpInd", "PerpOrg", "Target", "Victim", "Weapon"}
MUC_SLOT_TYPE_LOWER_TO_UPPER = {"perpind": "PerpInd",
                                "perporg": "PerpOrg",
                                "target": "Target",
                                "victim": "Victim",
                                "weapon": "Weapon",
                                "incident_type": "incident_type"}

# Template filling model predictions for MUC-4
PREDICTIONS_DIR = "model_predictions/"
ITERX_PREDS = PREDICTIONS_DIR + "iterx_preds.json"
GTT_PREDS = PREDICTIONS_DIR + "gtt_preds.json"
TEMPGEN_PREDS = PREDICTIONS_DIR + "tempgen_preds.json"

The cell below loads the MUC-4 data and the model predictions from the above files

In [15]:
#
# Load the MUC-4 data
#
muc_train = []
muc_dev = []
muc_test = []
with open(MUC_TRAIN) as f, open(MUC_TRAIN_DATE_FILE) as g:
    for f_line, g_line in zip(f,g):
        muc_train_doc = json.loads(f_line)
        muc_train_doc["dateline"] = json.loads(g_line)["dateline"]
        muc_train.append(muc_train_doc)
with open(MUC_DEV) as f:
    for line in f:
        muc_dev.append(json.loads(line))
with open(MUC_TEST) as f:
    for line in f:
        muc_test.append(json.loads(line))

#
# Load the IterX, GTT, and TempGen MUC-4 predictions
# into a standardized format (the format used by GTT)
# 
gtt_preds = None
with open(GTT_PREDS) as f:
    gtt_preds = json.load(f)
with open(TEMPGEN_PREDS) as f:
    tempgen_preds = json.load(f)
    
def convert_doc_id(doc_id: str) -> str:
    return f"{doc_id[3]}{doc_id[-4:]}"

# Load IterX predictions into the same format as the GTT ones
iterx_preds = deepcopy(gtt_preds)
for k,v in iterx_preds.items():
    v['pred_templates'] = []
with open(ITERX_PREDS) as f:
    for line in f:
        entry = json.loads(line)
        has_attack_template = False
        for k,v in entry.items():
            for temp in v:
                converted_temp = {k: temp.get(k, []) for k in MUC_SLOT_TYPES}
                converted_temp['incident_type'] = temp['incident_type']
                iterx_preds[convert_doc_id(k)]['pred_templates'].append(converted_temp)

# Load TempGen predictions into the same format as the GTT ones
new_tempgen_preds = defaultdict(dict)
# Add gold templates
for k,v in tempgen_preds['gold_templates'].items():
    new_tempgen_preds[k]['gold_templates'] = []
    for template in v.values():
        formatted_template = {MUC_SLOT_TYPE_LOWER_TO_UPPER[k]: v for k,v in template.items()}
        new_tempgen_preds[k]['gold_templates'].append(formatted_template)
# Add predicted templates
for k,v in tempgen_preds['pred_templates'].items():
    new_tempgen_preds[k]['pred_templates'] = []
    for template in v:
        formatted_template = dict()
        formatted_template['incident_type'] = template['incident_type']
        for slot_type in MUC_SLOT_TYPES:
            if slot_type.lower() in template:
                formatted_template[slot_type] = template[slot_type.lower()]
            else:
                formatted_template[slot_type] = []
        new_tempgen_preds[k]['pred_templates'].append(formatted_template)
tempgen_preds = new_tempgen_preds

To make the reannotation study as simple as possible, we focused on documents where the gold annotations contained templates of only a single type. The cell below identifies all such documents in the train split.

In [3]:
single_subject_docs_by_type_and_count = defaultdict(lambda: defaultdict(set))
for doc in muc_train:
    template_counts = Counter([t["incident_type"] for t in doc["templates"]])
    if len(template_counts) == 1:
        template_type, count = list(template_counts.items())[0]
        single_subject_docs_by_type_and_count[template_type][count].add(doc["docid"])

Next, we sample the actual documents to be annotated. Our goal was to obtain 10 documents per template type, but as we note in the paper, the imposition of the single template type restriction mentioned above meant that we ended up with fewer than 10 documents for three of the template types (indeed, there are under 10 documents for the `forced work stoppage` and `robbery` types in the train split to begin with).

In [17]:
seed = 1337
random.seed(seed)
all_sample_docs = []
for template_type in MUC_TEMPLATE_TYPES:
    num_samples = 0
    while num_samples < 10:
        for count, docs in single_subject_docs_by_type_and_count[template_type].items():
            if not docs:
                continue
            sorted_docs = sorted(docs)
            chosen_doc = random.choice(sorted_docs)
            all_sample_docs.append(chosen_doc)
            num_samples += 1
            docs.remove(chosen_doc)
            if num_samples == 10:
                break
        if all([len(docs) == 0 for docs in single_subject_docs_by_type_and_count[template_type].values()]):
            print(f"Found only {num_samples} samples for template type {template_type}")
            break

Found only 0 samples for template type forced work stoppage
Found only 0 samples for template type robbery
Found only 0 samples for template type arson


The following cell just takes the documents sampled above and generates the JSON files on which annotators performed their annotations for the reannotation study.

In [18]:
seed = 1337
random.seed(seed)
OUTPUT_FILE = "muc_sample.json"
WRITE_TO_FILE = False # you can set this to true to actual write the raw annotation file

def doc_to_annotation_item(doc: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "docid": doc["docid"],
        "dateline": doc["dateline"],
        "doctext": doc["doctext"],
        "arson": 0,
        "attack": 0,
        "bombing": 0,
        "forced work stoppage": 0,
        "kidnapping": 0,
        "robbery": 0,
        "comments": ""
    }
sample_docs = list(map(doc_to_annotation_item, [d for d in muc_train if d["docid"] in all_sample_docs]))

if WRITE_TO_FILE:
    with open(OUTPUT_FILE, "w") as f:
        json.dump(sample_docs, f, indent=2)

The following two cells just print various statistics about the MUC-4 train split. These are not essential to any of the analysis in the paper, but may be of interest.

In [20]:
# Documents with multiple instances of the same template type
multi_template_muc_train_docs = []
all_multi_instance_counts = []
multi_instance_docs_by_template_type = defaultdict(int)
has_template_docs_by_template_type = defaultdict(int)
has_only_template_docs_by_template_type = defaultdict(int)
num_docs_with_templates = 0
num_empty_templates = 0
total_templates = 0
empty_templates = 0
empty_templates_by_type = defaultdict(int)
template_counts = defaultdict(lambda: defaultdict(int))
template_counts_by_type = defaultdict(int)
for doc in muc_train:
    doc_id = doc["docid"]
    if not doc["templates"]:
        continue
    else:
        num_docs_with_templates += 1
    total_templates_for_doc = 0
    for t in doc["templates"]:
        template_counts[doc_id][t["incident_type"]] += 1
        template_counts_by_type[t["incident_type"]] += 1
        total_templates += 1
        total_templates_for_doc += 1
        has_filler = False
        for slot, fillers in t.items():
            if slot == "incident_type":
                continue
            elif fillers:
                has_filler = True
        if not has_filler:
            empty_templates += 1
            empty_templates_by_type[t["incident_type"]] += 1
    multi_instance_counts = []
    for template_type, count in template_counts[doc_id].items():
        if count == total_templates_for_doc:
            has_only_template_docs_by_template_type[template_type] += 1
        if count > 0:
            has_template_docs_by_template_type[template_type] += 1
        if count > 1:
            multi_instance_counts.append(count)
            multi_instance_docs_by_template_type[template_type] += 1
    if multi_instance_counts:
        multi_template_muc_train_docs.append(doc)
        all_multi_instance_counts.extend(multi_instance_counts)

print(f"{num_docs_with_templates} of {len(muc_train)} documents have at least one template instance")
for template_type, count in sorted(has_template_docs_by_template_type.items(), key=lambda x: x[1], reverse=True):
    print(f"  {count} documents have at least one {template_type} template")
for template_type, count in sorted(has_only_template_docs_by_template_type.items(), key=lambda x: x[1], reverse=True):
    print(f"  {count} documents have only {template_type} templates")
print(f"{total_templates} template instances in total")
for template_type, count in sorted(template_counts_by_type.items(), key=lambda x: x[1], reverse=True):
    print(f"  {count} of these are {template_type} templates")
print(f"{empty_templates} annotated template instances have no fillers at all")
for template_type, count in sorted(empty_templates_by_type.items(), key=lambda x: x[1], reverse=True):
    print(f"  {count} of these are {template_type} templates")
print(f"{len(multi_template_muc_train_docs)} documents have multiple instances of at least one template type")
for template_type, count in sorted(multi_instance_docs_by_template_type.items(), key=lambda x: x[1], reverse=True):
    print(f"  {count} document(s) have multiple {template_type} template instances")
print(f"Average {np.round(np.mean(all_multi_instance_counts), 3)} template instances per type across multi-instance cases")
print(f"Average {np.round(total_templates / len(muc_train), 3)} template instances per document")

700 of 1300 documents have at least one template instance
  479 documents have at least one attack template
  208 documents have at least one bombing template
  83 documents have at least one kidnapping template
  40 documents have at least one arson template
  14 documents have at least one robbery template
  4 documents have at least one forced work stoppage template
  383 documents have only attack templates
  140 documents have only bombing templates
  54 documents have only kidnapping templates
  9 documents have only arson templates
  3 documents have only robbery templates
1114 template instances in total
  636 of these are attack templates
  298 of these are bombing templates
  114 of these are kidnapping templates
  43 of these are arson templates
  17 of these are robbery templates
  6 of these are forced work stoppage templates
53 annotated template instances have no fillers at all
  49 of these are attack templates
  4 of these are kidnapping templates
173 documents have mu

In [21]:
max_template_type_counts = defaultdict(int)
for doc, counts in template_counts.items():
    for template_type, count in counts.items():
        max_count = max_template_type_counts[template_type]
        max_template_type_counts[template_type] = max(max_count, count)

# the maximum number of instances of each template type
# occurring in a single document
print(max_template_type_counts)

defaultdict(<class 'int'>, {'kidnapping': 14, 'attack': 6, 'bombing': 5, 'robbery': 4, 'arson': 2, 'forced work stoppage': 3})


The following cell just pretty-prints all documents that have multiple templates of the same type, along with their annotations. Again, this is mostly exploratory code &mdash; meant to give a sense of what such documents look like. However, it was used to help identify the additional examples of "challenging" documents included in Appendix C of the paper.

In [8]:
for doc in multi_template_muc_train_docs:
    print(f"Doc ID: {doc['docid']}")
    print("Text:")
    print(f"  {doc['doctext']}")
    for template in sorted(doc["templates"], key=lambda x: x['incident_type']):

        print('---')
        for slot, values in template.items():
            if slot == "incident_type":
                print(f"  Type: {template['incident_type']}")
            else:
                print(f"  {slot}:")
                print(f"    {', '.join([s[0] for ss in values for s in ss])}")
    print("=" * 80)

Doc ID: DEV-MUC3-0018
Text:
  official sources have reported that several guerrilla attacks and heavy fighting took place the evening of 9 january and this morning throughout the country, and as a result, three soldiers were killed and three others injured.    alleged guerrilla urban commandos launched two highpower bombs against a car dealership in downtown san salvador this morning.  a police report said that the attack set the building on fire, but did not result in any casualties although economic losses are heavy.    during the evening of 9 january, guerrilla urban commandos bombed two electricity facilities in different places in san salvador, which caused power outages in some areas of the capital.    meanwhile, the armed forces press committee (coprefa) reported today that three army soldiers were killed recently in clashes against members of the farabundo marti national liberation front (fmln) in different parts of the central and eastern regions of the country.    the war bul

  Target:
    hotel, the sheraton hotel
  Victim:
    joao baena soares
  Weapon:
    
---
  Type: attack
  PerpInd:
    
  PerpOrg:
    
  Target:
    
  Victim:
    ignacio ellacuria, ignacio martin baro, segundo montes, amando lopez, juan ramon moreno, joaquin lopez
  Weapon:
    
Doc ID: DEV-MUC3-0932
Text:
  colombian judge bernardo jaramillo uribe was assassinated at noon today in medellin, located 500 km northwest of bogota, it was reported today by the national police.    this is the fourth judge riddled with bullets by unidentified gunmen -- apparently paid assassins at the service of the drug trafficking mafias -- in medellin within 5 months. judge maria helena diaz was assassinated on 28 july; medellin higher court magistrate hector jimenez was killed on 17 october; and magistrate mariela espinoza was killed on 1 november.    jaramillo uribe worked as judge of the 13th criminal court in medellin, the headquarters of the medellin drug cartel.    a police report indicates that

Next, we turn to model predictions. The following two cells compare how the model-predicted template counts compare to gold template counts document-by-document. These results are printed in the second cell and are used to populate the first two rows of Table 2 in the paper.

In [24]:
# GTT
gtt_gold_vs_pred_template_counts = defaultdict(list)
gtt_all_pred_counts = []
gtt_all_gold_counts = []
for doc_id, entry in gtt_preds.items():
    gold_template_type_counts = Counter([t['incident_type'] for t in entry['gold_templates']])
    pred_template_type_counts = Counter([t['incident_type'] for t in entry['pred_templates']])
    gtt_all_gold_counts.append(len(entry['gold_templates']))
    gtt_all_pred_counts.append(len(entry['pred_templates']))
    for gold_template_type, gold_count in gold_template_type_counts.items():
        if gold_template_type in pred_template_type_counts:
            pred_count = pred_template_type_counts[gold_template_type]
            pred_template_type_counts.pop(gold_template_type)
        else:
            pred_count = 0
        gtt_gold_vs_pred_template_counts[gold_template_type].append((gold_count, pred_count))
    for pred_template_type, pred_count in pred_template_type_counts.items():
        gtt_gold_vs_pred_template_counts[pred_template_type].append((0, pred_count))
for template_type, gold_vs_pred in gtt_gold_vs_pred_template_counts.items():
    gtt_gold_vs_pred_template_counts[template_type] = Counter(gold_vs_pred)
    
# IterX
iterx_gold_vs_pred_template_counts = defaultdict(list)
iterx_all_pred_counts = []
iterx_all_gold_counts = []
for doc_id, entry in iterx_preds.items():
    gold_template_type_counts = Counter([t['incident_type'] for t in entry['gold_templates']])
    pred_template_type_counts = Counter([t['incident_type'] for t in entry['pred_templates']])
    iterx_all_pred_counts.append(len(entry['pred_templates']))
    iterx_all_gold_counts.append(len(entry['gold_templates']))
    for gold_template_type, gold_count in gold_template_type_counts.items():
        if gold_template_type in pred_template_type_counts:
            pred_count = pred_template_type_counts[gold_template_type]
            pred_template_type_counts.pop(gold_template_type)
        else:
            pred_count = 0
        iterx_gold_vs_pred_template_counts[gold_template_type].append((gold_count, pred_count))
    for pred_template_type, pred_count in pred_template_type_counts.items():
        iterx_gold_vs_pred_template_counts[pred_template_type].append((0, pred_count))
for template_type, gold_vs_pred in iterx_gold_vs_pred_template_counts.items():
    iterx_gold_vs_pred_template_counts[template_type] = Counter(gold_vs_pred)
    
# TempGen
tempgen_gold_vs_pred_template_counts = defaultdict(list)
tempgen_all_pred_counts = []
tempgen_all_gold_counts = []
for doc_id, entry in tempgen_preds.items():
    gold_template_type_counts = Counter([t['incident_type'] for t in entry['gold_templates']])
    pred_template_type_counts = Counter([t['incident_type'] for t in entry['pred_templates']])
    tempgen_all_pred_counts.append(len(entry['pred_templates']))
    tempgen_all_gold_counts.append(len(entry['gold_templates']))
    for gold_template_type, gold_count in gold_template_type_counts.items():
        if gold_template_type in pred_template_type_counts:
            pred_count = pred_template_type_counts[gold_template_type]
            pred_template_type_counts.pop(gold_template_type)
        else:
            pred_count = 0
        tempgen_gold_vs_pred_template_counts[gold_template_type].append((gold_count, pred_count))
    for pred_template_type, pred_count in pred_template_type_counts.items():
        tempgen_gold_vs_pred_template_counts[pred_template_type].append((0, pred_count))
        
for template_type, gold_vs_pred in tempgen_gold_vs_pred_template_counts.items():
    tempgen_gold_vs_pred_template_counts[template_type] = Counter(gold_vs_pred)

In [25]:
all_gold_vs_pred_template_counts = {
    "gtt": gtt_gold_vs_pred_template_counts,
    "iterx": iterx_gold_vs_pred_template_counts,
    "tempgen": tempgen_gold_vs_pred_template_counts
}
all_pred_counts = {
    "gtt": gtt_all_pred_counts,
    "iterx": iterx_all_pred_counts,
    "tempgen": tempgen_all_pred_counts
}

all_gold_counts = {
    "gtt": gtt_all_gold_counts,
    "iterx": iterx_all_gold_counts,
    "tempgen": tempgen_all_gold_counts
}

for model, gold_vs_pred_template_counts in all_gold_vs_pred_template_counts.items():
    # The total number of documents
    total_docs = 0
    # For how many documents were there more gold templates than predicted templates?
    more_gold_than_pred = 0
    # For how many documents were there *at least as many* gold templates as predicted ones?
    at_least_as_many_gold_as_pred = 0
    # For how many documents was the total number of predicted templates incorrect?
    incorrect_template_number = 0
    for template_type, counts in gold_vs_pred_template_counts.items():
        for (num_gold, num_pred), count in counts.items():
            total_docs += count
            if num_gold == num_pred:
                at_least_as_many_gold_as_pred += count
            elif num_gold > num_pred:
                more_gold_than_pred += count
                at_least_as_many_gold_as_pred += count
                incorrect_template_number += count
            else:
                incorrect_template_number += count
    header = f"Model: {model}"
    print(header)
    print("=" * len(header))
    print(f"Fraction of documents with more gold templates than predicted templates: {np.mean([int(num_gold > num_pred) for (num_gold, num_pred) in zip(all_gold_counts[model], all_pred_counts[model]) if num_gold != num_pred])}")
    print(f"Fraction of documents with at least as many gold templates as predicted ones: {np.round(at_least_as_many_gold_as_pred / total_docs, 3)}")
    print(f"Fraction of incorrect predictions with more gold than predicted templates: {np.round(more_gold_than_pred / incorrect_template_number, 3)} ({more_gold_than_pred} / {incorrect_template_number})")
    print(f"Fraction of documents where the total number of predicted templates is incorrect: {np.mean([int(num_gold != num_pred) for (num_gold, num_pred) in zip(all_gold_counts[model], all_pred_counts[model])])}")

Model: gtt
Fraction of documents with more gold templates than predicted templates: 0.8125
Fraction of documents with at least as many gold templates as predicted ones: 0.84
Fraction of incorrect predictions with more gold than predicted templates: 0.735 (72 / 98)
Fraction of documents where the total number of predicted templates is incorrect: 0.32
Model: iterx
Fraction of documents with more gold templates than predicted templates: 0.7246376811594203
Fraction of documents with at least as many gold templates as predicted ones: 0.831
Fraction of incorrect predictions with more gold than predicted templates: 0.706 (72 / 102)
Fraction of documents where the total number of predicted templates is incorrect: 0.345
Model: tempgen
Fraction of documents with more gold templates than predicted templates: 0.8
Fraction of documents with at least as many gold templates as predicted ones: 0.872
Fraction of incorrect predictions with more gold than predicted templates: 0.771 (74 / 96)
Fraction of 

The next cell generates the "collapsed" versions of the model predictions in the paper, where we merge all predicted templates of the same type into a single aggregate template of that type, taking a union over the slot fillers of all predicted templates of that type.

In [26]:
def collapse_muc_templates(t1: Dict[str, List[str]], t2: Dict[str, List[str]]) -> Dict[str, List[str]]:
    assert t1['incident_type'] == t2['incident_type']
    assert t1.keys() == t2.keys()
    collapsed_template = dict()
    for k, v in t1.items():
        if k == 'incident_type':
            collapsed_template[k] = v
        else:
            collapsed_template[k] = []
            if t1[k] == t2[k]:
                collapsed_template[k] = t1[k]
            else:
                for filler in t2[k]:
                    if not any(set(filler).issubset(set(f)) for f in t1[k]):
                        collapsed_template[k].append(filler)
                for filler in t1[k]:
                    collapsed_template[k].append(filler)
    return collapsed_template
       
collapsed_gtt_preds = dict()
collapsed_iterx_preds = dict()
collapsed_tempgen_preds = dict()

all_collapsed_preds = [
    collapsed_gtt_preds,
    collapsed_iterx_preds,
    collapsed_tempgen_preds
]

all_preds = [
    gtt_preds,
    iterx_preds,
    tempgen_preds
]

output_files = [
    "model_predictions/gtt_preds_collapsed.json",
    "model_predictions/iterx_preds_collapsed.json",
    "model_predictions/tempgen_preds_collapsed.json"
]

# Set this to true if you want to actually
# dump these files (note that they are already
# included in the repo under model_predictions)
WRITE_TO_FILE = False

for collapsed_preds, original_preds, output_file in zip(all_collapsed_preds, all_preds, output_files):
    for doc_id, entry in original_preds.items():
        old_pred_templates = entry['pred_templates']
        new_pred_templates = dict()
        for pred_template in old_pred_templates:
            incident_type = pred_template['incident_type']
            if incident_type in new_pred_templates:
                existing_values = new_pred_templates[incident_type]
                new_pred_templates[incident_type] = collapse_muc_templates(new_pred_templates[incident_type], pred_template)
            else:
                new_pred_templates[incident_type] = pred_template
        new_entry = entry.copy()
        new_entry['pred_templates'] = list(new_pred_templates.values())
        collapsed_preds[doc_id] = new_entry
        
    if WRITE_TO_FILE:
        with open(output_file, 'w') as f:
            json.dump(collapsed_preds, f)

Here, we generate the "collapsed" version of the gold templates for comparison. CEAF-REE scores between these collapsed templates and the original (uncollapsed) templates are reported in-text in the last paragraph of section 5.

In [28]:
# Set to True only if you actually want to write this file
WRITE_TO_FILE = False

collapsed_gold_preds = deepcopy(collapsed_gtt_preds)
for doc, entry in collapsed_gold_preds.items():
    old_pred_templates = entry['pred_templates'] = deepcopy(entry['gold_templates'])
    new_pred_templates = dict()
    for pred_template in old_pred_templates:
        incident_type = pred_template['incident_type']
        if incident_type in new_pred_templates:
            existing_values = new_pred_templates[incident_type]
            new_pred_templates[incident_type] = collapse_muc_templates(new_pred_templates[incident_type], pred_template)
        else:
            new_pred_templates[incident_type] = pred_template
    collapsed_gold_preds[doc]['pred_templates'] = list(new_pred_templates.values())

if WRITE_TO_FILE:
    with open('model_predictions/collapsed_gold_preds.json', 'w') as f:
        json.dump(collapsed_gold_preds, f)