In [1]:
import json
import random
from pathlib import Path

import pandas as pd

In [2]:
# Locations of the three MUC dataset files, kept in the order test, train, dev.
paths = list(
    map(
        Path,
        (
            "datasets/muc/test.jsonl",
            "datasets/muc/train.jsonl",
            "datasets/muc/dev.jsonl",
        ),
    )
)

In [3]:
# Read every JSONL file in `paths` into one flat list, one parsed JSON object per line.
data = []
for file_path in paths:
    # Decode explicitly instead of relying on the platform default encoding.
    # Assumes the files are UTF-8 (the usual encoding for JSON) - TODO confirm.
    with file_path.open("r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
            if line.strip():
                data.append(json.loads(line))

In [4]:
# Total number of records loaded across all files in `paths`.
len(data)

1700

In [5]:
# Keep only the documents whose "templates" array holds exactly one template.
data_1 = list(filter(lambda doc: len(doc["templates"]) == 1, data))
len(data_1)

620

In [6]:
# Keep only the documents whose "templates" array is empty.
data_2 = list(filter(lambda doc: len(doc["templates"]) == 0, data))
len(data_2)

758

In [7]:
# Number of documents that carry more than one template
# (everything that is in neither data_1 nor data_2).
sum(len(doc["templates"]) > 1 for doc in data)

322

In [8]:
# We want a dataset made of the documents with exactly one template, plus a
# 10% sample of the documents that have no template at all.
NO_TEMPLATE_FRACTION = 0.1  # share of the template-less documents to keep
SAMPLE_SEED = 42  # fixed seed so the sample is reproducible

# A dedicated generator seeded with SAMPLE_SEED produces the same draw as
# seeding the module-level generator, without altering global random state.
rng = random.Random(SAMPLE_SEED)
data_2_sample = rng.sample(data_2, int(len(data_2) * NO_TEMPLATE_FRACTION))

# Single-template documents first, followed by the sampled template-less ones.
data_combined = data_1 + data_2_sample
len(data_combined)

695

In [9]:
# Peek at one record of the combined data to check its structure.
data_combined[1]

{'docid': 'TST3-MUC4-0002',
 'doctext': 'those accused of the assassination of six jesuits will have a "fair trial" and if found guilty, will be punished whether they are civilians, military, or influential people, supreme court president dr mauricio gutierrez castro said.    the technical investigation commission has determined that some military were reportedly involved in the assassination of the six jesuits and their two maids, which took place at daybreak on 16 november, as reported by president alfredo cristiani on 7 january.    "the local and international community can rest assured the salvadoran judiciary system will not hesitate to enforce the law upon the authors of this horrible crime," dr gutierrez pointed out.    gutierrez said he does not know how many people are involved or their military ranks, because the commission to investigate criminal actions is still conducting investigations and has not presented its report to the tribunal.    however, general prosecutor dr mau

In [10]:
def _flatten_template(template):
    """Return a copy of ``template`` whose list-valued slots hold only texts.

    String-valued slots are copied unchanged. Every other slot is treated as a
    list of groups of ``(text, offset)`` items and is collapsed into one flat
    list of the texts; the integer offsets are dropped.
    """
    return {
        key: value
        if isinstance(value, str)
        else [item[0] for group in value for item in group]
        for key, value in template.items()
    }


def _flatten_record(record):
    """Return a new record with the template slots promoted to top-level keys.

    The input dict is never modified, so the raw records stay usable by other
    cells. A record with exactly one template receives that template's
    flattened slots; any other record receives the "none" placeholder slots.
    The "templates" key is absent from the result.
    """
    if "templates" not in record:
        # Already flattened (this cell was run before): return an unchanged copy.
        return dict(record)

    flat = {key: value for key, value in record.items() if key != "templates"}
    templates = record["templates"]
    if len(templates) == 1:
        flat.update(_flatten_template(templates[0]))
    else:
        # Fresh lists per record so no two rows share the same list object.
        flat.update({
            "incident_type": "none",
            "PerpInd": [],
            "PerpOrg": [],
            "Target": [],
            "Victim": [],
            "Weapon": [],
        })
    return flat


# Rebuild the list from new dicts instead of mutating in place: the cell can be
# re-run safely and the name `data` (the raw list) is no longer overwritten by
# a loop variable.
data_combined = [_flatten_record(record) for record in data_combined]

In [11]:
# Spot-check one record after flattening: the template slots are now top-level keys.
data_combined[5]

{'docid': 'TST3-MUC4-0019',
 'doctext': 'oil will not be pumped through the cano limon-covenas pipeline again until 30 may, because it was again blown up today near urum municipality, norte de santander department.',
 'incident_type': 'bombing',
 'PerpInd': [],
 'PerpOrg': [],
 'Target': ['cano limon-covenas pipeline', 'pipeline'],
 'Victim': [],
 'Weapon': []}

In [12]:
# Build a dataframe with one row per record; the record keys become the columns.
df = pd.DataFrame(data_combined)

In [13]:
# Distinct incident labels present in the data, in order of first appearance.
unique_incident_types = df.loc[:, "incident_type"].unique()
unique_incident_types

array(['attack', 'kidnapping', 'bombing', 'attack / bombing', 'robbery',
       'arson', 'bombing / attack', 'none'], dtype=object)

In [14]:
# Collapse the combined label "attack / bombing" into its first component, "attack".
# Assigning through .loc touches only the incident_type cell of the matching rows.
df.loc[df["incident_type"] == "attack / bombing", "incident_type"] = "attack"

In [15]:
# Collapse the combined label "bombing / attack" into its first component, "bombing".
# Assigning through .loc touches only the incident_type cell of the matching rows.
df.loc[df["incident_type"] == "bombing / attack", "incident_type"] = "bombing"

In [16]:
# Preview the first rows after merging the combined incident labels.
df.head()

Unnamed: 0,docid,doctext,incident_type,PerpInd,PerpOrg,Target,Victim,Weapon
0,TST3-MUC4-0001,we are not demanding that they stop their oper...,attack,[],[],[],[],[]
1,TST3-MUC4-0002,those accused of the assassination of six jesu...,attack,"[military, some military, military suspects, a...",[armed forces],[],[],[]
2,TST3-MUC4-0003,the national police reported today that over 1...,attack,[members of the maoist terrorist organization ...,[shining path],[],[enrique lopez albujar trint],[]
3,TST3-MUC4-0005,salvadoran social democratic politician hector...,kidnapping,[heavily armed men],[],[],"[hector oqueli colindres, gilda flores]",[]
4,TST3-MUC4-0011,the dissemination of a document questioning co...,kidnapping,[members of the manuel gustavo chacon sovereig...,"[eln, army of national liberation]",[],[],[]


In [17]:
# column renaming
df = df.rename(columns={
    "incident_type": "incident",
    "PerpInd": "perpetrator",
    "PerpOrg": "group perpetrator",
    "Target": "target",
    "Victim": "victim",
    "Weapon": "weapon"})
df.head()

Unnamed: 0,docid,doctext,incident,perpetrator,group perpetrator,target,victim,weapon
0,TST3-MUC4-0001,we are not demanding that they stop their oper...,attack,[],[],[],[],[]
1,TST3-MUC4-0002,those accused of the assassination of six jesu...,attack,"[military, some military, military suspects, a...",[armed forces],[],[],[]
2,TST3-MUC4-0003,the national police reported today that over 1...,attack,[members of the maoist terrorist organization ...,[shining path],[],[enrique lopez albujar trint],[]
3,TST3-MUC4-0005,salvadoran social democratic politician hector...,kidnapping,[heavily armed men],[],[],"[hector oqueli colindres, gilda flores]",[]
4,TST3-MUC4-0011,the dissemination of a document questioning co...,kidnapping,[members of the manuel gustavo chacon sovereig...,"[eln, army of national liberation]",[],[],[]


In [18]:
# Wrap each incident label in a single-element list so the column holds lists
# like the other slot columns. Values that are already lists are left as they
# are, so re-running this cell does not nest them further.
df["incident"] = df["incident"].apply(lambda x: x if isinstance(x, list) else [x])

In [19]:
# Preview the first rows to confirm `incident` now holds single-element lists.
df.head()

Unnamed: 0,docid,doctext,incident,perpetrator,group perpetrator,target,victim,weapon
0,TST3-MUC4-0001,we are not demanding that they stop their oper...,[attack],[],[],[],[],[]
1,TST3-MUC4-0002,those accused of the assassination of six jesu...,[attack],"[military, some military, military suspects, a...",[armed forces],[],[],[]
2,TST3-MUC4-0003,the national police reported today that over 1...,[attack],[members of the maoist terrorist organization ...,[shining path],[],[enrique lopez albujar trint],[]
3,TST3-MUC4-0005,salvadoran social democratic politician hector...,[kidnapping],[heavily armed men],[],[],"[hector oqueli colindres, gilda flores]",[]
4,TST3-MUC4-0011,the dissemination of a document questioning co...,[kidnapping],[members of the manuel gustavo chacon sovereig...,"[eln, army of national liberation]",[],[],[]


In [20]:
# Persist the prepared dataframe as "muc.parquet", relative to the current working directory.
df.to_parquet("muc.parquet")