Imports & Data Loading

In [None]:
#Imports
import sys
import os
import pandas as pd

# The helper module is imported from a ``utils`` folder located one level
# above the current working directory; resolve that folder to an absolute path.
utils_path = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'utils')
)

# Put the folder at the front of the import search path, but only when it is
# not already listed, so re-running this cell does not add duplicate entries.
if utils_path not in sys.path:
    sys.path.insert(0, utils_path)

from create_training_dataset_utils import (load_medcat_export,
    get_validated_entities, get_validated_dates, get_validated_relations, get_validated_relative_dates,
    doc_to_entities_json, doc_to_dates_json, doc_to_relations_json, doc_to_relations_value_json, doc_to_relative_dates_json,
    id2value_from_items, make_row)

In [None]:
# Define CUIs for absolute and relative dates.
# These should align with the CUIs that were used to add these terms in MedCAT Trainer.
# They are passed to the get_validated_* helpers in the cells below.
DATE_CUI: str = "410671006"  # CUI for absolute dates
RELATIVE_DATE_CUI: str = "118578006"  # CUI for relative dates

In [None]:
# Load the MedCAT Trainer export.
# The path is relative, so it is resolved against the notebook's working directory.
export_path: str = "../data/MedCAT_Export_NPH.json"
# NOTE(review): the result is later accessed with .get("projects", []), so it is
# presumably a dict parsed from the JSON file — confirm against load_medcat_export.
export = load_medcat_export(export_path)

Explore Data

In [None]:
# Look at the raw export (uncomment the line below to display it)
#export

In [None]:
# Flatten documents: gather the documents of every project into one list.
# A missing "projects" or "documents" key is treated as an empty list.
docs = []
for project in export.get("projects", []):
    docs.extend(project.get("documents", []))

# Total number of documents (displayed as the cell output)
len(docs)

In [None]:
# Pick a document to test the extraction helpers on.
# The first document is used; this raises IndexError if `docs` is empty.
doc = docs[0]
# Uncomment the line below to display the raw document:
#doc

In [None]:
# Get validated clinical entities, absolute dates, relative dates and relations
# for the test document. The CUIs are passed by keyword so that each helper
# receives the identifier intended for that parameter.
ents = get_validated_entities(doc, date_cui=DATE_CUI)
dates = get_validated_dates(doc, date_cui=DATE_CUI)
relative_dates = get_validated_relative_dates(doc, relative_date_cui=RELATIVE_DATE_CUI)
relations = get_validated_relations(doc, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI)

# Report how many items of each kind were extracted from the document
print("Clinical entities:", len(ents))
print("Absolute dates:", len(dates))
print("Relative dates:", len(relative_dates))
print("Relations:", len(relations))

In [None]:
# Look at the first 5 items of each collection (fewer if it holds less than 5).
# The resulting tuple is displayed as the cell output.
ents[:5], dates[:5], relative_dates[:5], relations[:5]

In [None]:
# Make relations human-readable: build an ID -> value lookup from the validated
# items, then replace each relation's date/entity IDs with the looked-up values.
id2value = id2value_from_items(ents, dates, relative_dates)

# .get() yields None for an ID that is not present in the lookup.
readable_relations = [
    dict(
        date_value=id2value.get(relation["date_id"]),
        entity_value=id2value.get(relation["entity_id"]),
    )
    for relation in relations
]
readable_relations

In [None]:
# Run across all documents and summarize the number of validated items per document.
# The column names are fixed up front so that an export with no documents still
# yields an (empty) table with a "doc_id" column instead of failing in sort_values.
summary_columns = ["doc_id", "n_entities", "n_dates", "n_rel_dates", "n_relations"]

# CUIs are passed by keyword (as in the single-document cell) so the absolute
# and relative date identifiers cannot be bound to the wrong parameter.
summary = [
    {
        "doc_id": d.get("id"),
        "n_entities": len(get_validated_entities(d, date_cui=DATE_CUI)),
        "n_dates": len(get_validated_dates(d, date_cui=DATE_CUI)),
        "n_rel_dates": len(get_validated_relative_dates(d, relative_date_cui=RELATIVE_DATE_CUI)),
        "n_relations": len(
            get_validated_relations(d, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI)
        ),
    }
    for d in docs
]

# One row per document, ordered by document id, with a fresh 0..n-1 index
pd.DataFrame(summary, columns=summary_columns).sort_values("doc_id").reset_index(drop=True)

Create Dataset

In [None]:
# Extract validated entities, dates, relative dates and relations from all docs
# in the dataset and collect one row per document.
def _build_row(document):
    """Build one training-dataset row from a single exported document.

    The per-document work lives in a function so that its intermediate values
    stay local and do not overwrite the exploration variables of the same
    names (``doc``, ``ents``, ``dates``, ...) defined in earlier cells.

    Args:
        document: One document taken from ``docs``; its ``"id"`` and
            ``"text"`` keys are read with ``.get``.

    Returns:
        Whatever ``make_row`` returns for this document.
    """
    # Extract entities, dates, relative dates and relations.
    # CUIs are passed by keyword so they cannot be bound to the wrong parameter.
    ents = get_validated_entities(document, date_cui=DATE_CUI)
    dates = get_validated_dates(document, date_cui=DATE_CUI)
    relative_dates = get_validated_relative_dates(document, relative_date_cui=RELATIVE_DATE_CUI)
    relations = get_validated_relations(
        document, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI
    )

    # ID -> value mapping built from the validated items; used to serialize
    # relations by value.
    id2value = id2value_from_items(ents, dates, relative_dates)

    # Serialize each field and assemble the row
    return make_row(
        doc_id=document.get("id"),
        note_text=document.get("text", ""),  # a missing text becomes an empty string
        entities_json=doc_to_entities_json(ents),
        dates_json=doc_to_dates_json(dates),
        relative_dates_json=doc_to_relative_dates_json(relative_dates),
        relations_json=doc_to_relations_value_json(relations, id2value),
    )


rows = [_build_row(document) for document in docs]

In [None]:
# Create the dataframe from the collected rows, with an explicit column order
dataset_columns = [
    "doc_id",
    "note_text",
    "entities_json",
    "dates_json",
    "relative_dates_json",
    "relations_json",
]
df = pd.DataFrame(rows, columns=dataset_columns)
df

In [None]:
# Save the dataset as a CSV file.
# The path is relative to the notebook's working directory; index=False keeps
# the DataFrame index out of the file.
out_path: str = "../data/training_dataset.csv"
df.to_csv(out_path, index=False)