In [1]:
#Imports
import sys
import os

# Make the sibling ``utils`` directory (one level above the working directory)
# importable, adding it to the front of the module search path only once.
utils_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "utils"))

if sys.path.count(utils_path) == 0:
    sys.path.insert(0, utils_path)

from medcat_etl import ( load_medcat_export,
    get_validated_entities, get_validated_dates, get_validated_links, get_validated_relative_dates, DATE_CUI, RELATIVE_DATE_CUI,
    doc_to_entities_json, doc_to_dates_json, doc_to_links_json, doc_to_links_value_json, doc_to_relative_dates_json,
    id2value_from_items, make_row
)
import pandas as pd

In [2]:
# Load data: read the MedCAT export (a JSON file that, per its filename, also
# carries the document text). The path is relative to the working directory.
export_path = "../data/MedCAT_Export_With_Text_2025-09-11_08_19_37.json"
export = load_medcat_export(export_path)

In [3]:
# Flatten documents: gather every document of every project into one list.
# A missing "projects" or "documents" key contributes nothing.
docs = [
    document
    for project in export.get("projects", [])
    for document in project.get("documents", [])
]

len(docs)

5

Testing

In [4]:
# Pick a document to test: take the first flattened document and echo the
# whole record (the cell output is the raw dict, including the note text).
doc = docs[0]
doc

{'id': 26461,
 'name': '0',
 'text': "Ultrasound (30nd Jun 2024): no significant findings.imp: asthma\n\nShe denies any nausea, vomiting, or diarrhea.\nC Patient reports compliance with current medication regimen. Basic metabolic panel within normal limits with sodium 140, potassium 4.2, creatinine 0.9.\nPatient is afebrile with normal vital signs. T (02nd Aug 2024): reveals asthma.imp: asthma\n\nX-ray (12nd Sep 2024): shows 3.1cm mass in brain.imp: pituitary_adenoma\n\nCLINIC VISIT (16 Sep'24): nausea/vomiting worsening confirmed rheumatoid_arthritis switch to aspirin\n\nPast medical history is non-contributory.\nURGENT REVIEW (23rd Oct 2024): headache x1 day.r Will order additional laboratory studies at next visit if symptoms persist. Heart: Regular rate and rhythm, no murmurs. Patient has a history of meningitis. GI: Bowel sounds present in all four quadrants.\n Liver function tests show mild elevation in ALT and AST, likely due to medication effect.Chest X-ray reveals clear lung fi

In [5]:
# Pull the validated annotations of the selected document: entities, dates,
# relative dates, and the date-entity links.
ents = get_validated_entities(doc, date_cui=DATE_CUI)
dates = get_validated_dates(doc, date_cui=DATE_CUI)
relative_dates = get_validated_relative_dates(doc, relative_date_cui=RELATIVE_DATE_CUI)
links = get_validated_links(doc, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI)

# Report how many items of each kind were found
print(f"Entities: {len(ents)}")
print(f"Dates: {len(dates)}")
print(f"Relative dates: {len(relative_dates)}")
print(f"Links: {len(links)}")

Entities: 64
Dates: 6
Relative dates: 0
Links: 4


In [6]:
# Peek at the first five entities, dates and links (relative dates are not shown)
tuple(items[:5] for items in (ents, dates, links))

([{'id': 308244,
   'value': 'history of meningitis',
   'cui': '161478002',
   'start': 756,
   'end': 777},
  {'id': 308245,
   'value': 'rheumatoid_arthritis',
   'cui': '69896004',
   'start': 491,
   'end': 511},
  {'id': 308248,
   'value': 'current medication',
   'cui': '513881000000106',
   'start': 145,
   'end': 163},
  {'id': 308249,
   'value': 'normal vital signs',
   'cui': '72970002',
   'start': 289,
   'end': 307},
  {'id': 308252,
   'value': 'pituitary_adenoma',
   'cui': '254956000',
   'start': 410,
   'end': 427}],
 [{'id': 308320, 'value': '30nd Jun 2024', 'start': 12, 'end': 25},
  {'id': 308321, 'value': '12nd Sep 2024', 'start': 363, 'end': 376},
  {'id': 308322, 'value': "16 Sep'24", 'start': 443, 'end': 452},
  {'id': 308323, 'value': '23rd Oct 2024', 'start': 588, 'end': 601},
  {'id': 308324, 'value': '16st Nov 2024', 'start': 1205, 'end': 1218}],
 [{'date_id': 308321, 'entity_id': 308252},
  {'date_id': 308322, 'entity_id': 308245},
  {'date_id': 308323,

In [7]:
# Make links human-readable: resolve each link's IDs to their text values.
# The lookup is built from the validated entities, dates and relative dates.
id2value = id2value_from_items(ents, dates, relative_dates)

readable_links = [
    dict(
        date_id=link["date_id"],
        date_value=id2value.get(link["date_id"]),
        entity_id=link["entity_id"],
        entity_value=id2value.get(link["entity_id"]),
    )
    for link in links
]
readable_links

[{'date_id': 308321,
  'date_value': '12nd Sep 2024',
  'entity_id': 308252,
  'entity_value': 'pituitary_adenoma'},
 {'date_id': 308322,
  'date_value': "16 Sep'24",
  'entity_id': 308245,
  'entity_value': 'rheumatoid_arthritis'},
 {'date_id': 308323,
  'date_value': '23rd Oct 2024',
  'entity_id': 308276,
  'entity_value': 'headache'},
 {'date_id': 308325,
  'date_value': '17.12.24',
  'entity_id': 308313,
  'entity_value': 'GERD'}]

In [8]:
# Run across all documents and summarize counts.
# The CUIs are passed by keyword, as in the single-document check, so the date
# and relative-date CUIs cannot be silently swapped by argument position.
summary = [
    {
        "doc_id": document.get("id"),
        "n_entities": len(get_validated_entities(document, date_cui=DATE_CUI)),
        "n_dates": len(get_validated_dates(document, date_cui=DATE_CUI)),
        "n_rel_dates": len(
            get_validated_relative_dates(document, relative_date_cui=RELATIVE_DATE_CUI)
        ),
        "n_links": len(
            get_validated_links(
                document, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI
            )
        ),
    }
    for document in docs
]

# One row per document, ordered by document id
pd.DataFrame(summary).sort_values("doc_id").reset_index(drop=True)

Unnamed: 0,doc_id,n_entities,n_dates,n_rel_dates,n_links
0,26461,64,6,0,4
1,26462,21,7,0,11
2,26463,15,7,0,12
3,26464,11,3,0,6
4,26465,24,3,0,4


Create Dataset & Add Relative Dates

In [9]:
# Extract validated entities, dates, relative dates and links from all docs in
# the dataset and collect one row per document.
def build_dataset_row(document):
    """Turn one exported document into a dataset row.

    Extracts the document's validated entities, dates, relative dates and
    links, serializes each group with the ``doc_to_*_json`` helpers, and
    assembles the result with ``make_row``. Links are serialized by value:
    their IDs are resolved through a lookup built from the validated items.

    The work happens inside a function so the per-document intermediates stay
    local. Building the dataset therefore does not overwrite the ``doc``,
    ``ents``, ``dates``, ``relative_dates``, ``links`` and ``id2value``
    variables that the single-document cells inspect.

    Args:
        document: one document record taken from ``docs``.

    Returns:
        Whatever ``make_row`` returns for this document.
    """
    # Extract entities, dates, relative dates and links
    doc_ents = get_validated_entities(document, date_cui=DATE_CUI)
    doc_dates = get_validated_dates(document, date_cui=DATE_CUI)
    doc_relative_dates = get_validated_relative_dates(
        document, relative_date_cui=RELATIVE_DATE_CUI
    )
    doc_links = get_validated_links(
        document, date_cui=DATE_CUI, relative_date_cui=RELATIVE_DATE_CUI
    )

    # Mapping from validated items (ID -> text value), used to serialize links
    doc_id2value = id2value_from_items(doc_ents, doc_dates, doc_relative_dates)

    # Serialize each field and assemble the row
    return make_row(
        doc_id=document.get("id"),
        note_text=document.get("text", ""),
        entities_json=doc_to_entities_json(doc_ents),
        dates_json=doc_to_dates_json(doc_dates),
        relative_dates_json=doc_to_relative_dates_json(doc_relative_dates),
        links_json=doc_to_links_value_json(doc_links, doc_id2value),
    )


rows = [build_dataset_row(document) for document in docs]

In [10]:
# Create dataframe: one row per document, with an explicit column order
df = pd.DataFrame(
    data=rows,
    columns=[
        "doc_id",
        "note_text",
        "entities_json",
        "dates_json",
        "relative_dates_json",
        "links_json",
    ],
)
df

Unnamed: 0,doc_id,note_text,entities_json,dates_json,relative_dates_json,links_json
0,26461,Ultrasound (30nd Jun 2024): no significant fin...,"[{""id"": 308244, ""value"": ""history of meningiti...","[{""id"": 308320, ""value"": ""30nd Jun 2024"", ""sta...",[],"[{""date"": ""12nd Sep 2024"", ""entity"": ""pituitar..."
1,26462,Labs (27th Sep 2024): anemia. resolving Skin:...,"[{""id"": 308371, ""value"": ""lesions"", ""cui"": ""52...","[{""id"": 308581, ""value"": ""22/11/24"", ""start"": ...",[],"[{""date"": ""27th Sep 2024"", ""entity"": ""anemia""}..."
2,26463,URGENT REVIEW (2024-10-04): cough. suspect ost...,"[{""id"": 308886, ""value"": ""frequent urination"",...","[{""id"": 308940, ""value"": ""2024-10-04"", ""start""...",[],"[{""date"": ""2024-10-04"", ""entity"": ""cough""}, {""..."
3,26464,URGENT REVIEW (13rd Feb 2025) MRI of the brain...,"[{""id"": 308951, ""value"": ""multiple_sclerosis"",...","[{""id"": 308996, ""value"": ""05-03-2025"", ""start""...",[],"[{""date"": ""13rd Feb 2025"", ""entity"": ""visual""}..."
4,26465,New pt((18/11/24)): pt presents with nausea/vo...,"[{""id"": 308998, ""value"": ""history of neoplasm ...","[{""id"": 309070, ""value"": ""18/11/24"", ""start"": ...",[],"[{""date"": ""18/11/24"", ""entity"": ""nausea/vomiti..."


In [11]:
# Save csv: write the dataset next to the raw export, without the index column
out_path = "../data/medcat_trainer_dataset.csv"
df.to_csv(path_or_buf=out_path, index=False)