In [1]:
# Imports
import pandas as pd
import json
import datefinder
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.config import Config

import sys
import os
# Put the sibling ``utils`` directory (one level above the working directory)
# at the front of the import search path. ``os.path.abspath`` resolves a
# relative path against the current working directory, so no explicit
# ``os.getcwd()`` call is needed. The membership check keeps a re-run of this
# cell from inserting the same entry twice.
utils_path = os.path.abspath(os.path.join('..', 'utils'))
if utils_path not in sys.path:
    sys.path.insert(0, utils_path)

from date_extractor import add_relative_dates, extract_absolute_dates

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Data Loading
# Read the synthetic trainer CSV from ../data (relative to the working
# directory) and report how many rows were loaded. The processing loop further
# down reads the 'name' and 'text' columns of this frame.
df = pd.read_csv("../data/synthetic_trainer.csv")
print(f"Loaded {len(df)} records")

Loaded 101 records


In [3]:
# Load pre-trained MedCAT model (you'll need to provide the path)
# The path is relative to the notebook's working directory.
medcat_model_path = "../models/medmen_wstatus_2021_oct/"  # Update this path
# `cat` holds the loaded model pack; the processing loop calls it on each note's text.
cat = CAT.load_model_pack(medcat_model_path)



In [4]:
# Build one inference record per note: MedCAT entities, absolute dates and
# relative dates. The entity and absolute-date lists are serialised to JSON
# strings; the relative-dates value is stored exactly as the helper returns it.
results = []

for _, note in df.iterrows():
    doc_id = note['name']  # the 'name' column doubles as the document id
    note_text = note['text']

    # Run the MedCAT pipeline and keep each entity's surface text, CUI and
    # character span. Ids are 1-based in document order (ent_1, ent_2, ...).
    annotated = cat(note_text)
    entity_records = [
        {
            'id': f"ent_{position}",
            'value': span.text,
            'cui': span._.cui,
            'start': span.start_char,
            'end': span.end_char,
        }
        for position, span in enumerate(annotated.ents, start=1)
    ]

    # Absolute date mentions come from the date_extractor helper.
    absolute_dates = extract_absolute_dates(note_text)

    # add_relative_dates is handed a one-row DataFrame with the note under
    # 'note_text'; the 'relative_dates_json' cell of its first row is read back.
    single_note_frame = pd.DataFrame([{'note_text': note_text}])
    with_relative = add_relative_dates(single_note_frame)
    relative_dates_json = with_relative.iloc[0]['relative_dates_json']

    results.append({
        'doc_id': doc_id,
        'note_text': note_text,
        'entities_json': json.dumps(entity_records),
        'dates_json': json.dumps(absolute_dates),
        'relative_dates_json': relative_dates_json,
    })

In [10]:
# Convert the per-note result dicts to a DataFrame and inspect it
inference_df = pd.DataFrame(results)
print(f"Created inference dataset with {len(inference_df)} records")
# Bare last expression so the notebook renders the frame as the cell output
inference_df

Created inference dataset with 101 records


Unnamed: 0,doc_id,note_text,entities_json,dates_json,relative_dates_json
0,0,Ultrasound (30nd Jun 2024): no significant fin...,"[{""id"": ""ent_1"", ""value"": ""Ultrasound"", ""cui"":...","[{""id"": ""abs_1"", ""value"": ""30nd Jun 2024"", ""st...",[]
1,1,Labs (27th Sep 2024): anemia. resolving Skin:...,"[{""id"": ""ent_1"", ""value"": ""anemia"", ""cui"": ""C0...","[{""id"": ""abs_1"", ""value"": ""27th Sep 2024"", ""st...",[]
2,2,URGENT REVIEW (2024-10-04): cough. suspect ost...,"[{""id"": ""ent_1"", ""value"": ""REVIEW"", ""cui"": ""C1...","[{""id"": ""abs_1"", ""value"": ""2024-10-04"", ""start...",[]
3,3,URGENT REVIEW (13rd Feb 2025) MRI of the brain...,"[{""id"": ""ent_1"", ""value"": ""REVIEW"", ""cui"": ""C0...","[{""id"": ""abs_1"", ""value"": ""13rd Feb 2025"", ""st...",[]
4,4,New pt((18/11/24)): pt presents with nausea/vo...,"[{""id"": ""ent_1"", ""value"": ""nausea"", ""cui"": ""C0...","[{""id"": ""abs_1"", ""value"": ""18/11/24"", ""start"":...",[]
...,...,...,...,...,...
96,96,Visit((08/10/24)): pt presents with joint pain...,"[{""id"": ""ent_1"", ""value"": ""Visit"", ""cui"": ""C05...","[{""id"": ""abs_1"", ""value"": ""08/10/24"", ""start"":...",[]
97,97,F/U (31 Aug 2024): resolved A review of system...,"[{""id"": ""ent_1"", ""value"": ""resolved"", ""cui"": ""...","[{""id"": ""abs_1"", ""value"": ""31 Aug 2024"", ""star...",[]
98,98,Phone note((12-10-2024)): slightly improved. o...,"[{""id"": ""ent_1"", ""value"": ""improved"", ""cui"": ""...","[{""id"": ""abs_1"", ""value"": ""12-10-2024"", ""start...",[]
99,99,F/U (2025-02-23): fluctuating. confirmed multi...,"[{""id"": ""ent_1"", ""value"": ""fluctuating"", ""cui""...","[{""id"": ""abs_1"", ""value"": ""2025-02-23"", ""start...",[]


In [11]:
# Save the inference dataset as CSV under ../data (relative to the working
# directory); index=False leaves the DataFrame's row index out of the file.
inference_df.to_csv("../data/inference_dataset.csv", index=False)