Imports

In [None]:
# Imports
import pandas as pd
import json
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.config import Config

import sys
import os
# Resolve the sibling ``utils`` directory (relative to the current working
# directory) and put it at the front of the module search path. The membership
# test keeps repeated runs of this cell from adding the same entry twice.
# Presumably this is where ``date_extractor_utils`` lives -- confirm.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
if sys.path.count(utils_path) == 0:
    sys.path.insert(0, utils_path)

from date_extractor_utils import extract_absolute_dates, add_relative_dates

Data Loading

In [None]:
# Data Loading
df = pd.read_csv("../data/data.csv")

# Fail fast: the processing cell further down reads row['doc_id'] and
# row['note_text'], so a CSV without them should be rejected here rather than
# after the (slow) model load and inference have already started.
missing_columns = {"doc_id", "note_text"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"../data/data.csv is missing required column(s): {sorted(missing_columns)}"
    )

print(f"Loaded {len(df)} records")

In [None]:
# Preview the first five rows of the loaded notes
df.head(n=5)

MedCAT Model

In [None]:
# Load the pre-trained MedCAT model pack into `cat`; the cells below call
# `cat(text)` on each note to obtain its entities.
# NOTE: the path is specific to the local checkout -- point it at your own
# copy of the model pack before running.
medcat_model_path = "../models/20230227__kch_gstt_trained_model_494c3717f637bb89/"  # Update this path
cat = CAT.load_model_pack(medcat_model_path)

In [None]:
# Comprehensive entity type analysis.
# Opt-in helpers: defining them is cheap and nothing runs until they are
# called, e.g.
#     print_type_id_stats(collect_type_id_stats(cat, df))
#     print_type_related_cdb_attrs(cat.cdb)


def collect_type_id_stats(cat, df, sample_size=100, text_column='note_text', max_examples=3):
    """Tally the type IDs of the entities found in a sample of notes.

    Args:
        cat: Callable annotator; ``cat(text)`` must return an object with an
            ``ents`` iterable, and ``cat.cdb`` must expose the
            ``cui2type_ids`` and ``cui2preferred_name`` mappings.
        df: DataFrame holding the notes.
        sample_size: Maximum number of leading rows of ``df`` to process.
        text_column: Name of the column containing the note text.
        max_examples: Maximum number of example entities kept per type ID.

    Returns:
        dict mapping each type ID to ``{'count': int, 'cuis': set,
        'examples': list of {'text', 'cui', 'preferred_name'} dicts}``.
        Entities whose CUI is ``-1`` are ignored.
    """
    type_id_stats = {}
    n_docs = min(sample_size, len(df))
    for text in df[text_column].iloc[:n_docs]:
        doc = cat(text)
        for ent in doc.ents:
            cui = ent._.cui
            if cui == -1:
                continue
            for type_id in cat.cdb.cui2type_ids.get(cui, set()):
                stats = type_id_stats.setdefault(
                    type_id, {'count': 0, 'cuis': set(), 'examples': []}
                )
                stats['count'] += 1
                stats['cuis'].add(cui)
                if len(stats['examples']) < max_examples:
                    stats['examples'].append({
                        'text': ent.text,
                        'cui': cui,
                        'preferred_name': cat.cdb.cui2preferred_name.get(cui),
                    })
    return type_id_stats


def print_type_id_stats(type_id_stats):
    """Print the output of ``collect_type_id_stats``, most frequent type first."""
    print("=== Entity Type Analysis ===")
    print("\nType ID Statistics:")
    by_count = sorted(type_id_stats.items(), key=lambda item: item[1]['count'], reverse=True)
    for type_id, stats in by_count:
        print(f"\nType ID: {type_id}")
        print(f"Count: {stats['count']}")
        print(f"Unique CUIs: {len(stats['cuis'])}")
        print("Examples:")
        for ex in stats['examples']:
            print(f"  - '{ex['text']}' (CUI: {ex['cui']}, Preferred: {ex['preferred_name']})")


def find_type_related_cdb_attrs(cdb, terms=('type', 'category', 'class', 'group')):
    """Return the attribute names of ``cdb`` that contain any of ``terms`` (case-insensitive)."""
    return [prop for prop in dir(cdb) if any(term in prop.lower() for term in terms)]


def print_type_related_cdb_attrs(cdb):
    """Print the attribute names found by ``find_type_related_cdb_attrs``."""
    print("\nRelevant CDB Properties:")
    for prop in find_type_related_cdb_attrs(cdb):
        print(f"- {prop}")

In [None]:
# Analyze all CUIs and their preferred names.
# Opt-in helpers: nothing runs until they are called, e.g.
#     print_cui_stats(collect_cui_stats(cat, df))
# Note that this annotates every row of the frame passed in.


def collect_cui_stats(cat, df, text_column='note_text'):
    """Count how often each CUI is detected across all notes in ``df``.

    Args:
        cat: Callable annotator; ``cat(text)`` must return an object with an
            ``ents`` iterable, and ``cat.cdb`` must expose the
            ``cui2type_ids`` and ``cui2preferred_name`` mappings.
        df: DataFrame holding the notes.
        text_column: Name of the column containing the note text.

    Returns:
        dict mapping each CUI to ``{'preferred_name', 'type_ids', 'count',
        'example_texts'}``, where ``example_texts`` is the set of distinct
        surface forms seen. Entities whose CUI is ``-1`` are ignored.
    """
    cui_stats = {}
    for text in df[text_column]:
        doc = cat(text)
        for ent in doc.ents:
            cui = ent._.cui
            if cui == -1:
                continue
            entry = cui_stats.get(cui)
            if entry is None:
                entry = {
                    'preferred_name': cat.cdb.cui2preferred_name.get(cui),
                    'type_ids': cat.cdb.cui2type_ids.get(cui, set()),
                    'count': 0,
                    'example_texts': set(),
                }
                cui_stats[cui] = entry
            entry['count'] += 1
            entry['example_texts'].add(ent.text)
    return cui_stats


def print_cui_stats(cui_stats, max_examples=3):
    """Print the output of ``collect_cui_stats``, most frequent CUI first.

    Example texts are sorted before truncation so the listing is the same on
    every run (set iteration order is not stable for strings).
    """
    print("=== CUI Analysis ===")
    print("\nCUI Statistics (sorted by frequency):")
    print("Format: CUI | Type IDs | Preferred Name | Count | Example Texts")
    print("-" * 100)

    by_count = sorted(cui_stats.items(), key=lambda item: item[1]['count'], reverse=True)
    for cui, stats in by_count:
        examples = sorted(stats['example_texts'])[:max_examples]
        print(f"{cui} | {stats['type_ids']} | {stats['preferred_name']} | {stats['count']} | {examples}")

In [None]:
# Define CUIs for specific medical concepts.
# Whitelist mapping concept ID (as a string) -> display name. The processing
# cell keeps only entities whose CUI is a key here and writes the mapped value
# out as `preferred_name`. Keys must be unique; a repeated key in a dict
# literal is silently collapsed to the last occurrence.
# NOTE(review): the codes look like SNOMED CT concept IDs -- confirm.
RELEVANT_CUIS = {
    # Diagnoses
    '254956000': 'Pituitary adenoma',
    '254965007': 'Pituitary macroadenoma',
    '34486009': 'Hyperthyroidism',
    '40930008': 'Hypothyroidism',
    '47270006': "Cushing's syndrome",
    '237785004': 'Hypoadrenalism',
    '74107003': 'Acromegaly',
    '34337008': 'Prolactinoma',
    '134209002': 'Prolactinoma',
    '230690007': 'Cerebrovascular accident',
    '274100004': 'Cerebral hemorrhage',
    '84757009': 'Epilepsy',
    '24700007': 'Multiple sclerosis',
    '49049000': "Parkinson's disease",
    '13746004': 'Bipolar disorder',
    '35489007': 'Depressive disorder',
    '58214004': 'Schizophrenia',
    '193031009': 'Cluster headache syndrome',
    '398057008': 'Tension-type headache',
    '37796009': 'Migraine',
    '398254007': 'Pre-eclampsia',
    '46764007': 'Severe pre-eclampsia',
    '38341003': 'Hypertensive disorder',
    '73211009': 'Diabetes mellitus',
    '44054006': 'Diabetes mellitus type 2',
    '55822004': 'Hyperlipidemia',
    '271737000': 'Anemia',
    '195967001': 'Asthma',
    '13645005': 'COPD',
    '32398004': 'Bronchitis',
    '233604007': 'Pneumonia',
    '235595009': 'GERD',
    '34000006': "Crohn's disease",
    '64766004': 'Ulcerative colitis',
    '10743008': 'Irritable bowel syndrome',
    '396275006': 'Osteoarthritis',
    '69896004': 'Rheumatoid arthritis',
    '203082005': 'Fibromyalgia',
    '90560007': 'Gout',
    '73430006': 'Sleep apnea',
    '193462001': 'Insomnia',
    '36760000': 'Hepatosplenomegaly',
    '81308009': 'Disorder of brain',
    '230745008': 'Hydrocephalus',
    '77157004': 'Disorder of optic nerve',
    '14415006': 'Pneumocephalus',
    '109989006': 'Multiple myeloma',
    '118600007': 'Malignant lymphoma',
    '93143009': 'Leukemia',
    '1162768007': 'Leukemia',
    '441457006': 'Cyst',
    '7180009': 'Meningitis',
    '399244003': 'Disorder of pituitary gland',
    '127024001': 'Neoplasm of pituitary gland',
    '108369006': 'Neoplasm',
    '239953001': 'Soft tissue lesion',
    '302820008': 'Intracranial meningioma',
    '235856003': 'Disorder of liver',
    '49436004': 'Atrial fibrillation',
    '37109004': 'Ebola virus disease',
    '233964008': 'Internal carotid artery stenosis',
    '76107001': 'Spinal stenosis',
    '3855007': 'Disorder of pancreas',
    '14304000': 'Disorder of thyroid gland',
    '6142004': 'Influenza',
    '49601007': 'Disorder of cardiovascular system',
    '94381002': 'Secondary malignant neoplasm of liver',
    '200938002': 'Discoid lupus erythematosus',
    '416209007': 'Synovitis',
    '409623005': 'Respiratory insufficiency',
    '54150009': 'Upper respiratory infection',
    '409609008': 'Radiologic infiltrate of lung',
    '197480006': 'Anxiety disorder',
    '80394007': 'Hyperglycemia',
    '74474003': 'Gastrointestinal hemorrhage',
    '276654001': 'Congenital malformation',
    '281647001': 'Adverse reaction',
    '172697005': 'Acute ulcerative gingivitis',
    '19829001': 'Disorder of lung',
    '58184002': 'Recurrent disease',

    # Procedures
    '312250003': 'MRI',
    '168537006': 'X-ray',
    '399208008': 'Chest X-ray',
    '77477000': 'CT',
    '303653007': 'CT head',
    '82918005': 'PET scan',
    '278292003': 'Ultrasound',
    '29303009': 'ECG',
    '26604007': 'CBC',
    '371361000119107': 'Comprehensive metabolic panel',
    '61167004': 'TSH test',
    '27171005': 'Urinalysis',
    '37931006': 'Auscultation',
    '18629005': 'Medication administration',
    '363679005': 'Imaging',
    '410617001': 'Adjustment',
    '32485007': 'Hospital admission',
    '410538000': 'Scheduling',
    '373205008': 'Nuclear medicine imaging',
    '260222006': 'SPECT',
    '169413002': 'Hormone therapy',
    '172043006': 'Simple mastectomy',
    '113028003': 'ENT examination',
    '392010000': 'Optical coherence tomography',
    '53950000': 'Respiratory therapy',
    '1031081000000108': 'Liver function tests',
    '722138006': 'Physiotherapy',
    '265414003': 'Abdominoperineal resection',

    # Medications
    '386873009': 'Lisinopril',
    '372567009': 'Metformin',
    '373444002': 'Atorvastatin',
    '386979007': 'Cabergoline',
    '396458002': 'Hydrocortisone',
    '387207008': 'Ibuprofen',
    '387458008': 'Aspirin',
    '386844006': 'Topiramate',
    '116602009': 'Prednisone',
    '372726002': 'Amitriptyline',
    '372767007': 'Fluoxetine',
    '395892000': 'Sumatriptan',
    '67866001': 'Insulin',
    '373265006': 'Analgesic',

    # Symptoms/Findings
    '16932000': 'Nausea and vomiting',
    '422587007': 'Nausea',
    '422400008': 'Vomiting',
    '249497008': 'Vomiting symptom',
    '267036007': 'Dyspnea',
    '267060006': 'Diarrhea symptom',
    '62315008': 'Diarrhea',
    '25064002': 'Headache',
    '29857009': 'Chest pain',
    '161891005': 'Backache',
    '57676002': 'Joint pain',
    '21522001': 'Abdominal pain',
    '22253000': 'Pain',
    '48694002': 'Anxiety',
    '366979004': 'Depressed mood',
    '18963009': 'Mood swings',
    '84229001': 'Fatigue',
    '386661006': 'Fever',
    '274640006': 'Fever with chills',
    '42984000': 'Night sweats',
    '8943002': 'Weight gain',
    '89362005': 'Weight loss',
    '404640003': 'Dizziness',
    '300471006': 'Urinary frequency',
    '17173007': 'Excessive thirst',
    '56018004': 'Wheezing',
    '48409008': 'Respiratory crackles',
    '24612001': 'Wheeze - rhonchi',
    '414786004': 'Murmur',
    '64730000': 'Normal sinus rhythm',
    '72970002': 'Normal vital signs',
    '119971000119104': 'Elevated CRP',
    '166603001': 'Abnormal LFTs',
    '414478003': 'Elevated WBC',
    # ('80394007' Hyperglycemia is listed once, under Diagnoses.)
    '300848003': 'Mass',
    '1806006': 'Eruption',
    '271807003': 'Eruption of skin',
    '52988006': 'Lesion',
    '13331008': 'Atrophy',
    '47351003': 'Infiltration',
    '41699000': 'Effusion',
    '36358004': 'Abnormal cardiac conduction',
    '69328002': 'Distress',
    '297982009': 'Skin problem',
    '418970005': 'Pupil equal round and reacting to light',
    '127199000': 'Inguinal lymphadenopathy',
    '7036007': 'Pericardial friction rub',
    '162274004': 'Visual symptoms',
    '131148009': 'Bleeding',
    '300439004': 'Gastrointestinal tract problem',
    '6081001': 'Deformity',
    '87828008': 'Insufficiency',
    '415582006': 'Stenosis',
    '404220000': 'Bladder irritability',
    '42341009': 'Agnosia',
    '49727002': 'Cough',
    '263731006': 'Coughing'
}

Add Entities & Dates

In [None]:
# Build one output record per note: the whitelisted MedCAT entities plus the
# absolute and relative dates extracted from the note text.
results = []

for _, row in df.iterrows():
    doc_id, text = row['doc_id'], row['note_text']

    # Annotate the note and keep only linked entities (CUI != -1) whose CUI is
    # a key of the RELEVANT_CUIS whitelist.
    doc = cat(text)
    kept_ents = [
        ent for ent in doc.ents
        if ent._.cui != -1 and ent._.cui in RELEVANT_CUIS
    ]

    # Entity ids are 1-based and number only the entities that were kept.
    entities = [
        {
            'id': f"ent_{position}",
            'value': ent.text,
            'preferred_name': RELEVANT_CUIS[ent._.cui],  # display name from the whitelist
            'cui': ent._.cui,
            'start': ent.start_char,
            'end': ent.end_char
        }
        for position, ent in enumerate(kept_ents, start=1)
    ]

    # Absolute dates come straight from the helper and are JSON-encoded below.
    dates = extract_absolute_dates(text)

    # The relative-date helper takes a DataFrame, so the note is wrapped in a
    # one-row frame and the resulting 'relative_dates_json' cell is read back.
    # It is stored as returned (presumably already a JSON string -- confirm).
    single_note_df = pd.DataFrame([{'note_text': text}])
    relative_dates = add_relative_dates(single_note_df).iloc[0]['relative_dates_json']

    record = {
        'doc_id': doc_id,
        'note_text': text,
        'entities_json': json.dumps(entities),
        'dates_json': json.dumps(dates),
        'relative_dates_json': relative_dates
    }
    results.append(record)

In [None]:
# Convert to df
# The columns are pinned (same names and order as the keys of each record in
# `results`) so that an empty `results` list still produces a frame -- and
# later a CSV -- with the expected header instead of no columns at all.
inference_df = pd.DataFrame(
    results,
    columns=['doc_id', 'note_text', 'entities_json', 'dates_json', 'relative_dates_json'],
)
print(f"Created inference dataset with {len(inference_df)} records")

In [None]:
# Preview the first five rows of the assembled inference dataset
inference_df.head(n=5)

In [None]:
# Write the inference dataset to CSV, leaving out the DataFrame index column
output_csv_path = "../data/inference_dataset.csv"
inference_df.to_csv(output_csv_path, index=False)