#### To Do's
1. Parse strings to a better format [remove dates, other junk info]
    - LM/AI Services Extractor?
2. Error in the DF Iterrows
    - Fix Prompt Syntax / Harden recursion job - what to do if no code is found
3. Implement structured output

In [1]:
from src.prepare_mimic_iii import transform_data
from src.call_aoai import call_aoai
from src.tree import TaxonomyParser
from IPython.display import Image
from nltk import flatten
import pandas as pd
import ast
import functools
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)

https://medcode-aoai-useast.openai.azure.com/


### Data Prep

In [2]:
# df = transform_data("data/") # Only re-run if change in preparation logic
df = pd.read_csv("data/joined/dataset_single_001_279.csv.gz")
print(df.shape)
display(df.head(5))

(7648, 3)


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,100003,['Chief Complaint: HPI: 24 Hour Events: awaiti...,[70]
1,100009,"[""2162-5-20 2:57 PM CHEST PA & LAT Clip # Clip...","[272, 250, 278]"
2,100018,"[""Admission Date: 2176-8-29 Discharge Date: 21...","[225, 250, 276, 278]"
3,100041,['2140-12-9 9:27 AM CHEST PORTABLE AP Clip # 0...,"[274, 244]"
4,100046,['2183-10-23 5:31 AM CHEST SINGLE VIEW Clip # ...,"[275, 276]"


In [3]:
max_code = 88
slim_df = df[df['ICD9_CODE'].apply(lambda x: all(i < max_code for i in ast.literal_eval(x)))]
slim_df = slim_df[0:10]
display(slim_df.head())
print(slim_df.shape)

Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,100003,['Chief Complaint: HPI: 24 Hour Events: awaiti...,[70]
6,100074,"[""Admission Date: 2176-4-9 Discharge Date: 217...","[70, 38]"
10,100104,"[""Admission Date: 2201-6-21 Discharge Date: 22...",[38]
21,100307,"[""2161-10-29 3:35 PM ILIAC Clip # Clip Number ...",[41]
35,100498,"[""Admission Date: 2120-3-10 Discharge Date: 21...",[41]


(10, 3)


In [4]:
# Initialize Code Tree
code_tree = TaxonomyParser()
code_tree.read_from_json("icd9_tax.json")

print(code_tree.find_by_name("0"))

Node('/root/0', description='Infectious and Parasitic Diseases')


In [5]:
# View Tree
code_tree.visualize("0")

0        Infectious and Parasitic Diseases
├── 00   Intestinal infectious diseases
│   ├── 001 Cholera
│   ├── 002 Typhoid and paratyphoid fevers
│   ├── 003 Salmonella
│   ├── 004 Shigellosis
│   ├── 005 Other poisoning (bacterial)
│   ├── 006 Amebiasis
│   ├── 007 Other protozoal intestinal diseases
│   ├── 008 Intestinal infections due to other organisms
│   └── 009 Ill-defined intestinal infections
├── 01   Tuberculosis
│   ├── 010 Primary tuberculous infection
│   ├── 011 Pulmonary tuberculosis
│   ├── 012 Other respiratory tuberculosis
│   ├── 013 Tuberculosis of meninges and central nervous system
│   ├── 014 Tuberculosis of intestines, peritoneum, and mesenteric glands
│   ├── 015 Tuberculosis of bones and joints
│   ├── 016 Tuberculosis of genitourinary system
│   ├── 017 Tuberculosis of other organs
│   ├── 018 Miliary tuberculosis
│   └── 019 Respiratory tuberculosis unspecified
├── 02   Zoonotic bacterial diseases
│   ├── 020 Plague
│   ├── 021 Tularemia
│   ├── 022 Anthrax

### Get ICD9 Codes

In [6]:
# Dyamically Build Prompt

def get_options(tree, parent_code):
    children = tree.get_children(parent_code)
    options = []
    for child in children:
        options.append(f"{child.name}: {child.description}")
    
    return '|'.join(options)

def build_prompt(tree, parent_code, note, categories):
    sys = """
    You are a medical expert. Your job is to classify notes of an event into one or more categories.
    Choose the best option(s) based on the categories offered. ALWAYS return at least one index. ONLY choose from categories listed. 
    Respond with a list of quoted string indeces of the categories the note belongs to.
    Think through your answer. ACCURACY is VERY IMPORTANT to your job.
    
    ### EXAMPLE ###
    Categories = 0: Infectious and Parasitic Diseases | 1: Neoplasms | 2: Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders
    Note = Patient has Tuberculosis and an Immunity Disorder
    Answer: ['0','2']
    ## END EXAMPLE ##
    """
    
    
    prompt = f"""
    Categories = {categories}
    Note = {note}
    Answer:
    """

    return sys, prompt

In [7]:
# Recursive - Walk tree and call aoai to get codes

def get_codes_for_note(parent_code, tree, note):
    
    categories = get_options(tree, parent_code)
    sys, prompt = build_prompt(tree, parent_code, note, categories)

    codes = call_aoai(sys, prompt)
    
    logger.info(f"Parent Code: {parent_code} | Found: {codes}")
    logger.info(f"Prompt: {prompt}")

    if all(len(i) == 3 for i in codes):
        return codes
    else:
        return list(map(functools.partial(get_codes_for_note, tree=tree, note=note), codes))
    

In [8]:
### SIMPLE TEST ###

res = flatten(get_codes_for_note("root", code_tree, "Tuberculosis of the bones and joints and HIV"))
print(res)

#### END SIMPLE TEST ###

INFO:httpx:HTTP Request: POST https://medcode-aoai-useast.openai.azure.com//openai/deployments/gpt4o-mini-deploy/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:__main__:Parent Code: root | Found: ['0']
INFO:__main__:Prompt: 
    Categories = 0: Infectious and Parasitic Diseases
    Note = Tuberculosis of the bones and joints and HIV
    Answer:
    
INFO:httpx:HTTP Request: POST https://medcode-aoai-useast.openai.azure.com//openai/deployments/gpt4o-mini-deploy/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:__main__:Parent Code: 0 | Found: ['01', '04']
INFO:__main__:Prompt: 
    Categories = 00: Intestinal infectious diseases|01: Tuberculosis|02: Zoonotic bacterial diseases|03: Other bacterial diseases|04: Human immunodeficiency virus (HIV) infection|05: Poliomyelitis and other non-arthropod-borne viral diseases of central nervous system|06: Viral diseases accompanied by exanthem|07: Arthropod-borne viral diseases|08: Other diseases due to viruses and Chl

['015', '042', '044']


In [9]:
results = []
slim_df['Generated'] = ""
for index, row in slim_df.iterrows():

    # Parse Note
    note = ast.literal_eval(row['TEXT'])[0]

    # Get Codes
    result = flatten(get_codes_for_note("0", code_tree, note))

    # Add result to DF
    slim_df.at[index, 'Generated'] = str(result)

INFO:httpx:HTTP Request: POST https://medcode-aoai-useast.openai.azure.com//openai/deployments/gpt4o-mini-deploy/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:__main__:Parent Code: 0 | Found: ['00', '01', '03']
INFO:__main__:Prompt: 
    Categories = 00: Intestinal infectious diseases|01: Tuberculosis|02: Zoonotic bacterial diseases|03: Other bacterial diseases|04: Human immunodeficiency virus (HIV) infection|05: Poliomyelitis and other non-arthropod-borne viral diseases of central nervous system|06: Viral diseases accompanied by exanthem|07: Arthropod-borne viral diseases|08: Other diseases due to viruses and Chlamydiota|09: Rickettsioses and other arthropod-borne diseases
    Note = Chief Complaint: HPI: 24 Hour Events: awaiting bed on liver service stable no ssx bleeding Allergies: No Known Drug Allergies Last dose of Antibiotics: Ceftriaxone - 2150-4-18 10:02 PM Vancomycin - 2150-4-19 08:33 AM Infusions: Other ICU medications: Pantoprazole Protonix - 2150-4-18 08:2

AttributeError: 'NoneType' object has no attribute 'children'

In [10]:
display(slim_df.head(20))

Unnamed: 0,HADM_ID,TEXT,ICD9_CODE,Generated
0,100003,['Chief Complaint: HPI: 24 Hour Events: awaiti...,[70],"['009', '040']"
6,100074,"[""Admission Date: 2176-4-9 Discharge Date: 217...","[70, 38]","['010', '014', '017', '038', '040', '041', '04..."
10,100104,"[""Admission Date: 2201-6-21 Discharge Date: 22...",[38],"['001', '040']"
21,100307,"[""2161-10-29 3:35 PM ILIAC Clip # Clip Number ...",[41],[]
35,100498,"[""Admission Date: 2120-3-10 Discharge Date: 21...",[41],"['011', '018', '038', '041', '043', '044']"
38,100536,"[""2177-2-14 12:55 PM CHEST PORTABLE AP Clip # ...","[42, 70]","['042', '043']"
55,100746,"[""Chief Complaint: Hematemesis and fevers HPI:...",[38],"['019', '040']"
61,100810,"[""Admission Date: 2171-7-16 Discharge Date: 21...",[41],
72,100942,"[""2128-8-11 11:06 AM CT ABDOMEN WCONTRAST CT P...",[41],
87,101119,['2179-4-21 1:19 PM CT HEAD WO CONTRAST Clip #...,[70],


### Score Results

In [11]:
def format_icd9(x):
    new_codes = []
    code_list = ast.literal_eval(x)
    for code in code_list:
        new_codes.append(f"{code:0>3}".format(num="1"))

    return str(new_codes)

def recall_score(truth, generated):
    actual_list = ast.literal_eval(truth)
    generated_list = ast.literal_eval(generated)

    similar = len(set(actual_list) & set(generated_list))

    return similar / len(actual_list)

def precision_score(truth, generated):
    actual_list = ast.literal_eval(truth)
    generated_list = ast.literal_eval(generated)

    similar = len(set(actual_list) & set(generated_list))

    return similar / len(generated_list)

In [13]:
results = pd.DataFrame()
results = slim_df.copy().dropna()

results['ICD9_CODE'] = results['ICD9_CODE'].apply(format_icd9)
display(results.head(20))
results['Recall'] = results.apply(lambda x: recall_score(x['ICD9_CODE'], x['Generated']), axis=1)
results['Precision'] = results.apply(lambda x: precision_score(x['ICD9_CODE'], x['Generated']), axis=1)

display(results.head(20))


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE,Generated
0,100003,['Chief Complaint: HPI: 24 Hour Events: awaiti...,['070'],"['009', '040']"
6,100074,"[""Admission Date: 2176-4-9 Discharge Date: 217...","['070', '038']","['010', '014', '017', '038', '040', '041', '04..."
10,100104,"[""Admission Date: 2201-6-21 Discharge Date: 22...",['038'],"['001', '040']"
21,100307,"[""2161-10-29 3:35 PM ILIAC Clip # Clip Number ...",['041'],[]
35,100498,"[""Admission Date: 2120-3-10 Discharge Date: 21...",['041'],"['011', '018', '038', '041', '043', '044']"
38,100536,"[""2177-2-14 12:55 PM CHEST PORTABLE AP Clip # ...","['042', '070']","['042', '043']"
55,100746,"[""Chief Complaint: Hematemesis and fevers HPI:...",['038'],"['019', '040']"
61,100810,"[""Admission Date: 2171-7-16 Discharge Date: 21...",['041'],
72,100942,"[""2128-8-11 11:06 AM CT ABDOMEN WCONTRAST CT P...",['041'],
87,101119,['2179-4-21 1:19 PM CT HEAD WO CONTRAST Clip #...,['070'],


SyntaxError: unexpected EOF while parsing (<unknown>, line 0)