#### To Do's
1. Check out PubMed Embeddings or Fill Mask Model
    - L1 _only_ grading system
2. Implement Structured Outputs
3. Parse strings to a better format [remove dates, other junk info]
    - LM/AI Services Extractor?
4. Error in the DF Iterrows
    - Fix Prompt Syntax / Harden recursion job - what to do if no code is found

In [62]:
from src.prepare_mimic_iii import transform_data
from src.call_aoai import call_aoai
from src.tree import TaxonomyParser
from IPython.display import Image
from nltk import flatten
import pandas as pd
import ast
import functools
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)

pd.set_option('display.max_colwidth', None)

### Data Prep

In [63]:
# df = transform_data("data/") # Only re-run if change in preparation logic
df = pd.read_csv("data/joined/dataset_single_001_088.csv.gz")
print(df.shape)
display(df.dtypes)

(4855, 3)


HADM_ID       int64
TEXT         object
ICD9_CODE    object
dtype: object

In [64]:
# Get L1 and L2 codes for grading purposes

def get_parent_codes(code_tree, codes):
    code_list = ast.literal_eval(codes)
    parent_codes = []
    for code in code_list:
        parent_codes.append(code_tree.find_by_name(code).parent.name)
    
    parent_codes = list(set(parent_codes))
    return str(parent_codes)

df['L2_CODES'] = df['ICD9_CODE'].apply(lambda x: get_parent_codes(code_tree, x))
df['L1_CODES'] = df['L2_CODES'].apply(lambda x: get_parent_codes(code_tree, x))
display(df[['ICD9_CODE', 'L2_CODES', 'L1_CODES']].head(5))

Unnamed: 0,ICD9_CODE,L2_CODES,L1_CODES
0,['041'],['03'],['0']
1,"['038', '070']","['08', '03']",['0']
2,['041'],['03'],['0']
3,['038'],['03'],['0']
4,['038'],['03'],['0']


In [65]:
slim_df = df[0:10]
print(slim_df.shape)

(10, 5)


In [66]:
# Initialize Code Tree
code_tree = TaxonomyParser()
code_tree.read_from_json("icd9_tax.json")

print(code_tree.find_by_name("00"))

Node('/root/0/00', description='Intestinal infectious diseases')


In [67]:
# View Tree
code_tree.visualize("0")

0        Infectious and Parasitic Diseases
├── 00   Intestinal infectious diseases
│   ├── 001 Cholera
│   ├── 002 Typhoid and paratyphoid fevers
│   ├── 003 Salmonella
│   ├── 004 Shigellosis
│   ├── 005 Other poisoning (bacterial)
│   ├── 006 Amebiasis
│   ├── 007 Other protozoal intestinal diseases
│   ├── 008 Intestinal infections due to other organisms
│   └── 009 Ill-defined intestinal infections
├── 01   Tuberculosis
│   ├── 010 Primary tuberculous infection
│   ├── 011 Pulmonary tuberculosis
│   ├── 012 Other respiratory tuberculosis
│   ├── 013 Tuberculosis of meninges and central nervous system
│   ├── 014 Tuberculosis of intestines, peritoneum, and mesenteric glands
│   ├── 015 Tuberculosis of bones and joints
│   ├── 016 Tuberculosis of genitourinary system
│   ├── 017 Tuberculosis of other organs
│   ├── 018 Miliary tuberculosis
│   └── 019 Respiratory tuberculosis unspecified
├── 02   Zoonotic bacterial diseases
│   ├── 020 Plague
│   ├── 021 Tularemia
│   ├── 022 Anthrax

### Get ICD9 Codes

In [68]:
# Dyamically Build Prompt

def get_options(tree, parent_code):
    children = tree.get_children(parent_code)
    options = []
    for child in children:
        options.append(f"{child.name}: {child.description}")
    
    return '|'.join(options)

def build_prompt(tree, parent_code, note, categories):
    sys = """
    You are a medical expert. Your job is to classify notes of an event into one or more categories.
    Choose the best option(s) based on the categories offered. ALWAYS return at least one index. ONLY choose from categories listed. 
    Respond with a list of quoted string indeces of the categories the note belongs to.
    Think through your answer. ACCURACY is VERY IMPORTANT to your job.
    
    ### EXAMPLE ###
    Categories = 0: Infectious and Parasitic Diseases | 1: Neoplasms | 2: Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders
    Note = Patient has Tuberculosis and an Immunity Disorder
    Answer: ['0','2']
    ## END EXAMPLE ##
    """
    
    
    prompt = f"""
    Categories = {categories}
    Note = {note}
    Answer:
    """

    return sys, prompt

In [69]:
# Recursive - Walk tree and call aoai to get codes

def get_codes_for_note(parent_code, tree, note):
    
    categories = get_options(tree, parent_code)
    sys, prompt = build_prompt(tree, parent_code, note, categories)

    codes = call_aoai(sys, prompt)
    
    logger.info(f"Parent Code: {parent_code} | Found: {codes}")
    logger.info(f"Prompt: {prompt}")

    if all(len(i) == 2 for i in codes):
        return codes
    else:
        return list(map(functools.partial(get_codes_for_note, tree=tree, note=note), codes))
    

In [70]:
### SIMPLE TEST ###
'''
res = flatten(get_codes_for_note("root", code_tree, "Tuberculosis of the bones and joints and HIV"))
print(res)
'''

#### END SIMPLE TEST ###

'\nres = flatten(get_codes_for_note("root", code_tree, "Tuberculosis of the bones and joints and HIV"))\nprint(res)\n'

In [71]:
results = []
slim_df['Generated'] = ""
for index, row in slim_df.iterrows():

    # Parse Note
    note = ast.literal_eval(row['TEXT'])[0]

    # Get Codes
    result = flatten(get_codes_for_note("0", code_tree, note))

    # Add result to DF
    slim_df.at[index, 'Generated'] = str(result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slim_df['Generated'] = ""
INFO:httpx:HTTP Request: POST https://medcode-aoai-useast.openai.azure.com//openai/deployments/gpt4o-mini-deploy/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:__main__:Parent Code: 0 | Found: ['00', '01', '03']
INFO:__main__:Prompt: 
    Categories = 00: Intestinal infectious diseases|01: Tuberculosis|02: Zoonotic bacterial diseases|03: Other bacterial diseases|04: Human immunodeficiency virus (HIV) infection|05: Poliomyelitis and other non-arthropod-borne viral diseases of central nervous system|06: Viral diseases accompanied by exanthem|07: Arthropod-borne viral diseases|08: Other diseases due to viruses and Chlamydiota|09: Rickettsioses and other arthropod-borne diseases
    Note 

In [72]:
display(slim_df[['ICD9_CODE','L1_CODES','L2_CODES', 'Generated']].head(10))

Unnamed: 0,ICD9_CODE,L1_CODES,L2_CODES,Generated
0,['041'],['0'],['03'],"['00', '01', '03']"
1,"['038', '070']",['0'],"['08', '03']","['00', '03', '04']"
2,['041'],['0'],['03'],"['01', '03', '04']"
3,['038'],['0'],['03'],['00']
4,['038'],['0'],['03'],[]
5,['053'],['0'],['06'],"['01', '00', '04', '03', '08']"
6,['038'],['0'],['03'],['01']
7,"['047', '038']",['0'],"['03', '05']",['00']
8,"['041', '038']",['0'],['03'],"['01', '03']"
9,['038'],['0'],['03'],"['01', '03', '04', '09']"


### Score Results

In [74]:
# SCoring Functions

def format_icd9(x):
    new_codes = []
    code_list = ast.literal_eval(x)
    for code in code_list:
        new_codes.append(f"{code:0>3}".format(num="1"))

    return str(new_codes)

def recall_score(truth, generated):
    actual_list = ast.literal_eval(truth)
    generated_list = ast.literal_eval(generated)

    similar = len(set(actual_list) & set(generated_list))

    return similar / len(actual_list)

def precision_score(truth, generated):
    actual_list = ast.literal_eval(truth)
    generated_list = ast.literal_eval(generated)

    if len(generated_list) == 0:
        return 0

    similar = len(set(actual_list) & set(generated_list))

    return similar / len(generated_list)

def f1_score(truth, generated):
    precision = precision_score(truth, generated)
    recall = recall_score(truth, generated)

    if precision + recall == 0:
        return 0
    else:
        return 2 * (precision * recall) / (precision + recall)

#### Grade L2 Output

In [85]:
results = pd.DataFrame()

results['ICD9_CODE'] = slim_df['ICD9_CODE'].apply(format_icd9)
results['Recall'] = slim_df.apply(lambda x: recall_score(x['L2_CODES'], x['Generated']), axis=1)
results['Precision'] = slim_df.apply(lambda x: precision_score(x['L2_CODES'], x['Generated']), axis=1)
results['F1 Score'] = slim_df.apply(lambda x: f1_score(x['L2_CODES'], x['Generated']), axis=1)
display(results[['Recall', 'Precision', 'F1 Score']].mean(axis=0)*100)

Recall       45.000000
Precision    17.500000
F1 Score     24.666667
dtype: float64

#### Grade Final Output

In [78]:
results = pd.DataFrame()
results = slim_df.copy().dropna()

results['ICD9_CODE'] = slim_df['ICD9_CODE'].apply(format_icd9)
results['Recall'] = slim_df.apply(lambda x: recall_score(x['ICD9_CODE'], x['Generated']), axis=1)
results['Precision'] = slim_df.apply(lambda x: precision_score(x['ICD9_CODE'], x['Generated']), axis=1)
results['F1 Score'] = slim_df.apply(lambda x: f1_score(x['ICD9_CODE'], x['Generated']), axis=1)

#### Results Summary

In [None]:
print(f"Recall = {round(results['Recall'].mean(),2)}")
print(f"Precision = {round(results['Precision'].mean(),2)}")