In [None]:
from code_utils import llm_coder as llmc
from langchain_openai import ChatOpenAI
import os
import json
import pandas as pd
import openai
import numpy as np

proj_path = os.getcwd().replace("\\", "/") 

In [11]:
LLM = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    request_timeout=45,
    max_retries=3,
)
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key is None:
    raise ValueError("OPENAI_API_KEY is not set. Check .env file..")

df = pd.read_pickle(f"{proj_path}/data/altered_data/prepared_data.pkl")
with open(f"{proj_path}/data/definitions/icd_code_mapping.json", "r") as file:
    icd_code_mapping = json.load(file)


In [73]:
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import SystemMessage
from pydantic.v1 import BaseModel, Field
from typing import List, Dict
import pandas as pd
import tiktoken
import logging
import os


embedding_encoding = "cl100k_base"
encoding = tiktoken.get_encoding(embedding_encoding)

def truncate_text(text, max_tokens=15000):
    """
    Truncates the input text to ensure it does not exceed the specified maximum number of tokens.

    Input:
        text (str): The text to be truncated.
        max_tokens (int): The maximum number of tokens allowed in the output text. Default is 15000.

    Output:
        str: The truncated text if the original exceeds the token limit, otherwise returns the original text.

    This function encodes the input text into tokens and checks whether the number of tokens exceeds the specified limit.
    If so, it returns the text truncated to the maximum allowed tokens; otherwise, it returns the original text.
    """
    
    # Encode the input text into tokens, truncate it if it exceeds max_tokens, and then decode it back to text.
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        return encoding.decode(tokens[:max_tokens])
    return text

class Admission(BaseModel):
    """
    A model representing a patient's hospital admission, including the assigned ICD code and predictive reasoning.

    Input:
        admission_id (str): The unique identifier for the patient's admission.
        icd_code (str): The ICD diagnosis code assigned to the patient's visit.
        predictive_reasoning (str): The reasoning behind the predicted ICD code.

    This class is used to encapsulate information related to a hospital admission, such as the admission ID, the ICD diagnosis code, and an explanation for the diagnosis. 
    It helps maintain structure and consistency for patient admission records.
    """

    admission_id: str = Field(description="The ADMISSION ID of the patients visit")
    icd_code: str = Field(description="The diagnosis code assigned to the patients visit using the ICD definitions provided. a python string consisting of the diagnosis code (Only the ICD code e.g. B12.8  & not the definition)")
    predictive_reasoning: str = Field(description="The reason as to why the ICD code prediction was made.")

class Patient(BaseModel):
    """
    Represents a patient, containing information on multiple admissions.

    Input:
        __root__ (Dict[str, List[Admission]]): A dictionary where keys are patient identifiers and values are lists of 
                                               admissions related to the corresponding patient.

    Output:
        None: This is a data model and does not produce any outputs by itself.

    This model provides a structured representation of a patient that includes all of their admissions. It uses the
    Pydantic BaseModel to enforce type checking, ensuring that the admission data is well-formed and consistent.
    """

    __root__: Dict[str, List[Admission]]

def convert_dataframe_to_text(df):
    """
    Converts a DataFrame containing patient information into a nested dictionary format.

    Input:
        df (pd.DataFrame): A DataFrame containing patient records, including demographics, lab events, 
                        microbiology tests, pharmacy prescriptions, and admission details.

    Output:
        dict: A nested dictionary containing the information of each patient and their respective admissions.

    This function performs the following tasks:
        1. Converts textual columns to truncated form using a custom truncation function.
        2. Iterates over rows in the DataFrame to build a dictionary of patient data.
        3. Constructs detailed patient descriptions for demographics, lab events, microbiology tests, and pharmacy prescriptions.
    """
    logging.info("Commencing dataframe conversion to text")

    # Selecting text columns and truncating text data for better presentation
    text_columns = df.select_dtypes("object").columns.tolist()
    for col in text_columns:
        df[col] = df[col].apply(lambda x: truncate_text(x)).tolist()

    # Creating a nested dictionary for each subject, with detailed admission information
    all_subjects = {}
    for index, row in df.iterrows():
        # Constructing detailed text descriptions for patient demographics, lab events, microbiology, and pharmacy details
        subject_information = f"""Patient is a {"male" if row.is_male == 1 else "female"}, {row.anchor_age} years old."""
        labevents = f"The patient has occurred in {0 if pd.isna(row.labevents_count) else int(row.labevents_count)} laboratory events of which {0 if pd.isna(row.abnormal_events_count) else int(row.abnormal_events_count)} were abnormal. The following were commented on the patient's laboratory events::: {row.labevents_details}"
        microbiology = f"The patient has had {0 if pd.isna(row.microevent_count) else int(row.microevent_count)} microbiology tests. The following comments on the tests are as follows::: {row.microbiology_details} "
        pharmacy = f"The patient has been prescribed the following medicines by the pharmacy: {0 if pd.isna(row.pharmacy_count) else int(row.pharmacy_count)}. In more details::: {row.pharmacy_details}"

        # Building or updating the patient dictionary with detailed admission data
        if row.name[0] not in all_subjects.keys():            
            all_subjects[row.name[0]] = {
                "subject_details": subject_information,
                "admissions": [
                    {row.name[1]: {
                        "admission_details":row.admission_details,
                        "drg_details":row.drg_details,
                        "medication_details":row.medication_details,
                        "labevents_details":labevents,
                        "microbiology_details":microbiology,
                        "pharmacy_details":pharmacy,
                    }}
                    ]  
            }
        else:
            all_subjects[row.name[0]]["admissions"].append({
                row.name[1]: {
                        "admission_details":row.admission_details,
                        "drg_details":row.drg_details,
                        "medication_details":row.medication_details,
                        "labevents_details":labevents,
                        "microbiology_details":microbiology,
                        "pharmacy_details":pharmacy,
                    }
                })
    
    return all_subjects

def get_icd_llm(llm, training_sample, icd_definitions, subject_id, subject_details, admission_id, admission_details, drg_details, medication_details, labevents_details, microbiology_details, pharmacy_details):
    """
    Uses a language model (LLM) to predict the appropriate ICD diagnosis codes based on patient details and 
    associated clinical information, adhering strictly to predefined ICD definitions.

    Input:
        llm: A language model instance that supports structured output (LLM).
        training_sample (str): A sample input demonstrating the format for ICD code prediction.
        icd_definitions (str): The ICD code definitions and descriptions to be used for assigning diagnosis codes.
        subject_id (str): A unique identifier for the patient.
        subject_details (str): Information about the patient.
        admission_id (str): A unique identifier for the patient's admission.
        admission_details (str): Information regarding the patient's hospital admission.
        drg_details (str): Diagnosis-Related Group information for the patient.
        medication_details (str): Information about the patient's medications.
        labevents_details (str): Information about the patient's lab events.
        microbiology_details (str): Microbiology information related to the patient's case.
        pharmacy_details (str): Pharmacy-related details.

    Output:
        JSON object: A structured output that includes the predicted ICD codes for the patient based on the given data. 
        The output will contain 'subject_id', 'admission_id', 'icd_code', and 'predictive_reasoning' in JSON format.

    This function communicates with the LLM by formatting input data into a structured prompt. It uses retries in case the 
    LLM output is not in the correct format or does not include a valid ICD code.
    """

    # Define the role of the LLM as a clinical coder, providing the ICD definitions and a training example for context.
    # Set up the system's instructions and prompt format.
    
    system_role = f"""
    You are a proficient clinical coder. Your task is to review the provided notes and patient information and assign the appropriate ICD diagnosis codes strictly from the following ICD definitions: {icd_definitions}. Ensure that all diagnosis codes are from the provided list, following the guidelines exactly. You will receive patient details and associated clinical data, and your task is to assign the correct diagnosis code based on that information. Here is a training example for your reference: {training_sample}.
    """
    human_prompt = f"""
    Assign the correct ICD code based strictly on the data provided. Follow these rules:
    - Only use ICD codes from the list.
    - Ensure the code is under 6 characters.
    - Output the JSON as instructed.

    Patient Information:
    - ID: {subject_id}
    - Details: {subject_details}

    Admission Details:
    - Admission ID: {admission_id}
    - Notes: {admission_details}

    Other Clinical Data:
    - DRG: {drg_details}
    - Medication: {medication_details}
    - Lab Results: {labevents_details}
    - Microbiology: {microbiology_details}
    - Pharmacy: {pharmacy_details}
        """

    # Create a structured LLM interface with patient data, formatting the system and human prompts for the LLM to process.
    
    structured_llm = llm.with_structured_output(Patient, method="json_mode")
    prompt = ChatPromptTemplate(
        messages=[
            SystemMessage(content=system_role),
            HumanMessagePromptTemplate.from_template(
                human_prompt
            ),
        ],
    )
    _input = prompt.format_prompt(
        subject_id=subject_id,
        subject_details=subject_details, 
        admission_id=admission_id,
        admission_details=admission_details,
        drg_details=drg_details,
        medication_details=medication_details,
        labevents_details=labevents_details,
        microbiology_details=microbiology_details,
        pharmacy_details=pharmacy_details,
    )

    # Extract valid ICD codes from the provided definitions and initialize a retry loop to handle errors or invalid outputs.
    

    icd_def_codes = [word.replace("- ", "").split(":")[0] for word in icd_definitions.split('\n') if word.startswith("-")]
    success = False
    retries = 0
    while not success and retries < 5:
        try:
            # Construct the LLM prompt and invoke the model to get ICD code predictions in JSON format.
            base_prompt = f"""
            - Use the actual patient ID (e.g., "12874260.0") as the dictionary key.
            - The value for each patient ID should be a list of JSON objects, each containing:
            - 'admission_id': The admission ID
            - 'icd_code': The ICD diagnosis code
            - 'predictive_reasoning': A brief phrase highlighting relevant symptoms or keywords.
            {_input.to_messages()}
            """
            output = structured_llm.invoke(base_prompt)
            
            # Check if the output is a valid dictionary, and ensure the predicted ICD codes are from the predefined list.
            if type(output.__root__) is dict:
                codes = []
                for adm in output.__root__[f"{subject_id}"]:
                    codes.append(adm.icd_code)
                if any(x in icd_def_codes for x in codes):
                    success = True
                    print("Success")
                    return output
                else:
                    print("Didn't output a correct ICD code")
                    retries += 1
                    base_prompt = "IMPORTANT: ENSURE THAT YOU OUTPUT AN ICD CODE FROM THE DEFINITIONS BELOW. " + base_prompt
            else:
                print("Output was not of dictionary type")
                retries += 1
                base_prompt = base_prompt + "IMPORTANT: ENSURE THAT THE OUTPUT IS A DICTIONARY"
            
        except Exception as e:
            # Handle any exceptions that occur, logging errors and retrying if necessary.
            print(f"Retrying as output was not dict - Error: {e}")
            retries += 1
    
    # After 5 unsuccessful attempts, print an error message indicating the retry limit was exceeded.
    if not success:
        print("Tried more than 5 times ")

def get_output(training_sample, all_subjects, icd_definitions, project_dir, llm):
    """
    Generates ICD code predictions for patients using a language model and stores the output in a file.

    Input:
    - training_sample (list): A sample dataset used to train the LLM.
    - all_subjects (dict): A dictionary containing patient information, including admissions, diagnoses, and other details.
    - icd_definitions (str): A string with ICD code definitions, blocks, and hierarchical information.
    - project_dir (str): The directory where data is stored and output files are saved.
    - llm (object): A language model used to predict ICD codes based on patient data.

    Output:
    - pd.DataFrame: A DataFrame containing patient IDs, admission IDs, predicted ICD codes, and predictive reasoning.
    
    This function first checks if the predicted ICD data file already exists. If it doesn't, it initializes a DataFrame
    for storing the output. It then iterates through each patient and admission, using the language model to generate ICD
    code predictions along with reasoning, and appends this information to the DataFrame. The results are saved periodically
    to a temporary file and then to the final output file.
    """
    
    output_file = "llm_model_predicted_icd_data_temp.pkl"
    
    # Check if the output file already exists. If it does, load it; otherwise, initialize an empty DataFrame.
    if output_file in os.listdir(f"{project_dir}/data/altered_data"):
        print("Output file exists")
        df_output = pd.read_pickle(f"{project_dir}/data/altered_data/{output_file}")
    else:
        df_output = pd.DataFrame(columns=["subject_id", "admit_id", "predicted_icd", "predictive_reasoning"])
    
    iteration = 0
    
    # Iterate over each patient and their admissions. If predictions for a patient and admission already exist,
    # skip it; otherwise, call the LLM to predict ICD codes and reasoning.
    for i, (subj, info) in enumerate(all_subjects.items()):
        for admissions in info["admissions"]:
            for admit_id, admit_info in admissions.items():
                if (subj in df_output.subject_id.unique()) & (admit_id in df_output.admit_id.unique()):
                    print("Already outputted")
                    iteration += 1
                else:
                    # Call the LLM function to predict ICD codes and reasoning based on patient and admission details.
                    each_output = get_icd_llm(
                        llm=llm,
                        training_sample=training_sample, 
                        icd_definitions=icd_definitions, 
                        subject_id=subj, 
                        subject_details=info["subject_details"], 
                        admission_id=admit_id, 
                        admission_details=admit_info["admission_details"], 
                        drg_details=admit_info["drg_details"], 
                        medication_details=admit_info["medication_details"], 
                        labevents_details=admit_info["labevents_details"], 
                        microbiology_details=admit_info["microbiology_details"], 
                        pharmacy_details=admit_info["pharmacy_details"]
                        )

                    # Store the predicted ICD codes and reasoning in the output DataFrame.
                    codes = []
                    reasons = []
                    for adm in each_output.__root__[f"{subj}"]:
                        codes.append(adm.icd_code)
                        reasons.append(adm.predictive_reasoning)

                    df_output = df_output._append({
                        "subject_id": subj,
                        "admit_id": admit_id,
                        "predicted_icd": codes,
                        "predictive_reasoning": reasons
                        }, ignore_index=True)
                    
                    # Save the intermediate results to a temporary file to avoid data loss.
                    # df_output.to_pickle(f"{project_dir}/data/altered_data/llm_model_predicted_icd_data_temp.pkl")
                    iteration += 1
                
                # Log progress every 500 iterations.
                if iteration % 500 == 0:
                    logging.info(f"Iteration {i}/{len(all_subjects)}")
    
    # Save the final output to a persistent file and remove the temporary file.
    # df_output.to_pickle(f"{project_dir}/data/altered_data/llm_model_predicted_icd_data.pkl")
    # os.remove(f"{project_dir}/data/altered_data/llm_model_predicted_icd_data_temp.pkl")
    
    return df_output

In [54]:
sample_df = df.sample(1)
sample_text = convert_dataframe_to_text(sample_df)
for k, v in icd_code_mapping.items():
    if sample_df.icd_code_encoded.values[0] == v:
        sample_code = k
training_sample = """
Examples:
- Patient with heart failure and hypertension => ICD code: I110
- Patient with unexplained anemia symptoms => ICD code: D509
"""
with open(f"{proj_path}/data/definitions/icd_def_v2.txt", "r") as file:
    icd_definitions = file.read()

In [None]:
admissions_incorrectly_predicted = [22919017.0, 27856517.0, 21794773.0, 22874241.0, 27748832.0, 24862472.0, 28012650.0, 26700729.0, 23866567.0, 28208159.0]
all_subjects = convert_dataframe_to_text(df[np.in1d(df.index.get_level_values(1), admissions_incorrectly_predicted)])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(lambda x: truncate_text(x)).tolist()


In [61]:
subject_id = 12874260.0
subject_details = all_subjects[12874260.0]["subject_details"]
admission_id=27856517.0, 
admission_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["admission_details"], 
drg_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["drg_details"], 
medication_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["medication_details"], 
labevents_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["labevents_details"], 
microbiology_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["microbiology_details"], 
pharmacy_details=all_subjects[12874260.0]["admissions"][0][27856517.0]["pharmacy_details"]

In [74]:
system_role = f"""
You are a proficient clinical coder. Your task is to review the provided notes and patient information and assign the appropriate ICD diagnosis codes strictly from the following ICD definitions: {icd_definitions}. Ensure that all diagnosis codes are from the provided list, following the guidelines exactly. You will receive patient details and associated clinical data, and your task is to assign the correct diagnosis code based on that information. Here is a training example for your reference: {training_sample}.
"""
human_prompt = f"""
Assign the correct ICD code based strictly on the data provided. Follow these rules:
- Only use ICD codes from the list.
- Ensure the code is under 6 characters.
- Output the JSON as instructed.

Patient Information:
- ID: {subject_id}
- Details: {subject_details}

Admission Details:
- Admission ID: {admission_id}
- Notes: {admission_details}

Other Cliniczal Data:
- DRG: {drg_details}
- Medication: {medication_details}
- Lab Results: {labevents_details}
- Microbiology: {microbiology_details}
- Pharmacy: {pharmacy_details}
    """

# Create a structured LLM interface with patient data, formatting the system and human prompts for the LLM to process.

structured_llm = LLM.with_structured_output(Patient, method="json_mode")
prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(content=system_role),
        HumanMessagePromptTemplate.from_template(
            human_prompt
        ),
    ],
)
_input = prompt.format_prompt(
    subject_id=subject_id,
    subject_details=subject_details, 
    admission_id=admission_id,
    admission_details=admission_details,
    drg_details=drg_details,
    medication_details=medication_details,
    labevents_details=labevents_details,
    microbiology_details=microbiology_details,
    pharmacy_details=pharmacy_details,
)

# Extract valid ICD codes from the provided definitions and initialize a retry loop to handle errors or invalid outputs.


icd_def_codes = [word.replace("- ", "").split(":")[0] for word in icd_definitions.split('\n') if word.startswith("-")]
success = False
retries = 0

base_prompt = f"""
Respond in JSON format with the following structure:
- Use the actual patient ID (e.g., "12874260.0") as the dictionary key.
- The value for each patient ID should be a list of JSON objects, each containing:
  - 'admission_id': The admission ID
  - 'icd_code': The ICD diagnosis code
  - 'predictive_reasoning': A brief phrase highlighting relevant symptoms or keywords.
{_input.to_messages()}
"""
output = structured_llm.invoke(base_prompt)

In [75]:
output.__root__

{'12874260.0': [Admission(admission_id='27856517.0', icd_code='D509', predictive_reasoning='Unexplained anemia symptoms')]}

In [76]:
llm_output = get_output(training_sample, all_subjects, icd_definitions, project_dir=proj_path, llm=LLM)


Success
Already outputted


  df_output = df_output._append({


Success
Success
Success
Success
Success
Success
Success
Success
Success


In [77]:
llm_output["llm_predicted_icd_code_encoded"] = llm_output.predicted_icd.apply(lambda x: icd_code_mapping[x[0]])

output_df = pd.read_pickle(f"{proj_path}/data/altered_data/all_models_prediction_output.pkl")

output_df = output_df[output_df.hadm_id.isin(admissions_incorrectly_predicted)][["subject_id", "hadm_id", "icd_code_encoded"]].merge(
    llm_output.rename(columns={
        "predicted_icd":"llm_predicted_icd",
        "predictive_reasoning": "llm_predictive_reasoning"
    }),
    left_on=["subject_id", "hadm_id"], right_on=["subject_id", "admit_id"], how="left"
).drop("admit_id", axis=1)

In [81]:
test = output_df[["subject_id", "hadm_id", "icd_code_encoded", "llm_predicted_icd_code_encoded"]]
test["llm_inaccurate_preds"] = np.where(test.icd_code_encoded != test.llm_predicted_icd_code_encoded, 1, 0)
print(test.llm_inaccurate_preds.value_counts())
test

llm_inaccurate_preds
1    8
0    3
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["llm_inaccurate_preds"] = np.where(test.icd_code_encoded != test.llm_predicted_icd_code_encoded, 1, 0)


Unnamed: 0,subject_id,hadm_id,icd_code_encoded,llm_predicted_icd_code_encoded,llm_inaccurate_preds
0,12874260.0,27856517.0,0,0,0
1,14040440.0,23866567.0,1,0,1
2,16086123.0,26700729.0,1,2,1
3,19026279.0,24862472.0,1,0,1
4,12019744.0,22874241.0,1,2,1
5,15152711.0,27748832.0,2,0,1
6,18968810.0,22919017.0,2,0,1
7,12874260.0,27856517.0,2,0,1
8,14856789.0,28208159.0,2,3,1
9,15127661.0,21794773.0,2,2,0


In [33]:
output_df = pd.read_pickle(f"{proj_path}/data/altered_data/all_models_prediction_output.pkl")
output_df[output_df.hadm_id.isin(admissions_incorrectly_predicted)][["subject_id", "hadm_id", "icd_code_encoded", "llm_predicted_icd_code_encoded"]]

Unnamed: 0,subject_id,hadm_id,icd_code_encoded,llm_predicted_icd_code_encoded
360,12874260.0,27856517.0,0,4
3151,14040440.0,23866567.0,1,4
3183,16086123.0,26700729.0,1,2
3286,19026279.0,24862472.0,1,4
3750,12019744.0,22874241.0,1,2
5866,15152711.0,27748832.0,2,1
6041,18968810.0,22919017.0,2,4
6983,12874260.0,27856517.0,2,4
7049,14856789.0,28208159.0,2,3
7111,15127661.0,21794773.0,2,4
