In [None]:
pip install --upgrade --quiet langchain-core langchain langchain-openai

In [None]:
import os
from pydantic import BaseModel
from openai import AzureOpenAI

# Credentials and endpoint: prefer values already exported in the environment so
# that real secrets never have to be pasted into the notebook. The placeholders
# are only used as a fallback when nothing is exported.
api_key = os.environ.get("AZURE_OPENAI_API_KEY") or "<key>"
api_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") or "https://<instance>.openai.azure.com/"

# An empty OPENAI_API_KEY is the switch that selects the Azure deployment when
# the chat model is created further down.
os.environ["OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = api_endpoint

# Model (used as the Azure deployment name) and API version.
model_name = "gpt-4o"
api_version = '2024-12-01-preview'

In [None]:
# Language the prompt is parameterized with.
lang = "Greek"

# Prompt template plus the example resources (few-shot text, example note and
# the entity list belonging to that note).
prompt_file = "prompts/prompt_lang_parameter.txt"
example_file, example_patient_text_file, example_entities_file = (
    "prompts/example_greek.txt",
    "prompts/example_file_greek.txt",
    "prompts/example_entities_greek.txt",
)

# Input dataset and the two output files; the output names embed the model name.
dev_dataset_path = 'elcardiocc/dev_dataset.tsv'
predicted_results_path, predicted_entities_path = (
    f'predictions/elcardiocc_dev_dataset_full_linking_{model_name}_zero.tsv',
    f'predictions/elcardio_predicted_entities_full_linking_{model_name}_zero.tsv',
)

In [None]:
from langchain_openai import OpenAI

In [None]:
from ast import literal_eval

# Marker placed around every entity mention, and the name the prompt uses for it.
delimiter = '[TERM]'
delimiter_name = "special tokens"

# Start from empty values so the names exist even if a read below fails.
prompt = examples = patient_text = ''
ner_entities = []


def _read_utf8(path):
    """Return the whole content of the UTF-8 text file at `path`."""
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read()


prompt = _read_utf8(prompt_file)

# The few-shot examples are read (with the delimiter placeholder substituted)
# and then discarded, because this notebook runs the zero-shot setting.
examples = _read_utf8(example_file).replace('{delimiter}', delimiter)
examples = ""  # zero shot

patient_text = _read_utf8(example_patient_text_file)

# The entities file holds a Python literal, parsed safely with literal_eval.
ner_entities = literal_eval(_read_utf8(example_entities_file))

In [None]:
def prepare_text(text, entities, marker=None):
    """Wrap every entity span of `text` in a marker string.

    Args:
        text: The source text.
        entities: Iterable of dicts with integer 'start' and 'end' character
            offsets into `text` (end exclusive). Order does not matter.
        marker: String placed before and after each span. When None, the
            notebook-level `delimiter` is used, so two-argument calls keep
            working unchanged.

    Returns:
        A copy of `text` with each entity span surrounded by the marker.

    Entities are processed in order of their start offset. A span that lies
    entirely inside an already marked span is skipped, and a span that partly
    overlaps the previous one is marked only from where the previous one ended,
    so no character of `text` is ever emitted twice.
    """
    tag = delimiter if marker is None else marker

    text_parts = []
    last_entity_end = 0
    # Longest span first among equal starts, so nested mentions are the ones dropped.
    for entity in sorted(entities, key=lambda item: (item['start'], -item['end'])):
        start, end = entity['start'], entity['end']
        if start < last_entity_end:
            if end <= last_entity_end:
                continue  # fully covered by the previous span
            start = last_entity_end  # keep only the part not yet emitted
        text_parts.append(text[last_entity_end:start])
        text_parts.append(f"{tag}{text[start:end]}{tag}")
        last_entity_end = end

    text_parts.append(text[last_entity_end:])
    return "".join(text_parts)

In [None]:
# Preview the example note with every example entity wrapped in the delimiter.
prepare_text(patient_text, ner_entities)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

# Turn the raw prompt text read from disk into a reusable chat prompt template.
tagging_prompt = ChatPromptTemplate.from_template(prompt)

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain.globals import set_verbose
from langchain_openai import ChatOpenAI

# Generation settings shared by both back-ends.
common_llm_kwargs = dict(temperature=0.5, max_tokens=6000, timeout=None, max_retries=2)

# An empty OPENAI_API_KEY selects the Azure deployment; otherwise OpenAI is used directly.
use_azure = os.environ["OPENAI_API_KEY"] == ''
chat_model = (
    AzureChatOpenAI(azure_deployment=model_name, api_version=api_version, **common_llm_kwargs)
    if use_azure
    else ChatOpenAI(model=model_name, **common_llm_kwargs)
)

# Both back-ends are wrapped the same way to request JSON-mode structured output.
llm = chat_model.with_structured_output(method="json_mode")

In [None]:
# Mark the example entities in the example note and render the template with it.
prep_text = prepare_text(patient_text, ner_entities)
template_inputs = {
    "clinical_text": prep_text,
    "language": lang,
    "language_lower": lang.lower(),
    "examples": examples,
    "delimiter": delimiter,
    "delimiter_name": delimiter_name,
}
# NOTE: this rebinds `prompt` from the raw template text to the rendered prompt value.
prompt = tagging_prompt.invoke(template_inputs)
prompt

In [None]:
# Send the rendered example prompt to the model and display its reply.
response = llm.invoke(prompt)
response

In [None]:
import pandas as pd
# Load the tab-separated development set and preview its first rows.
dev_dataset = pd.read_csv(dev_dataset_path, sep='\t')
dev_dataset.head()

In [None]:
from ast import literal_eval

# Parse the serialized annotation literals. Values that are no longer strings
# (this cell was already run on the live frame) are left as they are, because
# literal_eval raises ValueError when handed an already parsed list.
dev_dataset['annotations'] = dev_dataset['annotations'].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)

# Empty string marks "no model response yet"; the prediction loop fills it in.
dev_dataset['response'] = ''

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Template inputs that are the same for every row.
shared_inputs = {
    "language": lang,
    "language_lower": lang.lower(),
    "examples": examples,
    "delimiter": delimiter,
    "delimiter_name": delimiter_name,
}

for index, row in tqdm(dev_dataset.iterrows(), total=dev_dataset.shape[0]):
    # Only rows that still have the empty placeholder are sent to the model,
    # so an interrupted run can be resumed by re-running this cell.
    if row['response'] == '':
        marked_text = prepare_text(row['text'], row['annotations'])
        prompt = tagging_prompt.invoke({"clinical_text": marked_text, **shared_inputs})
        dev_dataset.at[index, 'response'] = llm.invoke(prompt)

In [None]:
# Preview the first rows, now including the `response` column.
dev_dataset.head()

In [None]:
# Rows whose response is still the empty placeholder, i.e. not yet answered.
dev_dataset[dev_dataset['response']=='']

In [None]:
# Save the dataset together with the collected responses as TSV, without the index.
dev_dataset.to_csv(predicted_results_path, index=False, sep='\t')

In [None]:
# (rows, columns) of the dataset.
dev_dataset.shape

In [None]:
# Spot-check the text at positional row 32; raises IndexError if the dataset
# has fewer than 33 rows.
dev_dataset['text'].iloc[32]

In [None]:
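# Post-processing: locate each predicted term in its source text and map the predicted codes onto the annotated spans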

In [None]:
def get_occurrences(term, text):
    """Return the start index of every occurrence of `term` in `text`.

    Overlapping occurrences are included, because the search resumes one
    character after each hit. An empty `term` yields an empty list instead of
    one "match" at every position of `text`.

    Args:
        term: Substring to look for (matched exactly, case-sensitively).
        text: String to search in.

    Returns:
        List of integer start offsets in ascending order.
    """
    if not term:
        return []

    occurrences = []
    position = text.find(term)
    while position != -1:
        occurrences.append(position)
        position = text.find(term, position + 1)
    return occurrences

In [None]:
import re

lang_lower = lang.lower()
entities_list = []
last_end_index = 0


def _fold_case(value):
    """Upper-case `value` character by character without changing its length.

    `str.upper()` maps a few characters to several code points, which would
    shift every offset found in the upper-cased text relative to the original.
    Such characters are kept unchanged here, so an index into the folded
    string is also a valid index into `value`.
    """
    return ''.join(ch.upper() if len(ch.upper()) == 1 else ch for ch in value)


# Build one list of located entities per answered row. Exactly one list is
# appended for every row with a non-empty response (even if nothing usable was
# returned), so entities_list stays in step with the answered rows.
for _, row in dev_dataset.iterrows():
    if row['response'] == '':
        continue

    ents = row['response']
    text = row['text']

    if not isinstance(ents, dict) or not ents:
        print(ents)  # unusable reply: show it and record no entities
        entities_list.append([])
        continue

    # Several top-level keys: the reply is a single entity. One key: the reply
    # wraps the collection of entities under that key.
    keys = list(ents.keys())
    if len(keys) > 1:
        entities = [ents]
    else:
        entities = ents[keys[0]]
    if not isinstance(entities, (list, tuple)):
        print(ents)
        entities = []

    folded_text = _fold_case(text)
    entity_list = []
    for ent in entities:
        if not isinstance(ent, dict):
            print(row['response'])
            continue

        raw_term = ent.get(f'medical_term_{lang_lower}')
        code = ent.get('icd10_code')
        if not isinstance(raw_term, str) or code is None:
            print(row['response'])  # entity without a usable term or code
            continue

        # The model may echo the delimiter around the term; drop it.
        term = raw_term.replace(delimiter, '')
        if not term:
            continue

        # Case-insensitive search; the folded strings keep the original
        # lengths, so the offsets index `text` directly.
        for start in get_occurrences(_fold_case(term), folded_text):
            entity_list.append({
                'filename': row['id'],
                'ann_id': 'ICD',
                'label': 'ICD',
                'start_span': start,
                'end_span': start + len(term),
                'text': term,
                'code': code
            })

    entities_list.append(entity_list)
    

In [None]:
# Number of per-row entity lists collected (one per row that had a response).
len(entities_list)

In [None]:
merged_list = []

# entities_list holds one entry per row with a non-empty response, in row
# order. Pair the two by position: indexing entities_list with the DataFrame
# index points at the wrong entry as soon as a single row was skipped.
answered_rows = [row for _, row in dev_dataset.iterrows() if row['response'] != '']
if len(answered_rows) != len(entities_list):
    raise ValueError(
        f"{len(answered_rows)} answered rows but {len(entities_list)} entity lists; "
        "re-run the response parsing cell"
    )

for row, pred_list in zip(answered_rows, entities_list):
    true_list = row['annotations']
    text = row['text']

    # Per-character map of the predicted code ('' where nothing was predicted).
    code_indices = [''] * len(text)
    for entity in pred_list:
        # Clamp to the text length so an out-of-range span cannot raise IndexError.
        for i in range(entity['start_span'], min(entity['end_span'], len(text))):
            code_indices[i] = entity['code']

    merged_pred_list = []
    for entity in true_list:
        start = entity['start']
        end = entity['end']
        # The first coded character inside the annotated span decides its code.
        for i in range(start, min(end, len(text))):
            if code_indices[i] != '':
                # Work on a copy so the annotations stored in dev_dataset are not modified.
                merged_pred_list.append({
                    **entity,
                    'filename': row['id'],
                    'ann_id': 'ICD',
                    'label': 'ICD',
                    'text': text[start:end],
                    'code': code_indices[i],
                })
                break

    merged_list.append(merged_pred_list)

In [None]:
import pandas as pd

# Flatten the per-row lists in a single pass; sum(list_of_lists, []) copies the
# growing result on every addition.
flat_records = [entity for row_entities in merged_list for entity in row_entities]

df_entities_list = (
    pd.DataFrame.from_records(flat_records)
    .drop_duplicates()
    .rename(columns={'start': 'start_span', 'end': 'end_span'})
)

# Keep only the first three characters of each code
# (presumably the ICD-10 category level; confirm against the evaluation setup).
df_entities_list['code'] = df_entities_list['code'].map(lambda code: str(code)[:3])
df_entities_list.head()

In [None]:
# Write the prediction columns, in this fixed order, as TSV without the index.
df_entities_list[['filename','ann_id','label','start_span','end_span','text','code']].to_csv(predicted_entities_path, sep='\t', index=False)

In [None]:
# Combine predictions: fill the empty dictionary direct-match codes with the LLM predictions

In [None]:
# Load the dictionary direct-match predictions and add the column names used by
# the entity table. keep_default_na=False stops pandas from turning empty cells
# into NaN, so an empty predicted_code stays ''.
# NOTE(review): this reads a "train" file while the rest of the notebook works
# on the dev set; confirm this is the intended file.
df_dict = pd.read_csv(
    'elcardiocc/train_subtask2_direct_match_predictions.tsv',
    sep='\t',
    keep_default_na=False,
).assign(
    filename=lambda frame: frame['id'],
    start_span=lambda frame: frame['start'],
    end_span=lambda frame: frame['end'],
)
df_dict.head()

In [None]:
# Index the LLM predictions by (filename, start, end) once, instead of scanning
# the whole entity frame for every dictionary row. setdefault keeps the first
# code seen for a span, matching "take the first matching row".
llm_code_by_span = {}
for filename, start, end, code in zip(
    df_entities_list['filename'],
    df_entities_list['start_span'],
    df_entities_list['end_span'],
    df_entities_list['code'],
):
    llm_code_by_span.setdefault((filename, start, end), code)

for index, row in df_dict.iterrows():
    # Rows that already have a dictionary code keep it.
    if row['predicted_code'] != '':
        continue
    span_key = (int(row['filename']), int(row['start_span']), int(row['end_span']))
    if span_key in llm_code_by_span:
        df_dict.at[index, 'predicted_code'] = llm_code_by_span[span_key]

In [None]:
# Preview the dictionary table after the empty codes were filled in.
df_dict.head()

In [None]:
# Add the constant annotation columns, copy the mention into `text`, and keep
# the original code under the name `true_code`.
df_dict = (
    df_dict
    .assign(ann_id='ICD', label='ICD', text=lambda frame: frame['mention'])
    .rename(columns={'code': 'true_code'})
)

export_columns = ['filename', 'ann_id', 'label', 'start_span', 'end_span', 'text', 'predicted_code']
# NOTE(review): predicted_entities_path already ends in ".tsv", so the combined
# file name contains ".tsv" twice; kept unchanged so existing consumers still find it.
combined_path = f'{predicted_entities_path}_dict_cat.tsv'
(
    df_dict[export_columns]
    .rename(columns={'predicted_code': 'code'})
    .to_csv(combined_path, sep='\t', index=False)
)