## Preprocessing

In [None]:
import pandas as pd

In [None]:
# Load the candidates table: an output file from the SapBERT candidates generation step.
df = pd.read_excel('./sample_data/example_candidates_for_reranking.xlsx')

In [None]:
# Preview the first rows of the raw candidates table.
df.head()

In [None]:
# The output file from the candidates generation step contains some artifacts that we must remove.
# The column pandas labelled 'Unnamed: 0' is given the name 'code'.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that earlier
# cells have already displayed.
df = df.rename(columns={'Unnamed: 0': 'code'})

In [None]:
def cleanUp(str):
    """Return `str` with every 'tensor', '(' and ')' substring removed.

    Note: the parameter name shadows the built-in `str` inside this function.
    """
    cleaned = str
    # Order matters: 'tensor' is removed before the parentheses.
    for artifact in ('tensor', '(', ')'):
        cleaned = cleaned.replace(artifact, '')
    return cleaned

In [None]:
# Remove the `tensor(...)` wrappers from the code column and from every top-k column.
# cleanUp is used for 'code' as well: str.strip('tensor()') treats its argument as a set of
# characters, so it would also strip any leading/trailing t/e/n/s/o/r that belongs to the value.
for column_name in ('code', 'top_1', 'top_5', 'top_10', 'top_25'):
    df[column_name] = df[column_name].transform(cleanUp)

In [None]:
# Spot-check: show the rows whose cleaned code equals '276412008'.
df[df['code'] == '276412008']

In [None]:
# Group the candidate rows by their 'filename' column; the reranking loop iterates over these groups.
grouped_by_file_name = df.groupby('filename')

In [None]:
# Display the GroupBy object itself.
grouped_by_file_name

## Reranking

In [None]:
import tiktoken
from openai import OpenAI
import getpass

# Ask for the key interactively so it is never written into the notebook.
api_key = getpass.getpass('Enter your OpenAI API key:')

# Chat model used for reranking, and the client that infer() calls.
model = 'gpt-3.5-turbo-0125'
client = OpenAI(api_key=api_key)
# Directory of patient case texts; determine_best_candidates reads `<file_name>.txt` from it.
symptemist_dataset_text_files_path = '../symptemist-complete_240208/symptemist_test/subtask1-ner/txt'
# Must have two subdirs - json and dfs - for the raw and processed reranking output files.
# NOTE(review): '/output/path' looks like a placeholder - set it to a real directory before running.
output_path = '/output/path'

In [None]:
def infer(messages):
    """Send `messages` to the chat completions API and return the first choice's text.

    Uses the module-level `client` and `model`.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.4,
        max_tokens=4095,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def format_user_message(message):
    """Wrap `message` in a chat message dict carrying the 'user' role."""
    return dict(role='user', content=message)

In [None]:
import json

def extract_json_from_string(str):
    """Parse the JSON payload contained in a model reply.

    Accepts bare JSON, JSON wrapped in a Markdown code fence (with or without a
    `json` language tag), and a JSON object surrounded by extra prose.

    Args:
        str: The raw reply text. (The name shadows the built-in inside this function.)

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parsable JSON can be found in the text.
    """
    text = str.strip()
    if text.startswith('```'):
        # Remove the fence markers explicitly, then the optional language tag.
        # (str.strip('`json') would treat the argument as a character set.)
        text = text.strip('`').strip()
        if text[:4].lower() == 'json':
            text = text[4:]
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span so prose around the object is tolerated.
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])

In [None]:
def get_term_candidates_object_json(entity, candidates_list):
    """Render the query term and its candidates as a JSON object string for the prompt.

    Args:
        entity: The query term; it is converted to text and JSON-escaped.
        candidates_list: Either an already-rendered JSON array string (embedded as-is)
            or a Python sequence, which is serialised with json.dumps so the result
            uses double quotes and proper escaping rather than a Python repr.

    Returns:
        A string holding an object with the keys "query term" and "candidates".
    """
    # ensure_ascii=False keeps accented characters readable instead of \uXXXX escapes.
    query_term_json = json.dumps(f'{entity}', ensure_ascii=False)
    if isinstance(candidates_list, str):
        candidates_json = candidates_list
    else:
        candidates_json = json.dumps(candidates_list, ensure_ascii=False, default=repr)
    return f"""{{
  "query term": {query_term_json},
  "candidates": {candidates_json}
}}"""

In [None]:
# Quick visual check of the rendered query/candidates object.
print(get_term_candidates_object_json("ENTITY", '["C1", "C2"]'))

In [None]:
def get_prompt(patient_case_text, term, candidate_list):    
    """Build the reranking prompt for one query term.

    The prompt holds the task instructions, one worked example (case report,
    query/candidates object and expected answer object), and then the actual
    case text plus the query/candidates object rendered by
    get_term_candidates_object_json.

    Args:
        patient_case_text: Text of the patient case report, inserted verbatim.
        term: The query term to normalize.
        candidate_list: The candidate terms, passed on to get_term_candidates_object_json.

    Returns:
        The complete prompt as a single string.
    """
    # Doubled braces ({{ }}) are f-string escapes that render as literal braces.
    return f"""You are the AI assistant of a medical doctor. You are given:
- a patient case report text
- a query term
- a list of candidate terms

Your task is to normalize the query term to one of the candidate terms that best matches the meaning of the query term, considering the context of the patient case report text. Only choose a term from the given list of candidate terms.

Example patient case report:
```
Remitimos el caso de una paciente de 73 años que consulta por ictericia y síndrome constitucional de menos de 2 meses de evolución. En el Servicio de Urgencias se evidenció una gran hepatomegalia dolorosa y en las pruebas analíticas destacaba leucocitosis con neutrofilia y alteración mixta del perfil hepático de predominio colestásico (fosfatasa alcalina y GGT más de 10 veces el valor normal y ALT y AST menos de 3 veces el valor normal), sin insuficiencia hepática.

A las 24 horas del ingreso comenzó con un cuadro confusional que evolucionó rápidamente al coma y fiebre mayor de 39 ºC. Se realizaron ecografía y TAC abdominales, objetivando múltiples lesiones hepáticas ocupantes de espacio, hipodensas, sólidas y que se distribuían en ambos lóbulos ocupando prácticamente todo el órgano, sugerentes de MTS. No se encontró ningún foco infeccioso, varios hemocultivos fueron estériles y, mediante TAC, se había descartado la presencia de lesiones cerebrales.

Analíticamente se deterioró el perfil hepático presentando una importante elevación de transaminasas en el rango de hepatitis aguda (ALT y AST mayores de 20 veces el valor normal con importante aumento de la LDH) y datos de insuficiencia hepática con deterioro progresivo de la función renal.
A pesar de tratamiento intensivo con medidas antiencefalopatía, drogas vasoactivas, antibióticos de amplio espectro a dosis elevadas (para cubrir como posible foco el SNC) y transfusión de plasma fresco congelado, la paciente falleció a los 5 días del ingreso como consecuencia de un fallo multiorgánico, sin llegar a establecerse la naturaleza de las lesiones hepáticas ni su origen debido a la rápida evolución. Por este motivo se realizó la autopsia clínica.
En la necropsia se confirmó la existencia de MTS hepáticas extensas. El tumor primario fue un adenocarcinoma cecal de 3 x 2 cm de diámetro, estadio D de Dukes y IV de Astler-Coller. Presentaba infiltración de serosa y grasa perivascular e infiltración linfática y venular. MTS en ganglios locorregionales infradiafragmáticos y en parénquima pulmonar. Además se observaron lesiones de hepatitis isquémica asociada y colangiolitis y extensa autolisis pancreática y peripancreática.
```

Examples query and candidate terms:
```
{{
    "query term": "neutrophilia",
    "candidates": ["pseudoneutrofilia", "neutrofilia", "leucocitosis neutrofílica", "neutrofilia (hallazgo)"]
}}
```

Example result:
```
{{
    "answer": "neutrofilia"
}}
```

Now please normalize the following:

Patient case report: 
```
{patient_case_text}
```

Query term and candidate terms:
```
{get_term_candidates_object_json(term, candidate_list)}
```
"""

In [None]:
# Quick visual check of the full prompt using placeholder inputs.
print(get_prompt("CASE TEXT", "a", [1, 2, 3]))

In [None]:
def determine_best_candidates(file_name, terms, candidate_term_lists, candidate_code_lists):
    """Ask the model to pick the best candidate for every term of one patient case file.

    For each term a prompt is built from the case text and the term's candidates,
    the raw model reply is written to `{output_path}/json/{file_name}-{index}.json`,
    and the chosen candidate is mapped back to its code.

    Args:
        file_name: Case file name without extension; `<file_name>.txt` is read from
            `symptemist_dataset_text_files_path`.
        terms: The query terms found in this file.
        candidate_term_lists: One list of candidate terms per query term.
        candidate_code_lists: One list of candidate codes per query term, aligned
            position-by-position with the matching list of candidate terms.

    Returns:
        A tuple `(original_terms, best_candidate_terms, best_candidate_codes)` of three
        lists with one entry per processed term. When the reply cannot be parsed, has
        no "answer" key, or names a term that is not among the candidates, the entry is
        `None` in `best_candidate_terms` and `0` in `best_candidate_codes`.
    """
    # Read the case text first so the file handle is closed before the API calls start.
    with open(f'{symptemist_dataset_text_files_path}/{file_name}.txt', 'r', encoding='utf8') as patient_case_file:
        patient_case_text = patient_case_file.read()

    original_terms = []
    best_candidate_terms = []
    best_candidate_codes = []

    for term_index, (term, candidate_terms, candidate_codes) in enumerate(zip(terms, candidate_term_lists, candidate_code_lists)):
        prompt = get_prompt(patient_case_text, term, candidate_terms)
        messages = [format_user_message(prompt)]
        # The API may return no content; normalise to '' so the raw dump below cannot fail.
        result = infer(messages) or ''

        # Always keep the raw reply, even when it turns out to be unusable.
        with open(f'{output_path}/json/{file_name}-{term_index}.json', 'w', encoding='utf8') as out_file:
            out_file.write(result)

        original_terms.append(term)

        try:
            best_candidate_term = extract_json_from_string(result)['answer']
        except (ValueError, KeyError, TypeError):
            # One malformed reply must not discard the results already obtained for this file.
            print(f'Bad result. Could not parse an answer from the model response. Skipping {term} in file {file_name}.')
            best_candidate_terms.append(None)
            best_candidate_codes.append(0)
            continue

        if best_candidate_term not in candidate_terms:
            print(f'Bad result. No matching term in candidates list. Skipping {term} in file {file_name}.')
            best_candidate_terms.append(None)
            best_candidate_codes.append(0)
            continue

        best_candidate_terms.append(best_candidate_term)
        best_candidate_codes.append(candidate_codes[candidate_terms.index(best_candidate_term)])

    return original_terms, best_candidate_terms, best_candidate_codes

In [None]:
import ast
import os

# Resume support: a file that already has a TSV in `{output_path}/dfs` is not processed again.
# os.path.splitext removes exactly the extension. str.strip('.tsv') strips any of the characters
# '.', 't', 's', 'v' from both ends of the name, e.g. 'test-1.tsv' would become 'est-1'.
done = [
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(f'{output_path}/dfs')
    if file_name.endswith('.tsv')
]

# Columns holding the stringified candidate texts and codes that are reranked.
texts_column_name, codes_column_name = 'top_5_texts', 'top_5'

for group_file_name, group_data in grouped_by_file_name:
    if group_file_name in done:
        print(f'Skipping {group_file_name}')
        continue

    print(f'Processing {group_file_name}')

    # The candidate lists are stored as strings; ast.literal_eval turns them back into lists.
    entities = group_data['term'].tolist()
    top_texts_list = [ast.literal_eval(texts) for texts in group_data[texts_column_name]]
    top_codes_list = [ast.literal_eval(codes) for codes in group_data[codes_column_name]]

    original_entities, best_texts, best_codes = determine_best_candidates(group_file_name, entities, top_texts_list, top_codes_list)

    # Store the reranking outcome next to the original rows, one TSV per case file.
    result_df = group_data.copy(deep=True)
    result_df['original_entity'] = original_entities
    result_df['new_best_text'] = best_texts
    result_df['new_best_code'] = best_codes
    result_df.to_csv(f'{output_path}/dfs/{group_file_name}.tsv', sep='\t', encoding='utf8', index=False)

In [None]:
import os
# Load every per-file result table from `{output_path}/dfs`.
# Names are sorted so the combined table has a reproducible row order (os.listdir returns
# entries in arbitrary order), and entries that are not .tsv files are ignored.
result_dfs = [
    pd.read_csv(f'{output_path}/dfs/{df_file_name}', sep='\t', encoding='utf8')
    for df_file_name in sorted(os.listdir(f'{output_path}/dfs'))
    if df_file_name.endswith('.tsv')
]

In [None]:
# Stack all per-file result tables into one frame with a fresh consecutive index.
final = pd.concat(result_dfs, ignore_index=True)

In [None]:
# Save the combined results as a TSV file.
# NOTE(review): this writes to the current working directory, while the Excel exports
# further down go to output_path - confirm that is intended.
final.to_csv('reranking_results.tsv', sep='\t', encoding='utf8', index=False)

In [None]:
# Convert both code columns to strings.
for code_column in ('code', 'new_best_code'):
    final[code_column] = final[code_column].astype(str)

In [None]:
# Export only the `code` and `new_best_code` columns, side by side, to Excel.
final[['code', 'new_best_code']].to_excel(f'{output_path}/reranking_result_codes.xlsx', index=False)

In [None]:
# Export the complete reranking table to Excel.
final.to_excel(f'{output_path}/reranking_results.xlsx', index=False)