### Local LLM - RAG pipeline

In [None]:
import ast
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Load the tab-separated candidates file (first line is the header).
# 'ID' and 'sourceCode' are forced to str so pandas does not infer a numeric dtype for them.
sigtap_meds = pd.read_csv('<Filtered candidates>.csv', sep='\t',
                      header=0, converters={'ID': str, 'sourceCode': str})

In [None]:
def safe_literal_eval(x):
    """Turn a stringified Python literal from a CSV cell back into an object.

    Returns:
        The evaluated literal; ``None`` when ``x`` is missing (NaN/None);
        ``[]`` when evaluation fails with ``ValueError`` or ``SyntaxError``.
    """
    try:
        # The missing-value check stays inside the try block on purpose: a
        # ValueError raised by the check itself is also mapped to [].
        if pd.notna(x):
            return ast.literal_eval(x)
        return None
    except (ValueError, SyntaxError):
        return []

In [None]:
# Both columns are stored in the CSV as stringified Python literals;
# parse them back into Python objects (with a progress bar).
sigtap_meds['conceptId'] = sigtap_meds['conceptId'].progress_apply(safe_literal_eval)
sigtap_meds['nearest_concepts'] = sigtap_meds['nearest_concepts'].progress_apply(safe_literal_eval)

In [None]:
# Notebook display: inspect the frame after the literal columns were parsed.
sigtap_meds

In [None]:
from llama_cpp import Llama, LlamaGrammar

In [None]:
# Load one local GGUF model; the alternatives are kept commented out for quick switching.
# NOTE(review): the active model here is Mistral OpenOrca, while query_llm currently
# builds the LLaMA-style prompt (the Mistral OpenOrca prompt is commented out there) —
# confirm the model and the prompt template are switched together.
# llm = Llama(model_path="./path_to_model/llama-2-7b-chat.Q8_0.gguf", n_ctx=4096, n_gpu_layers=32)
# llm = Llama(model_path="./path_to_model/llama-2-13b-chat.Q6_K.gguf", n_ctx=4096, n_gpu_layers=36)
llm = Llama(model_path="./path_to_model/mistral-7b-openorca.Q8_0.gguf", n_ctx=4096, n_gpu_layers=32)

In [None]:
# Load the GBNF grammar file that is later passed as `grammar=` to the local model call.
grammar = LlamaGrammar.from_file("./path_to_grammar/json_no_numbers.gbnf")

In [None]:
import json

In [None]:
from thefuzz import fuzz

In [None]:
def calculate_fuzzy_distance(str1, str2):
    """Return the ``fuzz.ratio`` score of the two strings, ignoring case.

    Despite the name, callers sort by this value in descending order, i.e.
    they treat a larger score as a closer match.
    """
    first_lowered = str1.lower()
    second_lowered = str2.lower()
    return fuzz.ratio(first_lowered, second_lowered)

In [None]:
def rank_candidates(row):
    """Re-rank a row's retrieved candidates by fuzzy similarity to ``row['Name']``.

    Args:
        row: A DataFrame row providing ``Name``, ``nearest_concepts`` (a
            sequence of candidate pairs, or a missing value) and ``conceptId``
            (the expected ids, possibly empty/None).

    Returns:
        tuple: ``(ranked, matched)`` where ``ranked`` holds at most 50
        ``(candidate[1], candidate[0], score)`` triples sorted by descending
        score, and ``matched`` is True when any ranked ``candidate[1]`` value
        is present in ``row.conceptId``.
    """
    name = row['Name']
    candidates = row['nearest_concepts']

    # safe_literal_eval yields None for empty cells; treat a missing candidate
    # list as "no candidates" instead of failing while iterating over it.
    if not isinstance(candidates, (list, tuple)):
        candidates = []

    # MPNET embeddings
    # distances = [(candidate[0], candidate[1], calculate_fuzzy_distance(name, candidate[1])) for candidate in candidates]
    # GTE embeddings (the two pair elements are swapped relative to the MPNET layout above)
    distances = [(candidate[1], candidate[0], calculate_fuzzy_distance(name, candidate[0])) for candidate in candidates]

    # Higher ratio means a closer string, hence the descending sort; keep the top 50.
    sorted_distances = sorted(distances, key=lambda x: x[2], reverse=True)[:50]

    sorted_candidates = [candidate[0] for candidate in sorted_distances]

    # An empty or None conceptId can never match.
    ranked_matched = bool(set(sorted_candidates).intersection(set(row.conceptId))) if row.conceptId else False

    return (sorted_distances, ranked_matched)

In [None]:
# rank_candidates returns a 2-tuple per row; result_type='expand' spreads it over the two new columns.
sigtap_meds[['ranked_concepts', 'ranked_matched']] = sigtap_meds.progress_apply(rank_candidates, axis=1, result_type='expand')

In [None]:
# Count the rows whose re-ranked candidates contain one of the row's conceptIds.
print('Number of matches (filtered candidates) = ', (sigtap_meds.ranked_matched == True).sum())

In [None]:
def query_llm(row):
    """Ask the local model which ranked candidate is closest to the row's name.

    Rows whose ``ranked_matched`` flag is falsy are skipped. For the others,
    up to 50 entries of ``row.ranked_concepts`` are rendered one per line as
    ``"<item 0> - <item 1>"`` and embedded in the prompt together with the
    stripped ``row.Name``.

    Returns:
        str: the generated text of the first choice; ``""`` for skipped rows;
        the sentinel ``"Exception: context too long"`` when the model call
        raised an exception.
    """
    if not row.ranked_matched:
        return ""
    candidate_list = '\n'.join([f"{concept[0]} - {concept[1]}" for concept in row.ranked_concepts][:50])
    # LLaMA
    query = f"""[INST] <<SYS>>
You are a helpful assistant.
It does not matter whether the given term is a valid or complete medical term, your task is finding the closest one in the list.
Return your answer in JSON with the format {{"answer": "<numeric code>"}}
Copy the exact code in the answer.
<</SYS>>
Q: The given term "{row.Name.strip()}" is closest to which of the following alternatives:
{candidate_list}
A:
[/INST]"""
    # Mistral OpenOrca
#     query = f"""<|im_start|>system
# You are a helpful assistant.
# It does not matter whether the given term is a valid or complete medical term, your task is finding the closest one in the list.
# Return your answer in JSON with the format {{"answer": "<numeric code>"}}
# Copy the exact code in the answer.<|im_end|>
# <|im_start|>user
# Q: The given term "{row.Name.strip()}" is closest to which of the following alternatives:
# {candidate_list}<|im_end|>
# <|im_start|>assistant"""
    try:
        # Greedy decoding (temperature=0) constrained by the module-level grammar.
        output = llm(query, max_tokens=32, grammar=grammar, echo=False, temperature=0)
    except Exception:
        # Only ordinary errors are turned into the sentinel; KeyboardInterrupt and
        # SystemExit propagate so a long-running apply can still be stopped.
        # NOTE(review): the sentinel text assumes the failure is a context
        # overflow, but any exception from the call ends up here.
        return "Exception: context too long"
    return output['choices'][0]['text']

In [None]:
# Query the local model once per row and store the raw generated text.
sigtap_meds['llm_output'] = sigtap_meds.progress_apply(query_llm, axis=1)

In [None]:
# Notebook display: inspect the frame with the raw local-model outputs.
sigtap_meds

In [None]:
import json
import re

# A run of six or more digits delimited by word boundaries; used to pull a code out of free-form output.
pattern = r'\b(\d{6,})\b'

In [None]:
def parse_llm_output(row):
    """Check whether the code answered by the LLM is one of the row's conceptIds.

    The code is read from the ``"answer"`` field when ``row.llm_output`` is a
    JSON object. When that does not yield a numeric string, the first match
    of the module-level ``pattern`` in the raw output is used instead.

    Returns:
        bool: True when the extracted code is contained in ``row.conceptId``;
        False when the output is empty or not a string, when no code can be
        extracted, or when ``row.conceptId`` is empty/None.
    """
    raw_output = row.llm_output
    # Skipped rows carry ""; non-string values (e.g. NaN) cannot be parsed either.
    if not isinstance(raw_output, str) or not raw_output:
        return False

    try:
        parsed = json.loads(raw_output)
    except ValueError:
        # Not valid JSON: leave it to the regex fallback below.
        parsed = None

    llm_concept = parsed.get('answer') if isinstance(parsed, dict) else ''

    # An answer wrapped in a nested object: use its first value, if any.
    if isinstance(llm_concept, dict):
        llm_concept = next(iter(llm_concept.values()), '')

    # A code emitted as a JSON number is compared in its string form.
    if isinstance(llm_concept, int) and not isinstance(llm_concept, bool):
        llm_concept = str(llm_concept)

    # Anything else (missing key -> None, lists, floats, ...) is not a usable
    # code; fall through to the regex search instead of failing on .isnumeric().
    if not isinstance(llm_concept, str):
        llm_concept = ''

    if not llm_concept.isnumeric():
        llm_match = re.search(pattern, raw_output)
        if llm_match is None:
            return False
        llm_concept = llm_match.group(0)

    # An empty or None conceptId can never match.
    return bool({llm_concept}.intersection(row.conceptId)) if row.conceptId else False

In [None]:
# Flag the rows where the code extracted from the model output is one of the row's conceptIds.
sigtap_meds['llm_match'] = sigtap_meds.progress_apply(parse_llm_output, axis=1)

In [None]:
# Count the rows flagged as a correct LLM answer.
print('Number of matches (LLM candidates) = ', (sigtap_meds.llm_match == True).sum())

In [None]:
# Despite its name, 'has_match' holds the captured digit string (NaN when the pattern does not match),
# not a boolean.
sigtap_meds['has_match'] = sigtap_meds['llm_output'].str.extract(pattern)

In [None]:
# Count the outputs in which the code pattern was found at all.
print('Number of pattern matches = ', sigtap_meds.has_match.notna().sum())

In [None]:
# Persist the local-model results as a tab-separated file without the index column.
# NOTE(review): the file name mentions llama_13b, but the model loaded above is
# mistral-7b-openorca — confirm the name reflects the model actually used.
sigtap_meds.to_csv('sigtap_meds_llama_13b_notemp_grammar.csv', sep='\t', index=False)

In [None]:
# Collect the raw outputs that contain an "answer" key and report how many there are.
json_answers = [text for text in sigtap_meds['llm_output'] if '"answer"' in text]
print(len(json_answers))

### Remote LLM - MariTalk API

In [None]:
import maritalk

In [None]:
import time

In [None]:
# MariTalk API client; the key below is a placeholder and must be replaced before running.
model = maritalk.MariTalk(key="<MARITALK API KEY>")

In [None]:
def query_external_llm(row):
    """Ask the MariTalk model which ranked candidate is closest to the row's name.

    Rows whose ``ranked_matched`` flag is falsy are skipped without calling
    the API (and without sleeping). For the others, up to 50 entries of
    ``row.ranked_concepts`` are rendered one per line as
    ``"<item 0> - <item 1>"`` and embedded in the prompt.

    Returns:
        Whatever ``model.generate`` returns; ``""`` for skipped rows; the
        sentinel ``"Exception: context too long"`` when the call raised.
    """
    if not row.ranked_matched:
        return ""
    candidate_list = '\n'.join([f"{concept[0]} - {concept[1]}" for concept in row.ranked_concepts][:50])
    query = f"""You are a helpful assistant.
It does not matter whether the given term is a valid or complete medical term, your task is finding the closest one in the list.
Return your answer in JSON with the format {{"answer": "<numeric code>"}}
Copy the exact code in the answer.
Q: The given term "{row.Name.strip()}" is closest to which of the following alternatives:
{candidate_list}
A:"""
    # External API rate limit = 5 seconds
    time.sleep(5)
    try:
        output = model.generate(query, max_tokens=32, do_sample=False)
    except Exception:
        # Only ordinary errors are turned into the sentinel; KeyboardInterrupt and
        # SystemExit propagate so the (slow, rate-limited) apply can be stopped.
        # NOTE(review): the sentinel text assumes a context overflow, but any
        # API error (network, auth, quota) ends up here as well.
        return "Exception: context too long"
    return output

In [None]:
# Query MariTalk once per row; this overwrites the 'llm_output' column produced by the local model.
sigtap_meds['llm_output'] = sigtap_meds.progress_apply(query_external_llm, axis=1)

In [None]:
# Despite its name, 'has_match' holds the captured digit string (NaN when the pattern does not match).
sigtap_meds['has_match'] = sigtap_meds['llm_output'].str.extract(pattern)

In [None]:
# Count the outputs in which the code pattern was found at all.
print('Number of pattern matches = ', sigtap_meds.has_match.notna().sum())

In [None]:
def parse_llm_output(row):
    """Check whether the code answered by the LLM is one of the row's conceptIds.

    The code is read from the ``"answer"`` field when ``row.llm_output`` is a
    JSON object (string, JSON number, or a nested object whose first value is
    used). When that does not yield a numeric string, the first match of the
    module-level ``pattern`` in the raw output is used instead.

    Returns:
        bool: True when the extracted code is contained in ``row.conceptId``;
        False when the output is empty or not a string, when no code can be
        extracted, or when ``row.conceptId`` is empty/None.
    """
    raw_output = row.llm_output
    # Skipped rows carry ""; non-string values (e.g. NaN) cannot be parsed either.
    if not isinstance(raw_output, str) or not raw_output:
        return False

    try:
        parsed = json.loads(raw_output)
    except ValueError:
        # Not valid JSON: leave it to the regex fallback below.
        parsed = None

    llm_concept = parsed.get('answer') if isinstance(parsed, dict) else ''

    # Unwrap a nested object first, so that a numeric value inside it is
    # still converted to a string by the next step.
    if isinstance(llm_concept, dict):
        llm_concept = next(iter(llm_concept.values()), '')

    # A code emitted as a JSON number is compared in its string form.
    if isinstance(llm_concept, int) and not isinstance(llm_concept, bool):
        llm_concept = str(llm_concept)

    # Anything else (missing key -> None, lists, floats, ...) is not a usable
    # code; fall through to the regex search instead of failing on .isnumeric().
    if not isinstance(llm_concept, str):
        llm_concept = ''

    if not llm_concept.isnumeric():
        llm_match = re.search(pattern, raw_output)
        if llm_match is None:
            return False
        llm_concept = llm_match.group(0)

    # An empty or None conceptId can never match.
    return bool({llm_concept}.intersection(row.conceptId)) if row.conceptId else False

In [None]:
# Flag the rows where the code extracted from the MariTalk output is one of the row's conceptIds.
sigtap_meds['llm_match'] = sigtap_meds.progress_apply(parse_llm_output, axis=1)

In [None]:
# Compare correct LLM answers against the number of rows where a correct answer was reachable.
print('Number of matches (LLM candidates) = ', (sigtap_meds.llm_match == True).sum())
print('Ranked matches = ', (sigtap_meds.ranked_matched == True).sum())

In [None]:
# Persist the MariTalk results as a tab-separated file without the index column.
sigtap_meds.to_csv('sigtap_meds_maritalk_gte_json.csv', sep='\t', index=False)

In [None]:
# Notebook display: inspect the frame with the MariTalk outputs.
sigtap_meds

### Remote LLM - OpenAI GPT-4

In [None]:
from langchain.chat_models.openai import ChatOpenAI
from langchain.chains import LLMChain

In [None]:
# Chat client for GPT-4 with deterministic sampling and a 32-token reply cap; the key is a placeholder.
# NOTE(review): this rebinds the global `llm` that query_llm also uses for the local
# llama_cpp model — re-running query_llm after this cell would call this client instead.
llm = ChatOpenAI(openai_api_key="<OPENAI API KEY>", temperature=0.0, model='gpt-4-1106-preview', max_tokens=32) # gpt-4 128k

In [None]:
# Load the tab-separated subset file; only its 'ID' column is used below to filter sigtap_meds.
# 'ID' and 'sourceCode' are forced to str so pandas does not infer a numeric dtype for them.
meds_subset = pd.read_csv('<Candidates subset>.csv', sep='\t',
                      header=0, converters={'ID': str, 'sourceCode': str})

In [None]:
# Select only subset
sigtap_meds = sigtap_meds[sigtap_meds['ID'].isin(meds_subset['ID'])].copy()

In [None]:
# Notebook display: inspect the frame after restricting it to the subset.
sigtap_meds

In [None]:
# Report match counts within the subset.
# NOTE(review): no cell in this notebook creates a 'matched' column — presumably it
# comes from the input CSV; confirm.
print('Number of matches = ', (sigtap_meds.matched == True).sum())
print('Number of matches (ranked) = ', (sigtap_meds.ranked_matched == True).sum())

In [None]:
def query_external_llm(row):
    """Ask the GPT-4 chat model which ranked candidate is closest to the row's name.

    Rows whose ``ranked_matched`` flag is falsy are skipped without calling
    the API (and without sleeping). For the others, up to 50 entries of
    ``row.ranked_concepts`` are rendered one per line as
    ``"<item 0> - <item 1>"`` and embedded in the prompt.

    Returns:
        The ``content`` of the model reply; ``""`` for skipped rows; the
        sentinel ``"Exception: context too long"`` when the call raised.
    """
    if not row.ranked_matched:
        return ""
    candidate_list = '\n'.join([f"{concept[0]} - {concept[1]}" for concept in row.ranked_concepts][:50])
    query = f"""It does not matter whether the given term is a valid or complete medical term, your task is finding the closest one in the list.
Return your answer in JSON with the format {{"answer": "<numeric code>"}}
Copy the exact code in the answer.
Q: The given term "{row.Name.strip()}" is closest to which of the following alternatives:
{candidate_list}
A:"""
    # External API rate limit = 5 seconds
    time.sleep(5)
    try:
        output = llm.invoke(query)
    except Exception:
        # Only ordinary errors are turned into the sentinel; KeyboardInterrupt and
        # SystemExit propagate so the (slow, rate-limited) apply can be stopped.
        # NOTE(review): the sentinel text assumes a context overflow, but any
        # API error (network, auth, quota) ends up here as well.
        return "Exception: context too long"
    return output.content

In [None]:
# Query GPT-4 only for rows that have candidates; rows excluded by the filter get NaN
# in 'llm_output' through index alignment on assignment.
sigtap_meds['llm_output'] = sigtap_meds[~sigtap_meds.nearest_concepts.isna()].progress_apply(query_external_llm, axis=1)

In [None]:
import re

# A run of six or more digits. Unlike the earlier definition of `pattern` in this notebook,
# this one has no \b word-boundary anchors, so digits embedded in a longer token also match.
pattern = r'(\d{6,})'

In [None]:
# Despite its name, 'has_match' holds the captured digit string (NaN when the pattern does not match).
sigtap_meds['has_match'] = sigtap_meds['llm_output'].str.extract(pattern)

In [None]:
# Count the outputs in which the code pattern was found at all.
print('Number of pattern matches = ', sigtap_meds.has_match.notna().sum())

In [None]:
def parse_llm_output(row):
    """Check whether the code answered by the LLM is one of the row's conceptIds.

    The code is read from the ``"answer"`` field when ``row.llm_output`` is a
    JSON object (string, JSON number, or a nested object whose first value is
    used). When that does not yield a numeric string, the first match of the
    module-level ``pattern`` in the raw output is used instead.

    Returns:
        bool: True when the extracted code is contained in ``row.conceptId``;
        False when the output is empty or not a string, when no code can be
        extracted, or when ``row.conceptId`` is empty/None.
    """
    raw_output = row.llm_output
    # Skipped rows carry ""; rows that were never queried carry NaN, which is
    # truthy and would otherwise reach json.loads / re.search.
    if not isinstance(raw_output, str) or not raw_output:
        return False

    try:
        parsed = json.loads(raw_output)
    except ValueError:
        # Not valid JSON: leave it to the regex fallback below.
        parsed = None

    llm_concept = parsed.get('answer') if isinstance(parsed, dict) else ''

    # Unwrap a nested object first, so that a numeric value inside it is
    # still converted to a string by the next step.
    if isinstance(llm_concept, dict):
        llm_concept = next(iter(llm_concept.values()), '')

    # A code emitted as a JSON number is compared in its string form.
    if isinstance(llm_concept, int) and not isinstance(llm_concept, bool):
        llm_concept = str(llm_concept)

    # Anything else (missing key -> None, lists, floats, ...) is not a usable
    # code; fall through to the regex search instead of failing on .isnumeric().
    if not isinstance(llm_concept, str):
        llm_concept = ''

    if not llm_concept.isnumeric():
        llm_match = re.search(pattern, raw_output)
        if llm_match is None:
            return False
        llm_concept = llm_match.group(0)

    # An empty or None conceptId can never match.
    return bool({llm_concept}.intersection(row.conceptId)) if row.conceptId else False

In [None]:
# Evaluate only rows that have candidates; rows excluded by the filter get NaN in 'llm_match'.
sigtap_meds['llm_match'] = sigtap_meds[~sigtap_meds.nearest_concepts.isna()].progress_apply(parse_llm_output, axis=1)

In [None]:
# Count the rows flagged as a correct GPT-4 answer (NaN rows compare unequal to True and are not counted).
print('Number of matches (LLM candidates) = ', (sigtap_meds.llm_match == True).sum())

In [None]:
# Notebook display: inspect the frame with the GPT-4 outputs and match flags.
sigtap_meds

In [None]:
# Persist the GPT-4 results as a tab-separated file without the index column.
sigtap_meds.to_csv('sigtap_meds_gpt4_rag.csv', sep='\t', index=False)