#### LangChain patches

In [None]:
# Patches: LangChain
# tools/vectorstore/tools.py -> _run
# as_retriever() -->
#   as_retriever(search_kwargs={"k": 15})

# schema/vectorstore.py -> _get_relevant_documents
# docs = self.vectorstore.similarity_search(query, **self.search_kwargs) -->
#   docs = self.vectorstore.similarity_search(query, k=1000)[:self.search_kwargs.get("k", 4)]

## Load vector store

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [None]:
# Sentence-transformer model used to embed queries: thenlper/gte-large.
# NOTE(review): presumably this has to be the same model that produced the
# vectors in the Chroma collections (their names contain "gte") — confirm.
embedding_function = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

In [None]:
import chromadb
from langchain.vectorstores import Chroma

In [None]:
# Connect to a Chroma server over HTTP; both arguments are placeholders that
# must be replaced with the real host and port before running.
chroma_client = chromadb.HttpClient(host='<CHROMA HOST>', port='<CHROMA PORT>')

In [None]:
# Collection names noted by the author:
#   chroma_gte_snomed_namesyns
#   chroma_gte_rxnorm_name
# NOTE(review): only the SNOMED collection is selected here, while the medicines
# section further down wraps this same `chroma_db` as "rxnorm_concepts";
# presumably this cell is re-run with 'chroma_gte_rxnorm_name' first — confirm.
chroma_db = Chroma(
    collection_name='chroma_gte_snomed_namesyns',
    embedding_function=embedding_function,
    client=chroma_client,
    collection_metadata={'hnsw:space': 'cosine'},
)

In [None]:
# Sanity check: number of entries in the selected collection.
chroma_db._collection.count()

In [None]:
from langchain.agents.openai_assistant import OpenAIAssistantRunnable
import json
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)
from langchain.agents import AgentType
from langchain.agents import AgentExecutor
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

In [None]:
import os
# Placeholder value: substitute a real key locally and avoid committing it.
os.environ['OPENAI_API_KEY'] = "<OPENAI API KEY>"

In [None]:
import ast
import pandas as pd

In [None]:
from tqdm.notebook import tqdm
# Registers `progress_apply` on pandas objects; it is used below to run the
# per-row agent calls with a progress bar.
tqdm.pandas()

## LLM agent - Procedures subset

In [None]:
# Load the annotated SIGTAP table. The file is tab-separated despite the .csv
# extension; `header=0` replaces the file's own header row with `names`.
# ID and sourceCode are read as str (presumably to keep leading zeros such as
# the '06' prefix tested below — confirm).
sigtap_df = pd.read_csv('<ANNOTATED SIGTAP TABLE>.csv', sep='\t', converters={'ID': str, 'sourceCode': str},
                        names=['ID', 'Name', 'Description', 'sourceCode', 'conceptId'], header=0)

In [None]:
def safe_literal_eval(x):
    """Parse a stringified Python literal read from a table cell.

    Returns ``None`` when ``pd.notna(x)`` is false (missing value), the
    evaluated literal when ``ast.literal_eval`` succeeds, and an empty list
    when evaluation raises ``ValueError`` or ``SyntaxError``.
    """
    try:
        # Missing cells are reported as None rather than parsed.
        if not pd.notna(x):
            return None
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Unparsable text falls back to an empty list.
        return []

In [None]:
# Convert the stringified concept-id values into Python objects
# (None for missing cells, [] for unparsable ones — see safe_literal_eval).
sigtap_df['conceptId'] = sigtap_df['conceptId'].apply(safe_literal_eval)

In [None]:
# Trim surrounding whitespace from the term names.
sigtap_df['Name'] = sigtap_df['Name'].str.strip()

In [None]:
# Trim surrounding whitespace from the descriptions.
sigtap_df['Description'] = sigtap_df['Description'].str.strip()

In [None]:
sigtap_df

In [None]:
# Filter procedures subset: every row whose ID does not start with '06'
# (rows starting with '06' form the medicines subset used later on).
sigtap_procedures_original = sigtap_df[sigtap_df.ID.str[:2] != '06'].copy()

In [None]:
# Number of procedures that carry a conceptId annotation.
len(sigtap_procedures_original[~sigtap_procedures_original.conceptId.isna()])

In [None]:
# Evaluation sample: 50 annotated procedures, drawn with a fixed seed so the
# same rows are selected on every run.
sigtap_procedures = sigtap_procedures_original[~sigtap_procedures_original.conceptId.isna()].sample(n=50, random_state=0)

In [None]:
sigtap_procedures.sort_index()

In [None]:
# Wraps the Chroma store with a name and description for the LangChain
# vector-store toolkit; assistant_procedures_search reads this global.
vectorstore_info = VectorStoreInfo(
    name="snomed_concepts",
    description="concepts in the SNOMED CT vocabulary using a vector store",
    vectorstore=chroma_db,
)

In [None]:
def assistant_procedures_search(row):
    """Find the closest SNOMED concept(s) for one SIGTAP procedure row.

    Creates an OpenAI assistant agent equipped with a vector-store tool built
    from the module-level ``vectorstore_info``, asks it for the closest term to
    ``row.Name`` (appending ``row.Description`` when it is a string), and looks
    up the sources cited in the agent's last intermediate step in
    ``chroma_db`` to obtain concept metadata.

    Args:
        row: A row exposing ``Name``, ``Description`` and ``conceptId``. If it
            already has an ``llm_output`` different from ``'Error'``, the
            stored ``llm_output``, ``llm_concepts`` and ``matched`` values are
            returned unchanged, so re-running the apply retries only failures.

    Returns:
        A ``(llm_output, llm_concepts, matched)`` tuple:
        - the agent response, or ``'Error'`` when the assistant could not be
          created or invoked (the error message is printed);
        - a list of ``(concept_id, concept_name)`` tuples, ``''`` when the
          cited sources could not be resolved, or ``[]`` on error;
        - ``True`` when any retrieved concept id is in ``row.conceptId``.
    """
    # Pass through rows that already hold a successful result.
    if 'llm_output' in row and row.llm_output != 'Error':
        return row.llm_output, row.llm_concepts, row.matched
    llm = ChatOpenAI(openai_api_key="<OPENAI API KEY>", temperature=0.0, model='gpt-4-1106-preview')
    toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info, llm=llm)
    # Only the second toolkit tool is given to the assistant (presumably the
    # QA-with-sources tool, since a 'sources' field is parsed below — confirm).
    tools = [toolkit.get_tools()[1]]
    instructions = (
        'You are a system tasked with searching through the SNOMED terminology to find the single closest/most similar term '
        'to a given term, from another terminology, that I will provide. To search the SNOMED terminology, you have access to a vector '
        'store containing the entire SNOMED terminology, so make sure to use it. If you don\'t think any of the results you get '
        'from the vector store matches the given term adequately, you may change the given term without changing its meaning too much '
        '(using synonyms, variations, and other changes) and try it again. You may repeat this process a few times and return the best '
        'match you get. As a heads up, the given terms come from another terminology, so they will rarely, if ever, have an '
        'exact match in SNOMED. Also include the sources with your answer.'
    )
    try:
        agent = OpenAIAssistantRunnable.create_assistant(
            name="langchain snomed vs tool",
            instructions=instructions,
            tools=tools,
            model="gpt-4-1106-preview",
            as_agent=True,
        )
    except Exception as e:
        print(str(e))
        return 'Error', [], False
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False, return_intermediate_steps=True)
    try:
        description = ''
        # Missing descriptions are not strings and are left out of the prompt.
        if isinstance(row.Description, str):
            description = ' Description of the given term: "{}"'.format(row.Description.capitalize())
        response = agent_executor.invoke({"content": 'What is the closest term to the given term = "{}".{}'.format(row.Name.capitalize(), description)})
    except Exception as e:
        # Report the failure and mark the row as 'Error' so that a later run
        # retries it, instead of aborting the whole apply.
        print(str(e))
        return 'Error', [], False
    try:
        # Second element of the last intermediate step; expected to be a JSON
        # string with a comma-separated 'sources' field holding store ids.
        last_observation = response['intermediate_steps'][-1][1]
        source_ids = json.loads(last_observation).get('sources').split(', ')
        metadatas = chroma_db._collection.get(ids=source_ids, include=['metadatas'])['metadatas']
        queried_concepts = [(metadata.get('concept_id'), metadata.get('concept_name')) for metadata in metadatas]
    except Exception:
        # Best effort: any failure to resolve the sources yields an empty
        # result, while interrupts are no longer swallowed.
        queried_concepts = ''
    matched = False
    if queried_concepts:
        concept_list = [concept[0] for concept in queried_concepts]
        matched = bool(set(concept_list).intersection(set(row.conceptId))) if row.conceptId else False
    return response, queried_concepts, matched

In [None]:
# Run the agent on every sampled procedure; the three values returned per row
# are expanded into the llm_output / llm_concepts / matched columns.
sigtap_procedures[['llm_output', 'llm_concepts', 'matched']] = sigtap_procedures.progress_apply(assistant_procedures_search, axis=1, result_type='expand')

In [None]:
sigtap_procedures

In [None]:
# Matches: rows flagged as matched by assistant_procedures_search.
# Valid outputs: rows whose llm_concepts is present and non-empty.
print('Number of matches = ', len(sigtap_procedures[sigtap_procedures.matched == True]))
print('Number of valid outputs = ', len(sigtap_procedures[(~sigtap_procedures.llm_concepts.isna()) & (sigtap_procedures.llm_concepts.str.len() > 0)]))

In [None]:
# Invalid outputs: rows whose llm_concepts has length 0 ([] or '').
print('Number of invalid outputs = ', len(sigtap_procedures[sigtap_procedures.llm_concepts.str.len() == 0]))

In [None]:
# Save the results as a tab-separated file without the index column.
sigtap_procedures.to_csv('sigtap_procedures_gpt4_assistant.csv', sep='\t', index=False)

## LLM agent - Medicines subset

In [None]:
# Reload the annotated SIGTAP table for the medicines section (same file and
# options as in the procedures section): tab-separated, file header replaced
# by `names`, ID and sourceCode read as str.
sigtap_df = pd.read_csv('<ANNOTATED SIGTAP TABLE>.csv', sep='\t', converters={'ID': str, 'sourceCode': str},
                        names=['ID', 'Name', 'Description', 'sourceCode', 'conceptId'], header=0)

In [None]:
def safe_literal_eval(x):
    """Evaluate a table cell holding a Python literal written as text.

    Missing values (``pd.notna(x)`` is false) give ``None``; text that
    ``ast.literal_eval`` rejects with ``ValueError`` or ``SyntaxError`` gives
    an empty list; anything else gives the evaluated literal.
    """
    try:
        missing = not pd.notna(x)
        parsed = None if missing else ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = []
    return parsed

In [None]:
# Convert the stringified concept-id values into Python objects
# (None for missing cells, [] for unparsable ones — see safe_literal_eval).
sigtap_df['conceptId'] = sigtap_df['conceptId'].apply(safe_literal_eval)

In [None]:
# Trim surrounding whitespace from the term names.
sigtap_df['Name'] = sigtap_df['Name'].str.strip()

In [None]:
# Trim surrounding whitespace from the descriptions.
sigtap_df['Description'] = sigtap_df['Description'].str.strip()

In [None]:
sigtap_df

In [None]:
# Filter medicines subset: rows whose ID starts with '06'.
sigtap_medicines = sigtap_df[sigtap_df.ID.str[:2] == '06'].copy()

In [None]:
# Number of medicines that carry a conceptId annotation.
len(sigtap_medicines[~sigtap_medicines.conceptId.isna()])

In [None]:
# Evaluation sample: 50 annotated medicines, drawn with a fixed seed.
# Note that this rebinds sigtap_medicines, so the full subset is no longer
# available under this name afterwards.
sigtap_medicines = sigtap_medicines[~sigtap_medicines.conceptId.isna()].sample(n=50, random_state=0)

In [None]:
sigtap_medicines.sort_index()

In [None]:
# Wraps the Chroma store with a name and description for the LangChain
# vector-store toolkit; assistant_medicines_search reads this global.
# NOTE(review): this reuses `chroma_db`, which the loading cell creates with
# collection 'chroma_gte_snomed_namesyns'; presumably that cell must be re-run
# with 'chroma_gte_rxnorm_name' before this section — confirm.
vectorstore_info = VectorStoreInfo(
    name="rxnorm_concepts",
    description="concepts in the RxNorm vocabulary using a vector store",
    vectorstore=chroma_db,
)

In [None]:
def assistant_medicines_search(row):
    """Find the closest RxNorm concept(s) for one SIGTAP medicine row.

    Creates an OpenAI assistant agent equipped with a vector-store tool built
    from the module-level ``vectorstore_info``, asks it for the closest term to
    ``row.Name``, and looks up the sources cited in the agent's last
    intermediate step in ``chroma_db`` to obtain concept metadata.

    Args:
        row: A row exposing ``Name`` and ``conceptId``. If it already has an
            ``llm_output`` different from ``'Error'``, the stored
            ``llm_output``, ``llm_concepts`` and ``matched`` values are
            returned unchanged, so re-running the apply retries only failures.

    Returns:
        A ``(llm_output, llm_concepts, matched)`` tuple:
        - the agent response, or ``'Error'`` when the assistant could not be
          created or invoked (the error message is printed);
        - a list of ``(concept_id, concept_name)`` tuples, ``''`` when the
          cited sources could not be resolved, or ``[]`` on error;
        - ``True`` when any retrieved concept id is in ``row.conceptId``.
    """
    # Pass through rows that already hold a successful result.
    if 'llm_output' in row and row.llm_output != 'Error':
        return row.llm_output, row.llm_concepts, row.matched
    llm = ChatOpenAI(openai_api_key="<OPENAI API KEY>", temperature=0.0, model='gpt-4-1106-preview')
    toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info, llm=llm)
    # Only the second toolkit tool is given to the assistant (presumably the
    # QA-with-sources tool, since a 'sources' field is parsed below — confirm).
    tools = [toolkit.get_tools()[1]]
    instructions = (
        'You are a system tasked with searching through the RxNorm terminology to find the single closest/most similar term '
        'to a given term, from another terminology, that I will provide. To search the RxNorm terminology, you have access to a vector '
        'store containing the entire RxNorm terminology, so make sure to use it. If you don\'t think any of the results you get '
        'from the vector store matches the given term adequately, you may change the given term without changing its meaning too much '
        '(using synonyms, variations, and other changes) and try it again. You may repeat this process a few times and return the best '
        'match you get. As a heads up, the given terms come from another terminology, so they will rarely, if ever, have an '
        'exact match in RxNorm. Also include the sources with your answer.'
    )
    try:
        agent = OpenAIAssistantRunnable.create_assistant(
            name="langchain rxnorm vs tool",
            instructions=instructions,
            tools=tools,
            model="gpt-4-1106-preview",
            as_agent=True,
        )
    except Exception as e:
        # `except Exception` (not a bare except) so that a keyboard/kernel
        # interrupt still stops a long-running apply.
        print(str(e))
        return 'Error', [], False
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False, return_intermediate_steps=True)
    try:
        response = agent_executor.invoke({"content": 'What is the closest term to the given term = "{}"'.format(row.Name.capitalize())})
    except Exception as e:
        # Report the failure and mark the row as 'Error' so a later run retries it.
        print(str(e))
        return 'Error', [], False
    try:
        # Second element of the last intermediate step; expected to be a JSON
        # string with a comma-separated 'sources' field holding store ids.
        last_observation = response['intermediate_steps'][-1][1]
        source_ids = json.loads(last_observation).get('sources').split(', ')
        metadatas = chroma_db._collection.get(ids=source_ids, include=['metadatas'])['metadatas']
        queried_concepts = [(metadata.get('concept_id'), metadata.get('concept_name')) for metadata in metadatas]
    except Exception:
        # Best effort: any failure to resolve the sources yields an empty result.
        queried_concepts = ''
    matched = False
    if queried_concepts:
        concept_list = [concept[0] for concept in queried_concepts]
        matched = bool(set(concept_list).intersection(set(row.conceptId))) if row.conceptId else False
    return response, queried_concepts, matched

In [None]:
# Run the agent on every sampled medicine; the three values returned per row
# are expanded into the llm_output / llm_concepts / matched columns.
sigtap_medicines[['llm_output', 'llm_concepts', 'matched']] = sigtap_medicines.progress_apply(assistant_medicines_search, axis=1, result_type='expand')

In [None]:
def check_concepts(row):
    """Return the concept ids cited by the agent's last intermediate step.

    Reads ``row.llm_output['intermediate_steps'][-1][1]``, parses it as JSON,
    splits its ``'sources'`` field on ``', '`` and fetches the metadata of
    those ids from ``chroma_db``.

    Args:
        row: A row exposing ``llm_output`` (the agent response).

    Returns:
        A list with the ``concept_id`` of every fetched metadata entry, or an
        empty list when ``llm_output`` carries no parsable sources (for
        example the ``'Error'`` sentinel, no intermediate steps, or a last
        step that is not JSON with a ``'sources'`` string).
    """
    try:
        last_observation = row.llm_output['intermediate_steps'][-1][1]
        source_ids = json.loads(last_observation).get('sources').split(', ')
    except (TypeError, KeyError, IndexError, AttributeError, ValueError):
        # A row without usable sources must not abort the whole apply.
        return []
    metadatas = chroma_db._collection.get(ids=source_ids, include=['metadatas'])['metadatas']
    return [metadata.get('concept_id') for metadata in metadatas]

In [None]:
# Recompute llm_concepts from the stored agent outputs; this overwrites the
# values produced by assistant_medicines_search with bare concept ids.
# NOTE(review): `matched` is not recomputed after this step — confirm intended.
sigtap_medicines['llm_concepts'] = sigtap_medicines.progress_apply(check_concepts, axis=1)

In [None]:
sigtap_medicines

In [None]:
# Matches: rows flagged as matched by assistant_medicines_search.
# Valid outputs: rows whose llm_concepts is present and non-empty.
print('Number of matches = ', len(sigtap_medicines[sigtap_medicines.matched == True]))
print('Number of valid outputs = ', len(sigtap_medicines[(~sigtap_medicines.llm_concepts.isna()) & (sigtap_medicines.llm_concepts.str.len() > 0)]))

In [None]:
# Invalid outputs: rows whose llm_concepts has length 0.
print('Number of invalid outputs = ', len(sigtap_medicines[sigtap_medicines.llm_concepts.str.len() == 0]))

In [None]:
# Save the medicines results (not the procedures frame) as a tab-separated
# file without the index column.
sigtap_medicines.to_csv('sigtap_meds_gpt4_assistant.csv', sep='\t', index=False)