#### Vector store - sentence embeddings encoding

In [None]:
import pandas as pd

In [None]:
# Load the OMOP concept table (tab-separated); `concept_id` and `concept_code`
# are read as strings.
omop_df = pd.read_csv('omop_cui_defs_syns.csv', sep='\t',
                      converters={'concept_id': str, 'concept_code': str})
# Alternative filter (kept for reference): standard, non-invalidated SNOMED
# concepts restricted to the listed domains.
# SNOMED = 538088 rows
# omop_df = omop_df[(omop_df['standard_concept'] == 'S') & (omop_df['invalid_reason'].isna())
#     & (omop_df['vocabulary_id'] == 'SNOMED') & (omop_df['domain_id'].isin(['Procedure', 'Measurement', 'Observation', 'Device', 'Condition']))]
# Active filter: standard ('S'), non-invalidated RxNorm / RxNorm Extension concepts.
# RxNorm, RxNorm Extension = 2018838 rows
omop_df = omop_df[(omop_df['standard_concept'] == 'S') & (omop_df['invalid_reason'].isna())
    & (omop_df['vocabulary_id'].isin(['RxNorm', 'RxNorm Extension']))]
# Force the names of two concepts (selected by ID) to the literal strings 'NA' / 'N/A'.
# NOTE(review): presumably needed because read_csv's default NA handling parses
# these literal names as missing values — confirm.
omop_df.loc[omop_df['concept_id'] == '36311145', 'concept_name'] = 'NA'
omop_df.loc[omop_df['concept_id'] == '45880107', 'concept_name'] = 'N/A'
# Optional variants: embed name + synonyms ('£'-separated) and/or lower-case the names.
# omop_df['FullDesc'] = omop_df['concept_name'].astype(str) + ' ' + omop_df['concept_synonym_name'].fillna('').str.replace('£', ' ')
# omop_df['concept_name'] = omop_df['concept_name'].str.lower()
omop_df

In [None]:
# Columns available in the filtered concept table.
omop_df.columns

In [None]:
# Number of remaining concepts per OMOP domain.
omop_df.domain_id.value_counts()

In [None]:
from langchain.document_loaders.dataframe import DataFrameLoader

In [None]:
# Turn every row into a LangChain document whose text is `concept_name`.
# NOTE(review): the remaining columns are presumably carried as document metadata
# (search_concepts later reads `concept_id` from the metadata) — confirm.
omop_loader = DataFrameLoader(data_frame=omop_df, page_content_column='concept_name')
# omop_loader = DataFrameLoader(data_frame=omop_df, page_content_column='FullDesc')
omop_data = omop_loader.load()

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [None]:
# Sentence-embedding model selection: keep exactly one assignment active.
# `embedding_function` is reused below when encoding the documents and when
# reconnecting to the vector store.
# MiniLM-L6
# embedding_function = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
# all-mpnet-base-v2
# embedding_function = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2')
# SGPT-125M
# embedding_function = SentenceTransformerEmbeddings(model_name='Muennighoff/SGPT-125M-weightedmean-nli-bitfit')
# SGPT-1.3B
# embedding_function = SentenceTransformerEmbeddings(model_name='Muennighoff/SGPT-1.3B-weightedmean-nli-bitfit')
# thenlper/gte-large
embedding_function = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
# dmis-lab/biobert-base-cased-v1.2
# embedding_function = SentenceTransformerEmbeddings(model_name='dmis-lab/biobert-base-cased-v1.2')

In [None]:
from langchain.vectorstores import Chroma

In [None]:
def split_list(input_list, chunk_size):
    """Lazily yield consecutive slices of ``input_list`` with at most ``chunk_size`` items.

    The last slice is shorter when the length of ``input_list`` is not a
    multiple of ``chunk_size``; an empty input yields nothing.
    """
    yield from (
        input_list[start:start + chunk_size]
        for start in range(0, len(input_list), chunk_size)
    )

In [None]:
import chromadb

In [None]:
# Client for the remote Chroma server (replace the host/port placeholders).
chroma_client = chromadb.HttpClient(host='<CHROMA HOST>', port='<CHROMA PORT>')

In [None]:
# Split in chunks - Chroma SQLite limit
split_docs_chunked = split_list(omop_data, 10000)

# Every iteration embeds one chunk and sends it to the collection named below,
# which is configured for cosine distance ('hnsw:space').
# NOTE(review): `persist_directory` / `persist()` are used together with an HTTP
# `client`; confirm they are still needed for a server-backed store.
for index, split_docs_chunk in enumerate(split_docs_chunked):
    print('Processing chunk index {}...'.format(index))
    chroma_db = Chroma.from_documents(
        collection_name='<collection name>',
        documents=split_docs_chunk,
        embedding=embedding_function,
        persist_directory='./<collection name>',
        client=chroma_client,
        collection_metadata={'hnsw:space': 'cosine'},
    )
    chroma_db.persist()
    print('Chunk {} persisted'.format(index))

#### Vector store connection

In [None]:
import chromadb
from langchain.vectorstores import Chroma

In [None]:
# Client for the remote Chroma server (replace the host/port placeholders).
chroma_client = chromadb.HttpClient(host='<CHROMA HOST>', port='<CHROMA PORT>')

In [None]:
# Open the collection for querying.
# NOTE: `embedding_function` is defined in the encoding section above, so that
# cell must have been run; presumably it has to be the model the collection was
# built with — confirm.
chroma_db = Chroma(
    collection_name='<collection name>',
    embedding_function=embedding_function,
    client=chroma_client,
    collection_metadata={'hnsw:space': 'cosine'},
)

In [None]:
# Sanity check: number of entries in the collection (via the wrapper's private `_collection`).
chroma_db._collection.count()

#### Filtered candidate coverage

In [None]:
import ast
import pandas as pd

In [None]:
# Load the SIGTAP items with their OMOP annotations (tab-separated). The file's
# own header row (header=0) is replaced by the column names given here; `ID` and
# `sourceCode` are read as strings.
sigtap_df = pd.read_csv('<SIGTAP OMOP annotated>.csv', sep='\t', converters={'ID': str, 'sourceCode': str},
                        names=['ID', 'Name', 'Description', 'sourceCode', 'conceptId'], header=0)

In [None]:
def safe_literal_eval(x):
    """Parse a cell that holds the text form of a Python literal (e.g. a list of IDs).

    Args:
        x: Raw cell value, normally a string read from CSV or a missing value.

    Returns:
        * the parsed literal for a well-formed string,
        * ``None`` when ``x`` is missing (NaN / None),
        * ``[]`` when the text cannot be parsed,
        * ``x`` itself when it is already a container, so re-running the cell
          on an already-parsed column does not wipe the data.
    """
    # Already parsed (cell re-run): without this guard the value would fall into
    # the except branch below and be replaced by an empty list.
    if isinstance(x, (list, tuple, dict, set)):
        return x
    try:
        return ast.literal_eval(x) if pd.notna(x) else None
    except (ValueError, SyntaxError):
        return []

In [None]:
# Parse the annotated concept-ID lists from their text form; rows without an
# annotation become None.
sigtap_df['conceptId'] = sigtap_df['conceptId'].apply(safe_literal_eval)

In [None]:
# Trim surrounding whitespace from the item names.
sigtap_df['Name'] = sigtap_df['Name'].str.strip()

In [None]:
# Name followed by the description (empty string when the description is missing).
sigtap_df['NameAndDescription'] = sigtap_df['Name'].astype(str) + ' ' + sigtap_df['Description'].fillna('')
# sigtap_df['NameAndDescription'] = sigtap_df['NameAndDescription'].str.lower()

In [None]:
sigtap_df

In [None]:
sigtap_df.info()

In [None]:
# Total categories (category = first two characters of the ID)
sigtap_df['ID'].str[:2].value_counts()

In [None]:
# Annotated categories (rows whose conceptId is not missing)
# Medicines = 366
# Procedures = 514
sigtap_df[~sigtap_df.conceptId.isna()]['ID'].str[:2].value_counts()

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Column of the SIGTAP frame whose text is used as the search query.
COMPARISON_COLUMN = 'Name'
SEARCH_K = 1000 # Search depth (neighbours requested from the vector store)
RESULTS_K = 1000 # Filtered candidates (results kept per query)

In [None]:
# SIGTAP category codes (first two characters of `ID`), kept for reference:
# CATEGORY_MAPPING = {
#     '01': # Health promotion and prevention actions
#     '02': # Diagnostic procedures
#     '03': # Clinical procedures
#     '04': # Surgical procedures
#     '05': # Organ, tissue and cell transplants
#     '06': # Medicines
#     '07': # Orthoses, prostheses and special materials
#     '08': # Complementary health care actions
# }

In [None]:
import numpy as np

from langchain.docstore.document import Document

In [None]:
def results_to_docs(results, with_embeddings=False):
    """Convert a single-query Chroma result dict into a list of tuples.

    Only the first query's hits (index 0 of every field) are read.

    Args:
        results: Mapping with 'documents', 'metadatas' and 'distances' lists
            (plus 'embeddings' when ``with_embeddings`` is true).
        with_embeddings: Append each hit's embedding to its tuple.

    Returns:
        ``[(Document, distance), ...]`` or, with embeddings,
        ``[(Document, distance, embedding), ...]``. Missing metadata is
        replaced by an empty dict.
    """
    fields = ['documents', 'metadatas', 'distances']
    if with_embeddings:
        fields.append('embeddings')
    columns = [results[field][0] for field in fields]

    return [
        (Document(page_content=text, metadata=meta or {}), *extras)
        for text, meta, *extras in zip(*columns)
    ]

In [None]:
def search_concepts(row):
    """Retrieve the nearest stored concepts for one SIGTAP row.

    The text in ``row[COMPARISON_COLUMN]`` is sent to the Chroma collection,
    ``SEARCH_K`` neighbours are requested and the first ``RESULTS_K`` are kept.

    Args:
        row: SIGTAP row with the query text and the annotated ``conceptId`` list
            (or a falsy value when the row is not annotated).

    Returns:
        ``(candidates, matched)`` where ``candidates`` is a list of
        ``(page_content, concept_id, distance)`` tuples and ``matched`` is True
        when any annotated concept ID is among the candidates.
    """
    # Query nearest vectors in store (returns text, metadata and distance).
    # NOTE(review): the raw collection is queried with `query_texts`, so the text
    # is presumably embedded by the collection rather than by the LangChain
    # `embedding_function` — confirm both use the same model.
    # Alternative: chroma_db.similarity_search_with_score(row[COMPARISON_COLUMN], k=SEARCH_K)[:RESULTS_K]
    raw = chroma_db._collection.query(
        query_texts=[row[COMPARISON_COLUMN]],
        n_results=SEARCH_K,
        include=['documents', 'metadatas', 'distances'],
    )
    hits = results_to_docs(raw)[:RESULTS_K]

    candidates = []
    candidate_ids = set()
    for doc, distance in hits:
        concept_id = doc.metadata.get('concept_id')
        candidate_ids.add(concept_id)
        # Procedures layout:
        # candidates.append((doc.metadata.get('concept_name'), doc.metadata.get('concept_synonym_name'), concept_id, distance))
        # Medicines layout:
        candidates.append((doc.page_content, concept_id, distance))

    # A hit means at least one annotated concept ID was retrieved.
    if row.conceptId:
        matched = not candidate_ids.isdisjoint(set(row.conceptId))
    else:
        matched = False

    return (candidates, matched)

#### Procedures

In [None]:
# 4362 rows (SIGTAP procedures): every category except '06' (medicines)
sigtap_procedures = sigtap_df[sigtap_df.ID.str[:2] != '06'].copy()

In [None]:
# Annotated 514 SIGTAP procedures
len(sigtap_procedures[~sigtap_procedures.conceptId.isna()])

In [None]:
# Only annotated rows are searched; the other rows get NaN in both new columns.
# NOTE(review): search_concepts currently returns the "Medicines" tuple layout,
# while rank_candidates_procs reads candidate[2] as the concept ID (the
# "Procedures" layout) — switch the return in search_concepts before running
# this section. Also confirm the vector store holds the SNOMED concepts.
sigtap_procedures[['nearest_concepts', 'matched']] = sigtap_procedures[~sigtap_procedures.conceptId.isna()].progress_apply(search_concepts, axis=1, result_type='expand')

In [None]:
print('Total annotated = ', len(sigtap_procedures[~sigtap_procedures.conceptId.isna()]))
print('Number of matches = ', len(sigtap_procedures[sigtap_procedures.matched == True]))

In [None]:
# Annotated procedures whose annotated concept was not among the retrieved candidates.
sigtap_procedures[(sigtap_procedures.matched == False) & (~sigtap_procedures.conceptId.isna())]

In [None]:
sigtap_procedures.to_csv('sigtap_procedures_gte_candidates.csv', sep='\t', index=False)

In [None]:
def extract_element_from_tuple_list(tuple_list):
    """Reduce ``(document, score, ...)`` tuples to ``(concept_id, score)`` pairs.

    The concept ID is read from the ``metadata`` mapping of each tuple's first
    element and is ``None`` when the key is absent.
    """
    pairs = []
    for entry in tuple_list:
        concept_id = entry[0].metadata.get('concept_id')
        pairs.append((concept_id, entry[1]))
    return pairs

In [None]:
# Reduce every candidate to a (concept_id, score) pair.
# NOTE(review): extract_element_from_tuple_list reads `.metadata` on the first
# tuple element, whereas search_concepts stores plain values in its tuples, and
# rows that were not searched hold NaN here — confirm this cell matches the
# current `nearest_concepts` layout before running it.
sigtap_procedures['nearest_concepts'] = sigtap_procedures['nearest_concepts'].apply(
    lambda x: extract_element_from_tuple_list(x)
)

#### Medicines

In [None]:
# 369 rows (SIGTAP meds): category '06'
sigtap_meds = sigtap_df[sigtap_df.ID.str[:2] == '06'].copy()

In [None]:
# sigtap_meds['Name'] = sigtap_meds['Name'].str.lower()

In [None]:
# Annotated 366 SIGTAP meds (see the per-category counts above)
len(sigtap_meds[~sigtap_meds.conceptId.isna()])

In [None]:
# Unlike the procedures run, every medicine row is searched; rows without an
# annotation simply get matched = False.
sigtap_meds[['nearest_concepts', 'matched']] = sigtap_meds.progress_apply(search_concepts, axis=1, result_type='expand')

In [None]:
print('Total annotated = ', len(sigtap_meds[~sigtap_meds.conceptId.isna()]))
print('Number of matches (all candidates) = ', len(sigtap_meds[sigtap_meds.matched == True]))

In [None]:
# Annotated medicines whose annotated concept was not among the retrieved candidates.
sigtap_meds[(sigtap_meds.matched == False) & (~sigtap_meds.conceptId.isna())]

In [None]:
sigtap_meds.to_csv('sigtap_meds_gte_candidates.csv', sep='\t', index=False)

#### Fuzzy-based candidate filtering

In [None]:
!pip install python-Levenshtein

In [None]:
import Levenshtein

In [None]:
def calculate_levenshtein_distance(str1, str2):
    """Return the case-insensitive Levenshtein edit distance between two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Edit distance between the lower-cased inputs; lower means more similar.
    """
    # Uses the `Levenshtein` module imported in the cell just above; `jellyfish`
    # is only imported further down the notebook, so relying on it here breaks
    # top-to-bottom execution.
    # Alternative (transpositions count as one edit):
    # jellyfish.damerau_levenshtein_distance(str1.lower(), str2.lower())
    return Levenshtein.distance(str1.lower(), str2.lower())

In [None]:
from nltk.metrics import jaccard_distance

In [None]:
def calculate_jaccard_similarity(set1, set2):
    """Token-level Jaccard similarity of two strings.

    Both arguments are strings (despite the parameter names); each is
    lower-cased and split on whitespace into a token set, and the result is
    ``1 - jaccard_distance`` of the two sets.
    """
    tokens_a, tokens_b = (set(text.lower().split()) for text in (set1, set2))
    return 1 - jaccard_distance(tokens_a, tokens_b)

In [None]:
import jellyfish

In [None]:
def calculate_jaro_winkler_distance(str1, str2):
    """Return the case-insensitive Jaro-Winkler distance between two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``1 - similarity`` of the lower-cased inputs, so lower means more similar.
    """
    # Use the Jaro-Winkler variant the function name promises, not plain Jaro.
    return 1 - jellyfish.jaro_winkler_similarity(str1.lower(), str2.lower())

In [None]:
from fuzzywuzzy import fuzz

In [None]:
def calculate_fuzzy_distance(str1, str2):
    """Score two strings with ``fuzz.token_set_ratio`` after lower-casing both.

    Despite the name, the callers in this notebook treat the value as a
    similarity: candidates are sorted with the highest score first.
    """
    left, right = str1.lower(), str2.lower()
    # Other scorers tried here: fuzz.ratio, fuzz.partial_ratio,
    # fuzz.token_sort_ratio, jellyfish.levenshtein_distance.
    return fuzz.token_set_ratio(left, right)

In [None]:
def rank_candidates_procs(row, top_k=50):
    """Re-rank a procedure's retrieved candidates by fuzzy similarity to its name.

    Each candidate is read as ``(concept_name, concept_synonym_name,
    concept_id, ...)``; the name and its synonyms (" £ "-separated) are joined
    and scored against ``row['Name']`` with ``calculate_fuzzy_distance``.

    Args:
        row: Row with ``Name``, ``nearest_concepts`` and ``conceptId``.
        top_k: Number of best-scoring candidates to keep (default 50).

    Returns:
        ``(ranked, matched)``: ``ranked`` is a list of
        ``(concept_name, concept_id, score)`` sorted by descending score and cut
        to ``top_k``; ``matched`` is True when an annotated concept ID survives
        the cut. Rows without a candidate list return ``([], False)``.
    """
    name = row['Name']
    candidates = row['nearest_concepts']

    # Rows that were never searched hold NaN instead of a list.
    if not isinstance(candidates, (list, tuple)):
        return ([], False)

    scored = []
    for candidate in candidates:
        synonyms = candidate[1]
        # The synonym slot may hold None/NaN for concepts without synonyms.
        synonyms = synonyms.replace(" £ ", " ") if isinstance(synonyms, str) else ''
        score = calculate_fuzzy_distance(name, f'{candidate[0]} {synonyms}')
        scored.append((candidate[0], candidate[2], score))

    # Highest fuzzy score first (the score is a similarity, not a distance).
    ranked = sorted(scored, key=lambda item: item[2], reverse=True)[:top_k]
    ranked_ids = {item[1] for item in ranked}

    matched = bool(ranked_ids.intersection(set(row.conceptId))) if row.conceptId else False

    return (ranked, matched)

In [None]:
def rank_candidates(row, top_k=50):
    """Re-rank a medicine's retrieved candidates by fuzzy similarity to its name.

    Each candidate is read as ``(concept_name, concept_id, ...)`` and scored
    against ``row['Name']`` with ``calculate_fuzzy_distance``.

    Args:
        row: Row with ``Name``, ``nearest_concepts`` and ``conceptId``.
        top_k: Number of best-scoring candidates to keep (default 50).

    Returns:
        ``(ranked, matched)``: ``ranked`` is a list of
        ``(concept_name, concept_id, score)`` sorted by descending score and cut
        to ``top_k``; ``matched`` is True when an annotated concept ID survives
        the cut. Rows without a candidate list return ``([], False)``.
    """
    name = row['Name']
    candidates = row['nearest_concepts']

    # Rows without search results hold NaN/None instead of a list.
    if not isinstance(candidates, (list, tuple)):
        return ([], False)

    scored = [
        (candidate[0], candidate[1], calculate_fuzzy_distance(name, candidate[0]))
        for candidate in candidates
    ]

    # Sort on the fuzzy score (index 2); index 1 is the concept ID, and sorting
    # on it would make the top-k cut arbitrary.
    ranked = sorted(scored, key=lambda item: item[2], reverse=True)[:top_k]
    ranked_ids = {item[1] for item in ranked}

    matched = bool(ranked_ids.intersection(set(row.conceptId))) if row.conceptId else False

    return (ranked, matched)

In [None]:
# Apply rank_candidates_procs to every procedure row; adds the truncated
# candidate list and a flag telling whether an annotated concept is in it.
sigtap_procedures[['ranked_concepts', 'ranked_matched']] = sigtap_procedures.progress_apply(rank_candidates_procs, axis=1, result_type='expand')

In [None]:
print('Number of matches (filtered candidates) = ', len(sigtap_procedures[sigtap_procedures.ranked_matched == True]))

In [None]:
# Same for the medicines, using rank_candidates.
sigtap_meds[['ranked_concepts', 'ranked_matched']] = sigtap_meds.progress_apply(rank_candidates, axis=1, result_type='expand')

In [None]:
print('Number of matches (filtered candidates) = ', len(sigtap_meds[sigtap_meds.ranked_matched == True]))

In [None]:
sigtap_meds.to_csv('sigtap_meds_candidates.csv', sep='\t', index=False)

#### Filtered candidates

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
import ast
import pandas as pd

In [None]:
# concept_id -> concept_name lookup built from the OMOP frame loaded at the top
# of the notebook (that cell must have been run).
index_to_data = omop_df.set_index('concept_id')['concept_name'].to_dict()

In [None]:
# Reload the medicine candidates exported by the fuzzy-filtering section.
sigtap_meds = pd.read_csv('sigtap_meds_candidates.csv', sep='\t', header=0,
                         converters={'ID': str, 'sourceCode': str})

In [None]:
def safe_literal_eval(x):
    """Parse a cell that holds the text form of a Python literal (e.g. a list of tuples).

    Args:
        x: Raw cell value, normally a string read from CSV or a missing value.

    Returns:
        * the parsed literal for a well-formed string,
        * ``None`` when ``x`` is missing (NaN / None),
        * ``[]`` when the text cannot be parsed,
        * ``x`` itself when it is already a container, so re-running the cell
          on an already-parsed column does not wipe the data.
    """
    # Already parsed (cell re-run): without this guard the value would fall into
    # the except branch below and be replaced by an empty list.
    if isinstance(x, (list, tuple, dict, set)):
        return x
    try:
        return ast.literal_eval(x) if pd.notna(x) else None
    except (ValueError, SyntaxError):
        return []

In [None]:
# Columns holding Python literals come back from the CSV as text; parse them again.
sigtap_meds['conceptId'] = sigtap_meds['conceptId'].apply(safe_literal_eval)
sigtap_meds['nearest_concepts'] = sigtap_meds['nearest_concepts'].apply(safe_literal_eval)

In [None]:
# NOTE(review): no visible cell writes 'sigtap_procedures_candidates.csv' (the
# procedures export above is 'sigtap_procedures_gte_candidates.csv') — confirm
# the intended input file.
sigtap_procedures = pd.read_csv('sigtap_procedures_candidates.csv', sep='\t', header=0,
                         converters={'ID': str, 'sourceCode': str})

In [None]:
sigtap_procedures['conceptId'] = sigtap_procedures['conceptId'].apply(safe_literal_eval)
sigtap_procedures['nearest_concepts'] = sigtap_procedures['nearest_concepts'].apply(safe_literal_eval)

In [None]:
def query_concept_name(concepts):
    """Insert the concept name into each ``(concept_id, score)`` candidate.

    The name is looked up in the module-level ``index_to_data`` mapping; an ID
    missing from the mapping raises ``KeyError``.

    Returns:
        List of ``(concept_id, concept_name, score)`` tuples in input order.
    """
    named = []
    for concept in concepts:
        concept_id = concept[0]
        named.append((concept_id, index_to_data[concept_id], concept[1]))
    return named

In [None]:
# Add the concept name to every candidate.
# NOTE(review): query_concept_name looks up the first tuple element as a concept
# ID — confirm the reloaded `nearest_concepts` tuples start with the ID.
sigtap_meds['nearest_concepts'] = sigtap_meds['nearest_concepts'].progress_apply(query_concept_name)

In [None]:
sigtap_procedures['nearest_concepts'] = sigtap_procedures['nearest_concepts'].progress_apply(query_concept_name)

In [None]:
# Export the candidates with names attached.
sigtap_meds.to_csv('sigtap_meds_candidates_names.csv', sep='\t', index=False)

In [None]:
sigtap_procedures.to_csv('sigtap_procedures_candidates_names.csv', sep='\t', index=False)

#### Fuzzy matching baseline

In [None]:
!pip install thefuzz==0.20.0

In [None]:
from thefuzz import fuzz
import jellyfish

In [None]:
def calculate_fuzzy_distance(str1, str2):
    """Score two strings with ``fuzz.token_set_ratio`` after lower-casing both.

    Despite the name, the baseline below treats the value as a similarity: the
    concepts with the largest scores are kept.
    """
    first = str1.lower()
    second = str2.lower()
    # Other scorers tried here: fuzz.ratio, fuzz.partial_ratio,
    # fuzz.token_sort_ratio, jellyfish.levenshtein_distance.
    return fuzz.token_set_ratio(first, second)

In [None]:
def rank_candidates(row, top_k=50):
    """Fuzzy-matching baseline: rank every OMOP concept against the row's name.

    No vector search is involved: ``row['Name']`` is scored against each
    ``concept_name`` in the module-level ``omop_df`` with
    ``calculate_fuzzy_distance`` and the ``top_k`` best are kept.

    Note: this definition replaces the candidate-based ``rank_candidates``
    defined earlier in the notebook.

    Args:
        row: Row with ``Name`` and ``conceptId``.
        top_k: Number of best-scoring concepts to keep (default 50).

    Returns:
        ``(distances, matched)``: ``distances`` is a list of
        ``(concept_id, concept_name, score)`` with the highest scores first;
        ``matched`` is True when an annotated concept ID is among them.
    """
    name = row['Name']

    scores = omop_df['concept_name'].apply(
        lambda concept_name: calculate_fuzzy_distance(name, concept_name)
    )
    # Score on a small scratch frame so the shared `omop_df` is not modified
    # (no leftover `fuzzy_distance` column from the last processed row).
    scored = omop_df[['concept_id', 'concept_name']].assign(fuzzy_distance=scores)

    distances = [
        (candidate.concept_id, candidate.concept_name, candidate.fuzzy_distance)
        for _, candidate in scored.nlargest(top_k, 'fuzzy_distance').iterrows()
    ]

    ranked_ids = {candidate[0] for candidate in distances}

    ranked_matched = bool(ranked_ids.intersection(set(row.conceptId))) if row.conceptId else False

    return (distances, ranked_matched)

In [None]:
# Baseline run: rank the whole OMOP table for every medicine (no vector search).
# Assigning these columns overwrites any `ranked_concepts` / `ranked_matched`
# values already present in `sigtap_meds`.
sigtap_meds[['ranked_concepts', 'ranked_matched']] = sigtap_meds.progress_apply(rank_candidates, axis=1, result_type='expand')

In [None]:
print('Number of matches (filtered candidates) = ', len(sigtap_meds[sigtap_meds.ranked_matched == True]))

In [None]:
sigtap_meds.to_csv('sigtap_meds_fuzzy_candidates.csv', sep='\t', index=False)