# Keyword Similarity Search over Medical Terminology
## Two Vector Search Tools: FAISS (library) and Chroma (database)
### 1. Vector library: FAISS


In [1]:
# %pip install faiss-cpu==1.7.4 chromadb==0.3.21
# !pip install sentence_transformers

In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import pipeline

# Load the "gamino/wiki_medical_terms" dataset, caching it under ../datasets/.
med_dataset = load_dataset(
    "gamino/wiki_medical_terms",
    cache_dir="../datasets/",
)
med_dataset

Found cached dataset parquet (C:/Users/Sealion/Desktop/SOS_JOB/datasets/gamino___parquet/gamino--wiki_medical_terms-a58e77d291ee0a19/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['page_title', 'page_text', '__index_level_0__'],
        num_rows: 6861
    })
})

In [3]:
# Turn the train split into a DataFrame and preview its first five rows.
med_df = pd.DataFrame.from_dict(data=med_dataset["train"])
med_df.head()

Unnamed: 0,page_title,page_text,__index_level_0__
0,Paracetamol poisoning,"Paracetamol poisoning, also known as acetamino...",0
1,Acromegaly,Acromegaly is a disorder that results from exc...,1
2,Actinic keratosis,"Actinic keratosis (AK), sometimes called solar...",2
3,Congenital adrenal hyperplasia,Congenital adrenal hyperplasia (CAH) is a grou...,3
4,Adrenocortical carcinoma,Adrenocortical carcinoma (ACC) is an aggressi...,4


In [4]:
# Full article text of the row labelled 1.
med_df.loc[1, "page_text"]

'Acromegaly is a disorder that results from excess growth hormone (GH) after the growth plates have closed. The initial symptom is typically enlargement of the hands and feet. There may also be an enlargement of the forehead, jaw, and nose. Other symptoms may include joint pain, thicker skin, deepening of the voice, headaches, and problems with vision. Complications of the disease may include type 2 diabetes, sleep apnea, and high blood pressure.Acromegaly is usually caused by the pituitary gland producing excess growth hormone. In more than 95% of cases the excess production is due to a benign tumor, known as a pituitary adenoma. The condition is not inherited from a persons parents. Acromegaly is rarely due to a tumor in another part of the body. Diagnosis is by measuring growth hormone after a person has consumed a glucose solution, or by measuring insulin-like growth factor I in the blood. After diagnosis, medical imaging of the pituitary is carried out to determine if an adenoma i

In [5]:
# Expose the row index as an explicit "id" column.
med_df = med_df.assign(id=med_df.index)
display(med_df)

Unnamed: 0,page_title,page_text,__index_level_0__,id
0,Paracetamol poisoning,"Paracetamol poisoning, also known as acetamino...",0,0
1,Acromegaly,Acromegaly is a disorder that results from exc...,1,1
2,Actinic keratosis,"Actinic keratosis (AK), sometimes called solar...",2,2
3,Congenital adrenal hyperplasia,Congenital adrenal hyperplasia (CAH) is a grou...,3,3
4,Adrenocortical carcinoma,Adrenocortical carcinoma (ACC) is an aggressi...,4,4
...,...,...,...,...
6856,Gephyrophobia,Gephyrophobia is the anxiety disorder or speci...,7271,6856
6857,Coronary artery bypass surgery,"Coronary artery bypass surgery, also known as ...",7272,6857
6858,Unemployment,"Unemployment, according to the OECD (Organisat...",7273,6858
6859,Surgical instrument,A surgical instrument is a tool or device for ...,7274,6859


In [6]:
from sentence_transformers import InputExample
# Work on the first 1000 articles only.
med_data = med_df.head(1000)


def example_create_fn(doc1: str) -> InputExample:
    """Wrap a single document's text in an ``InputExample``.

    Parameters
    ----------
    doc1 : str
        The text of one document (a ``page_text`` value).

    Returns
    -------
    InputExample
        An example whose ``texts`` list holds only ``doc1``.
    """
    return InputExample(texts=[doc1])


# Only the "page_text" column is needed, so map over it directly instead of
# iterating over whole rows.
faiss_train_examples = med_data["page_text"].map(example_create_fn).tolist()

In [7]:
# faiss_train_examples

In [8]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; its files are cached under ../datasets/.
model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="../datasets/")

In [9]:
# NOTE: despite "title" in the name, these are embeddings of the page_text column.
faiss_title_embedding = model.encode(med_data["page_text"].tolist())
# (number of documents, embedding dimension)
faiss_title_embedding.shape

(1000, 384)

In [10]:
# Show the embedding matrix (NumPy abbreviates the repr of large arrays).
faiss_title_embedding

array([[ 0.05954915, -0.0980249 , -0.00072011, ..., -0.00089188,
        -0.00025767,  0.04045081],
       [-0.02811945,  0.02960755, -0.00459793, ..., -0.00498677,
         0.03072072, -0.0202143 ],
       [-0.04736273, -0.047802  ,  0.00530074, ..., -0.02512635,
        -0.01709105,  0.12578441],
       ...,
       [ 0.00785826,  0.02391542, -0.03233958, ..., -0.02174725,
        -0.08354437,  0.02829496],
       [ 0.01757784, -0.02052802,  0.0084214 , ...,  0.0577838 ,
         0.11799286,  0.02630357],
       [ 0.09343956,  0.02290077, -0.07832339, ..., -0.00363539,
         0.06051032, -0.04366893]], dtype=float32)

In [11]:
import numpy as np
import faiss
## Save embedding vectors to a FAISS index
# Frame keyed by "id" (the column is kept too) so ids returned by FAISS can be
# resolved with .loc.
med_data_to_index = med_data.set_index(["id"], drop=False)
# FAISS ids are 64-bit integers; request int64 explicitly because the width of
# "int" depends on the platform / NumPy version.
id_index = np.ascontiguousarray(med_data["id"].to_numpy(), dtype=np.int64)

# normalize_L2 works in place, so normalise a copy and keep the raw embeddings.
content_encoded_normalized = faiss_title_embedding.copy()
faiss.normalize_L2(content_encoded_normalized)
## Build index
# Inner product over L2-normalised vectors; IndexIDMap lets us store our own ids.
index_content = faiss.IndexIDMap(faiss.IndexFlatIP(content_encoded_normalized.shape[1]))
index_content.add_with_ids(content_encoded_normalized, id_index)

In [12]:
def search_content(query, pdf_to_index, k=100):
    """Return the rows of ``pdf_to_index`` most similar to ``query``.

    The query is embedded with the notebook-level ``model``, L2-normalised and
    searched against the notebook-level ``index_content`` (an inner-product
    index over L2-normalised vectors).

    Parameters
    ----------
    query : str
        Free-text query to embed.
    pdf_to_index : pandas.DataFrame
        Frame whose index labels are the ids stored in ``index_content``;
        hits are looked up with ``.loc``.
    k : int, default 100
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows in the order returned by the index, with an added
        ``"similarities"`` column. Fewer than ``k`` rows are returned when the
        index holds fewer than ``k`` vectors.
    """
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    # We set k to limit the number of vectors we want to return
    scores, labels = index_content.search(query_vector, k)

    # FAISS pads the result with id -1 when it cannot find k neighbours;
    # drop those placeholders so the .loc lookup below cannot fail on them.
    hits = [
        (int(label), float(score))
        for label, score in zip(labels[0], scores[0])
        if label != -1
    ]
    ids = [label for label, _ in hits]
    similarities = [score for _, score in hits]

    # assign() returns a new frame instead of writing into a .loc selection.
    return pdf_to_index.loc[ids].assign(similarities=similarities)

In [13]:
## Search the indexed page texts for content related to the keyword 'high blood pressure'.
# Pass the id-indexed frame: search_content resolves FAISS ids by index label,
# so the lookup should not rely on row positions happening to equal the ids.
display(search_content("high blood pressure", med_data_to_index))

Unnamed: 0,page_title,page_text,__index_level_0__,id,similarities
997,Hypotension,Hypotension is low blood pressure. Blood press...,997,997,0.543309
730,Reflex bradycardia,Reflex bradycardia is a bradycardia (decrease ...,730,730,0.458467
682,Orthostatic hypotension,"Orthostatic hypotension, also known as postura...",682,682,0.427391
749,Primary aldosteronism,"Primary aldosteronism (PA), also known as prim...",749,749,0.426620
292,Pulmonary hypertension,Pulmonary hypertension (PH or PHTN) is a condi...,292,292,0.406415
...,...,...,...,...,...
30,Beta blocker,"Beta blockers, also spelled β-blockers, are a ...",30,30,0.192487
227,Malignant hyperthermia,Malignant hyperthermia (MH) is a type of sever...,227,227,0.190498
475,Livedo reticularis,Livedo reticularis is a common skin finding co...,475,475,0.190146
487,Precordium,"In anatomy, the precordium or praecordium is t...",487,487,0.189950


In [14]:
# Step-by-step version of the search: embed the query as a batch of one,
# then L2-normalise it in place, as was done for the indexed vectors.
query_vector = model.encode(
    ["high blood pressure"],
)
faiss.normalize_L2(query_vector)

In [15]:
# Ten nearest neighbours; the result is a (similarities, ids) pair of 2-D arrays.
top_k = index_content.search(query_vector, 10)
print(top_k)

# There is a single query, so take row 0 of each array as plain Python values.
ids = [int(hit_id) for hit_id in top_k[1][0]]
print(ids)

similarities = [float(score) for score in top_k[0][0]]
print(similarities)

(array([[0.54330915, 0.45846665, 0.42739114, 0.4266203 , 0.40641522,
        0.38544655, 0.3498581 , 0.34232524, 0.34155926, 0.32575178]],
      dtype=float32), array([[997, 730, 682, 749, 292, 856, 665, 558, 506,  69]], dtype=int64))
[997, 730, 682, 749, 292, 856, 665, 558, 506, 69]
[0.5433091521263123, 0.45846664905548096, 0.4273911416530609, 0.4266203045845032, 0.4064152240753174, 0.38544654846191406, 0.3498581051826477, 0.34232524037361145, 0.3415592610836029, 0.32575178146362305]


In [16]:
# Look the hits up by id and attach their similarity scores in one expression.
results = med_data_to_index.loc[ids].assign(similarities=similarities)
display(results)

Unnamed: 0_level_0,page_title,page_text,__index_level_0__,id,similarities
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
997,Hypotension,Hypotension is low blood pressure. Blood press...,997,997,0.543309
730,Reflex bradycardia,Reflex bradycardia is a bradycardia (decrease ...,730,730,0.458467
682,Orthostatic hypotension,"Orthostatic hypotension, also known as postura...",682,682,0.427391
749,Primary aldosteronism,"Primary aldosteronism (PA), also known as prim...",749,749,0.42662
292,Pulmonary hypertension,Pulmonary hypertension (PH or PHTN) is a condi...,292,292,0.406415
856,Secondary hypertension,"Secondary hypertension (or, less commonly, ine...",856,856,0.385447
665,Hyperproteinemia,Hyperproteinemia is the state of having overly...,665,665,0.349858
558,Pseudohyperaldosteronism,Pseudohyperaldosteronism (also pseudoaldostero...,558,558,0.342325
506,Tachycardia,"Tachycardia, also called tachyarrhythmia, is a...",506,506,0.341559
69,Altitude sickness,"Altitude sickness, the mildest form being acut...",69,69,0.325752


### 2. Vector Database: Chroma

In [59]:
# !pip install chromadb openai 
# cache_dir="../datasets/"

In [60]:
# Load the medical-transcription dataset for the Chroma section.
# This rebinds `med_dataset`, which previously held the wiki medical-terms data.
med_dataset = load_dataset(
    "rungalileo/medical_transcription_4",
    cache_dir="../datasets/",
)
med_dataset

Found cached dataset parquet (C:/Users/Sealion/Desktop/SOS_JOB/datasets/rungalileo___parquet/rungalileo--medical_transcription_4-cc509920750fa75b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 4499
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})

In [61]:
# Train split as a DataFrame, with a five-row preview.
med_sample = pd.DataFrame.from_dict(data=med_dataset["train"])
med_sample.head()

Unnamed: 0,id,text,label
0,0,"2-D STUDY,1. Mild aortic stenosis, widely calc...",0
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...",3
2,2,"CHIEF COMPLAINT:, The patient comes for three...",0
3,3,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...",3
4,4,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...",0


In [62]:
# Human-readable names for the integer labels.
class_dict = {0: "Medical Records", 1:"Other", 2:"Internal Medicine", 3: "Surgery"}

# Add the mapped class, keep the three columns of interest, and rename
# "text" to "document" -- all in a single chained expression.
med_sample = (
    med_sample
    .assign(**{"class": med_sample["label"].map(class_dict)})
    .loc[:, ["id", "text", "class"]]
    .rename(columns={"text": "document"})
)
med_sample

Unnamed: 0,id,document,class
0,0,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Medical Records
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...",Surgery
2,2,"CHIEF COMPLAINT:, The patient comes for three...",Medical Records
3,3,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...",Surgery
4,4,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...",Medical Records
...,...,...,...
4494,4494,"CHIEF COMPLAINT:, Headaches.,HEADACHE HISTORY:...",Medical Records
4495,4495,"DATE OF INJURY : October 4, 2000,DATE OF EXAMI...",Medical Records
4496,4496,"PREOPERATIVE DIAGNOSES:,1. Lumbar osteomyelit...",Other
4497,4497,"PREOPERATIVE DIAGNOSIS:, Prostate cancer.,POS...",Surgery


In [63]:
# Count how many transcriptions fall into each mapped class.
med_sample['class'].value_counts()

Surgery              1442
Medical Records      1126
Internal Medicine    1040
Other                 891
Name: class, dtype: int64

In [64]:
import chromadb
from chromadb.config import Settings
# Chroma client configured with ../datasets/ as its persist directory.
# NOTE(review): on chromadb 0.3.x the persist directory may only take effect
# together with chroma_db_impl="duckdb+parquet" -- confirm for the installed version.
chroma_client = chromadb.Client(
    Settings(persist_directory="../datasets/"),
)

In [65]:
collection_name = "blood"
# If the collection was created before, it must be deleted first. Check every
# existing collection (not just the first one listed), then always create a
# fresh one so that `collection` is defined on every run of this cell.
if any(
    existing.name == collection_name
    for existing in chroma_client.list_collections()
):
    chroma_client.delete_collection(name=collection_name)

print(f"Creating collection: '{collection_name}'")
collection = chroma_client.create_collection(name=collection_name)

Creating collection: 'blood'


In [66]:
# Index only the first `n_docs` transcriptions. Documents, metadata and ids are
# all derived from the same slice so their lengths always agree, even if the
# frame holds fewer than `n_docs` rows.
n_docs = 100
chroma_batch = med_sample.iloc[:n_docs]
collection.add(
    documents=chroma_batch["document"].tolist(),
    metadatas=[{"topic": topic} for topic in chroma_batch["class"].tolist()],
    ids=[f"id{x}" for x in range(len(chroma_batch))],
)

In [67]:
import json
# Ten nearest documents for the query, pretty-printed as JSON.
results = collection.query(
    query_texts=["high blood pressure"],
    n_results=10,
)
print(json.dumps(results, indent=4))

{
    "ids": [
        [
            "id63",
            "id73",
            "id23",
            "id19",
            "id70",
            "id57",
            "id26",
            "id0",
            "id11",
            "id59"
        ]
    ],
    "distances": [
        [
            1.4575092792510986,
            1.4775474071502686,
            1.5190160274505615,
            1.5684528350830078,
            1.5876195430755615,
            1.5903375148773193,
            1.631920576095581,
            1.670737862586975,
            1.6719961166381836,
            1.6817744970321655
        ]
    ],
    "metadatas": [
        [
            {
                "topic": "Medical Records"
            },
            {
                "topic": "Medical Records"
            },
            {
                "topic": "Internal Medicine"
            },
            {
                "topic": "Internal Medicine"
            },
            {
                "topic": "Internal Medicine"
            },
  

In [68]:
## Query: restrict the search to documents whose "topic" metadata is "Internal Medicine".
collection.query(
    query_texts=["fever"],
    n_results=10,
    where={"topic": "Internal Medicine"},
)

{'ids': [['id36',
   'id10',
   'id33',
   'id40',
   'id23',
   'id19',
   'id26',
   'id77',
   'id86',
   'id22']],
 'distances': [[1.3553173542022705,
   1.4489495754241943,
   1.6044704914093018,
   1.60764741897583,
   1.6720105409622192,
   1.6940916776657104,
   1.700006127357483,
   1.7105464935302734,
   1.738896369934082,
   1.747157335281372]],
 'metadatas': [[{'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'},
   {'topic': 'Internal Medicine'}]],
 'embeddings': None,
 'documents': [['CHIEF COMPLAINT:,  Itchy rash.,HISTORY OF PRESENT ILLNESS: , This 34-year-old gentleman awoke this morning noting some itchiness to his back and then within very a short period of time realized that he had an itchy rash all over his torso