# Import Libraries

In [1]:
import time
from datasets import load_from_disk
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
import faiss
from usearch.index import Index

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
# Load the medical-terms dataset from its on-disk directory.
medical_dataset_dir = "/workspaces/senetence-transformer-in-action/medical_terms"
medical_dataset = load_from_disk(medical_dataset_dir)

```Text
Load the int8 and binary indices. The int8 index is loaded as a view to save memory, since we never actually search it.
The int8 embeddings are needed to rescore the fetched documents. This is done by taking the inner product of each int8 document embedding with the float32 embedding of the query.
```
[Efficient Passage Retrieval with Hashing for Open-domain Question Answering](https://arxiv.org/abs/2106.00882)

In [3]:
# The binary index is the one that gets searched. The int8 index is never searched,
# only read from for rescoring, so it is restored as a view to save memory.
int8_index_path = "/workspaces/senetence-transformer-in-action/medicine_details_int8_usearch.index"
binary_index_path = "/workspaces/senetence-transformer-in-action/medicine_details.index"

int8_view = Index.restore(int8_index_path, view=True)
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(binary_index_path)

# Load Model to Generate Embeddings

In [4]:
# Prompt registered under the name "retrieval"; it is made the default below,
# so it is applied to encode() calls that do not pass their own prompt.
retrieval_prompts = {
    "retrieval": "Represent this sentence for searching relevant passages: ",
}

model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    prompts=retrieval_prompts,
    default_prompt_name="retrieval",
)


Default prompt name is set to 'retrieval'. This prompt will be applied to all `encode()` calls, except if `encode()` is called with `prompt` or `prompt_name` parameters.


In [5]:
def search(query: str, top_k: int = 5, rescore_multiplier: int = 1) -> pd.DataFrame:
    """Retrieve the best-matching medicines for a free-text query.

    The query is embedded, quantized to ubinary and used to fetch candidates
    from the binary faiss index. The candidates are then rescored with the
    float32 query embedding against their int8 document embeddings, and the
    highest-scoring ``top_k`` are returned.

    Args:
        query: Text to search for.
        top_k: Maximum number of rows to return. Must be >= 1.
        rescore_multiplier: Oversampling factor for the binary search;
            ``top_k * rescore_multiplier`` candidates are fetched before
            rescoring. The default of 1 fetches exactly ``top_k`` candidates.
            Must be >= 1.

    Returns:
        A DataFrame with columns ``Score``, ``Medicine_Name``, ``Composition``,
        ``Uses`` and ``Side_effects``, ordered by descending score. It is empty
        (same columns) when the index returns no valid candidates.

    Raises:
        ValueError: If ``top_k`` or ``rescore_multiplier`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    if rescore_multiplier < 1:
        raise ValueError(f"rescore_multiplier must be >= 1, got {rescore_multiplier}")

    columns = ["Score", "Medicine_Name", "Composition", "Uses", "Side_effects"]

    # 1. Embed the query as float32
    query_embedding = model.encode(query)

    # 2. Quantize the query to ubinary, the format the binary faiss index is searched with
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")

    # 3. Search the binary index, oversampling by rescore_multiplier
    _scores, binary_ids = binary_index.search(query_embedding_ubinary, top_k * rescore_multiplier)
    binary_ids = binary_ids[0]
    # faiss pads the result with -1 when it finds fewer hits than requested;
    # those are not real document ids and must not reach the lookups below.
    binary_ids = binary_ids[binary_ids >= 0]
    if binary_ids.size == 0:
        return pd.DataFrame(columns=columns)

    # 4. Load the corresponding int8 embeddings, used only to rescore the candidates
    int8_embeddings = int8_view[binary_ids].astype(int)

    # 5. Rescore the candidates using the float32 query embedding and the int8 document embeddings
    scores = query_embedding @ int8_embeddings.T

    # 6. Sort by descending score and keep the top_k
    order = scores.argsort()[::-1][:top_k]
    rows = []
    for doc_id, score in zip(binary_ids[order].tolist(), scores[order]):
        record = medical_dataset[doc_id]  # read each dataset row once
        rows.append(
            {
                "Score": round(score, 2),
                "Medicine_Name": record["Medicine Name"],
                "Composition": record["Composition"],
                "Uses": record["Uses"],
                "Side_effects": record["Side_effects"],
            }
        )

    return pd.DataFrame(rows, columns=columns)


In [6]:
# Example query; the bare last expression displays the resulting DataFrame.
example_query = 'Dizziness Back pain Sinus infection Diarrhea'
search(example_query)

Unnamed: 0,Score,Medicine_Name,Composition,Uses,Side_effects
0,3835.73,Redotil 100mg Capsule,Racecadotril (100mg),Treatment of Diarrhea,Vomiting Dizziness General discomfort Headache
1,3784.72,Sinus 77 Tablet,Caffeine (15mg) + Chlorpheniramine Maleate (4m...,Treatment of Common cold,Nausea Insomnia difficulty in sleeping Restles...
2,3215.35,Mesalo OD Tablet,Mesalazine (1200mg),Ulcerative colitis Crohn’s disease,Stomach pain epigastric pain Flatulence Headac...
3,3010.67,L-Cin Syrup,Levofloxacin (125mg),Treatment of Bacterial infections,Headache Dizziness Nausea Constipation Diarrhea
4,2812.04,Anleo-DSR Capsule,Domperidone (30mg) + Pantoprazole (40mg),Treatment of Gastroesophageal reflux disease (...,Diarrhea Stomach pain Flatulence Dryness in mo...
