### Create vector embeddings and a FAISS index from the knowledge base

In [1]:
import json
import faiss
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input directory: load_knowledge_base() reads knowledge_base.json from here.
# Both paths are relative, so they resolve against the notebook's working directory.
RAG_DIR = Path("../data/rag/")
# Output directory for the FAISS index, metadata pickle and config file.
FAISS_DIR = Path("../data/faiss/")
# Create the output directory (and any missing parents) up front; a no-op if it already exists.
FAISS_DIR.mkdir(parents=True, exist_ok=True)

# Name of the sentence-transformers model used to embed the knowledge base.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions, fast
# Alternative: "sentence-transformers/all-mpnet-base-v2"  # 768 dimensions, better quality

In [3]:
def load_knowledge_base(kb_path=None):
	"""Load the knowledge base JSON file and return its parsed contents.

	Args:
		kb_path: Optional path (str or Path) of the JSON file to read.
			When omitted, ``RAG_DIR / "knowledge_base.json"`` is used, which
			is the location this function always read from before the
			parameter existed.

	Returns:
		Whatever ``json.load`` produces for the file. The rest of this
		notebook iterates over it and indexes each item by string keys.

	Raises:
		FileNotFoundError: If the file does not exist.
	"""
	print("Loading knowledge base...")
	if kb_path is None:
		kb_path = RAG_DIR / "knowledge_base.json"
	else:
		# Accept plain strings as well as Path objects.
		kb_path = Path(kb_path)

	if not kb_path.exists():
		raise FileNotFoundError(
			f"Knowledge base not found at {kb_path}. "
			"Please run data preprocessing first."
		)

	with open(kb_path, 'r', encoding='utf-8') as f:
		knowledge_base = json.load(f)

	print(f"Loaded {len(knowledge_base)} knowledge base entries.")
	return knowledge_base

In [4]:
def create_embeddings(knowledge_base):
  """Embed every knowledge-base chunk with the configured sentence-transformers model.

  Each chunk is embedded as the string "<drug_name> - <category>: <text>".

  Args:
    knowledge_base: Iterable of chunk dicts. Every chunk must provide the
      'drug_name', 'category' and 'text' keys; 'section_title' and 'source'
      are optional.

  Returns:
    A tuple ``(embeddings, metadata, model)``: the array returned by
    ``model.encode``, a list of per-chunk metadata dicts in the same order
    as the embedded texts, and the loaded SentenceTransformer instance.

  Raises:
    KeyError: If a chunk lacks one of the required keys.
  """
  print("Creating embeddings...")
  encoder = SentenceTransformer(EMBEDDING_MODEL)
  print(f"Model loaded dimension: {encoder.get_sentence_embedding_dimension()}")

  # Build the strings to embed and, in lockstep, the metadata used to
  # display a hit later on (row i of the embeddings <-> metadata[i]).
  texts, metadata = [], []
  for entry in knowledge_base:
    drug, category, body = entry['drug_name'], entry['category'], entry['text']
    texts.append(f"{drug} - {category}: {body}")
    metadata.append(dict(
      drug_name=drug,
      category=category,
      # .get() falls back to '' only when the key is absent from the chunk.
      section_title=entry.get('section_title', ''),
      text=body,
      source=entry.get('source', ''),
    ))

  print(f"Encoding {len(texts)} texts...")
  vectors = encoder.encode(
    texts,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True,
  )

  print(f"Created embeddings with shape: {vectors.shape}")
  return vectors, metadata, encoder

In [5]:
def build_faiss_index(embeddings):
  """Build a FAISS L2 index over ``embeddings`` and return it.

  Fewer than 10,000 vectors get an exact ``IndexFlatL2``; larger sets get an
  ``IndexIVFFlat`` that is trained on the same vectors before they are added.

  Args:
    embeddings: 2-D array-like of shape (n_vectors, dimension). It is
      converted to a C-contiguous float32 array before being handed to FAISS.

  Returns:
    The populated FAISS index.

  Raises:
    ValueError: If ``embeddings`` is not 2-D (for example the result of
      encoding an empty list of texts).
  """
  print("Building FAISS index...")
  # FAISS' Python wrappers require C-contiguous float32 input; coercing here
  # makes float64 arrays, array slices and nested lists work as well.
  embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
  if embeddings.ndim != 2:
    raise ValueError(
      "embeddings must be a 2-D array of shape (n_vectors, dimension), "
      f"got shape {embeddings.shape}"
    )

  n_embeddings, dimension = embeddings.shape
  print(f"Dimension: {dimension}")
  print(f"Number of embeddings: {n_embeddings}")

  if n_embeddings < 10000:
    index = faiss.IndexFlatL2(dimension)  # exact search
    print("Using IndexFlatL2 for exact search.")
  else:
    # for larger datasets, use an approximate index like IndexIVFFlat
    nlist = min(100, n_embeddings // 10)  # number of clusters
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
    print(f"Using IndexIVFFlat for approximate search with {nlist} clusters.")
    # NOTE(review): IVF indexes only probe `index.nprobe` clusters per query
    # (FAISS default is presumably 1); consider raising it if recall matters.
    # An IVF index must be trained before vectors can be added.
    print("Training the index...")
    index.train(embeddings)

  # add vectors to the index
  print("Adding embeddings to the index...")
  index.add(embeddings)
  print(f"FAISS index build with {index.ntotal} vectors.")

  return index

In [6]:
def save_index_and_metadata(index, metadata, model):
  """Persist the FAISS index, the chunk metadata and a config summary under FAISS_DIR.

  Three files are written, in this order:
    * drug_knowledge.index - the index, via ``faiss.write_index``
    * metadata.pkl         - ``metadata``, pickled
    * config.json          - model name, embedding dimension, chunk count
                             and the index class name

  Args:
    index: FAISS index to serialise.
    metadata: Per-chunk metadata; its length is recorded as 'num_chunks'.
    model: Embedding model; only ``get_sentence_embedding_dimension()`` is used.
  """
  print("Saving FAISS index and metadata...")

  # 1) the index itself (faiss.write_index is given a plain string path)
  index_file = FAISS_DIR / "drug_knowledge.index"
  faiss.write_index(index, str(index_file))
  print(f"FAISS index saved to {index_file} ")

  # 2) the metadata list, pickled next to the index
  metadata_file = FAISS_DIR / "metadata.pkl"
  with metadata_file.open('wb') as pickle_out:
    pickle.dump(metadata, pickle_out)
  print(f"metadata saved to {metadata_file} ")

  # 3) a small JSON description of how the index was built
  settings = dict(
    embedding_model=EMBEDDING_MODEL,
    embedding_dimension=model.get_sentence_embedding_dimension(),
    num_chunks=len(metadata),
    index_type=type(index).__name__,
  )

  config_file = FAISS_DIR / "config.json"
  with config_file.open('w') as json_out:
    json.dump(settings, json_out, indent=2)
  print(f"Configuration saved to {config_file}")

In [7]:
def test_retrieval(index, metadata, model):
  test_queries = [
    "what is the dosage for ibuprofen",
    "side effects of acetaminophen",
    "how to take amoxicillin",
    "Contraindications of aspirin"
  ]

  for query in test_queries:
    print(f"Query: {query}")

    # encode query 
    query_embedding = model.encode([query])

    # search top K
    k = 3
    distance, indices = index.search(query_embedding, k)

    # display results
    for i, (dist, idx) in enumerate(zip(distance[0], indices[0])):
      if idx < len(metadata):
        result = metadata[idx]
        print(f"[Result {i+1}] Distance: {dist:.4f}")
        print(f"Drug: {result['drug_name']}")
        print(f"Category: {result['category']}")
        print(f"Text: {result['text'][:200]}...")

### Run all functions above

In [8]:
# End-to-end pipeline: load -> embed -> index -> save -> smoke-test.
# Each step consumes the previous step's result, so the order matters.

# load knowledge base
knowledge_base = load_knowledge_base()
# create embeddings (also returns the per-chunk metadata and the loaded model)
embeddings, metadata, model = create_embeddings(knowledge_base)
# build faiss index
index = build_faiss_index(embeddings)
# save index, metadata and config under FAISS_DIR
save_index_and_metadata(index, metadata, model)

# test retrieval: print the top hits for a few sample queries
test_retrieval(index, metadata, model)

Loading knowledge base...
Loaded 90 knowledge base entries.
Creating embeddings...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded dimension: 384
Encoding 90 texts...


Batches: 100%|██████████| 3/3 [00:00<00:00,  4.03it/s]

Created embeddings with shape: (90, 384)
Building FAISS index...
Dimension: 384
Number of embeddings: 90
Using IndexFlatL2 for exact search.
Adding embeddings to the index...
FAISS index build with 90 vectors.
Saving FAISS index and metadata...
FAISS index saved to ..\data\faiss\drug_knowledge.index 
metadata saved to ..\data\faiss\metadata.pkl 
Configuration saved to ..\data\faiss\config.json
Query: what is the dosage for ibuprofen
[Result 1] Distance: 0.8695
Drug: Ibuprofen
Category: dosage
Text: Atorvastatin Calcium Tablets, USP: • 10 mg of atorvastatin: white to off-white, oval, biconvex film coated tablets debossed with '10' on one side and 'A 53' on other side • 20 mg of atorvastatin: whit...
[Result 2] Distance: 0.8968
Drug: Ibuprofen
Category: dosage
Text: No specific antidotes for atorvastatin calcium are known. Contact Poison Control (1-800-222-1222) for latest recommendations. Due to extensive drug binding to plasma proteins, hemodialysis is not expe...
[Result 3] Distance: 


