# 🧠 RAG Search Demo Without ChromaDB

This notebook demonstrates how to perform semantic similarity search over clinical notes **without using ChromaDB**, using a CSV file with precomputed embeddings and documents.

## Step 1: Load consolidated CSV and decode vectors
We decode the hex-encoded embeddings back into vectors and load document metadata.

In [5]:
import csv
import struct
import binascii
import numpy as np
import pandas as pd
import json

def load_consolidated_vectors(filepath):
    documents = []
    embeddings = []
    metadata = []

    with open(filepath, newline='', encoding="utf-8") as csvfile:
        # reader = csv.reader(csvfile)
        reader = pd.read_csv("consolidated_table.csv")

        for row in reader:
            patient_num, visit_date, doc_id, embed_id, hex_str, dtype, json_doc = row
            hex_data = hex_str[2:] if hex_str.startswith("0x") else hex_str
            binary_data = binascii.unhexlify(hex_data)
            vector = struct.unpack(f'{len(binary_data) // 4}f', binary_data)
            embeddings.append(np.array(vector))
            documents.append(json.loads(json_doc)["chroma:document"])
            metadata.append({
                "patient_num": patient_num,
                "visit_date": visit_date,
                "doc_id": doc_id,
                "embedding_id": embed_id
            })
    return documents, embeddings, metadata

# Load your file here
csv_path = "./data_prep/patient_notes_hex/consolidated_table.csv"
documents, vectors, metadata = load_consolidated_vectors(csv_path)
print(f"Loaded {len(documents)} clinical notes with embeddings.")

ValueError: too many values to unpack (expected 7)

## Step 2: Embed your query using AzureOpenAIEmbeddings

In [None]:
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os

load_dotenv()

embedding_model = AzureOpenAIEmbeddings(
    model=os.getenv("AZURE_EMBEDDING_MODEL"),
    azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT"),
    api_version=os.getenv("AZURE_EMBEDDING_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_EMBEDDING_ENDPOINT"),
    azure_ad_token_provider=None  # Replace with real token provider if needed
)

query = "Who has asthma and is taking Fluticasone?"
query_vector = embedding_model.embed_query(query)

## Step 3: Perform cosine similarity search

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_similar(query_vector, all_vectors, top_k=10, score_threshold=0.45):
    similarities = cosine_similarity([query_vector], all_vectors)[0]
    top_indices = similarities.argsort()[::-1]
    results = []
    for idx in top_indices:
        score = similarities[idx]
        if score < score_threshold:
            continue
        results.append((idx, score))
        if len(results) >= top_k:
            break
    return results

results = search_similar(query_vector, vectors, top_k=10, score_threshold=0.45)
print(f"Top {len(results)} results above threshold:")

## Step 4: Display Retrieved Documents

In [None]:
for rank, (idx, score) in enumerate(results, 1):
    print(f"Document {rank}:")
    print(f"  Relevance Score: {score:.6f}")
    print(f"  Patient Num: {metadata[idx]['patient_num']}")
    print(f"  Visit Date: {metadata[idx]['visit_date']}")
    print(f"  Document ID: {metadata[idx]['doc_id']}")
    print(f"  Excerpt: {documents[idx][:500]}...")
    print("-" * 100)

## (Optional) Step 5: Visualize Similarity Scores as Heatmap

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sim_matrix = cosine_similarity([query_vector], vectors)

plt.figure(figsize=(12, 4))
sns.heatmap(sim_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False)
plt.title("Query-to-All Vectors Cosine Similarity")
plt.show()