In [None]:
# Load libraries
import os
import sys

# Assuming the notebook is in "project/notebooks" and the modules are in "project/scripts":
# the project root has to be on sys.path BEFORE `scripts` is imported below, otherwise the
# import cannot be resolved when the kernel starts in the notebooks folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

import chromadb
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from chromadb.config import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from scripts.data_loader import load_medicare_data  # project-local; needs project_root on sys.path


In [3]:
# Environment variables

# secrets.env holds the API keys; load it so the lookups below can see them.
load_dotenv("secrets.env")

# Read the API keys from the environment (each is None when the variable is not set).
openai_api_key = os.environ.get("OPENAI_API_KEY")
huggingface_api_key = os.environ.get("HUGGINGFACE_API_KEY")

In [6]:
# Load dataset: reuse the processed sample CSV when it is already on disk,
# otherwise obtain the sample from load_medicare_data().
file_path = 'data/processed/sample_ny_data.csv'

if os.path.exists(file_path):
    # Cached sample found: read it back instead of rebuilding it.
    sample_ny_data = pd.read_csv(file_path)
    print(f"Loading existing sample file: {file_path} with size: {len(sample_ny_data)}")
else:
    # NOTE(review): nothing in this cell writes `file_path`, although the message below
    # says the file is being created — presumably load_medicare_data() saves it; confirm.
    sample_ny_data = load_medicare_data()
    print(f"Creating new sample file: {file_path} with size: {len(sample_ny_data)}")

Creating new sample file: data/processed/sample_ny_data.csv with size: 50000


In [None]:
from tqdm import tqdm
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

# The LangChain Chroma wrapper is pointed at a local persist directory so the same
# index can be reused across cells/sessions.
PERSIST_DIR = "./chroma_db"

# Sentence-transformers model used to embed documents and queries.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Instantiate a Chroma client.
# NOTE(review): this client is not passed to the Chroma wrapper below, which is
# configured through persist_directory instead.
client = chromadb.Client(Settings())

# Create or load the "new_york_medicare" vector store through LangChain's Chroma class.
vectorstore = Chroma(
    collection_name="new_york_medicare",
    embedding_function=embedding_model,
    persist_directory=PERSIST_DIR,
)

  vectorstore = Chroma(


In [9]:
# Keep only the first 500 rows so the embedding steps below stay quick.
sample_ny_data = sample_ny_data.iloc[:500]

In [10]:
def _null_to_empty(value):
    """Return "" for None/NaN-like scalars, otherwise the value unchanged."""
    if value is None:
        return ""
    # Strings are never null; skipping them also keeps pd.isna on the scalar path.
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return value


def create_embedding_text(row):
    """Build the text that is embedded for one provider/service record.

    Args:
        row: Mapping-like record exposing ``.get`` (a pandas Series or a dict).
            The keys read are ``Rndrng_Prvdr_First_Name``,
            ``Rndrng_Prvdr_Last_Org_Name``, ``HCPCS_Desc``, ``Rndrng_Prvdr_City``
            and ``Rndrng_Prvdr_State_Abrvtn``.

    Returns:
        str: ``"Provider: <name>. Service: <description>. Location: <city, state>."``
        Missing keys and null values (None/NaN) contribute an empty string, so a
        row without a first name does not render as ``"nan <last name>"``.
    """
    # Construct the provider's name (combining first name and last/organization name).
    # The .get default only covers a missing key; a present-but-null cell still has to
    # be mapped to "" or it would be formatted as the literal text "nan".
    first_name = _null_to_empty(row.get("Rndrng_Prvdr_First_Name", ""))
    last_org = _null_to_empty(row.get("Rndrng_Prvdr_Last_Org_Name", ""))
    provider_name = f"{first_name} {last_org}".strip()

    # Extract key service details
    hcpcs_desc = _null_to_empty(row.get("HCPCS_Desc", ""))

    # Instead of Place_Of_Srvc (which indicates facility type), use city and state for location context.
    city = _null_to_empty(row.get("Rndrng_Prvdr_City", ""))
    state = _null_to_empty(row.get("Rndrng_Prvdr_State_Abrvtn", ""))
    # strip(", ") drops the dangling separator when either part is empty.
    location = f"{city}, {state}".strip(", ")

    # Create the embedding text that includes key information.
    embedding_text = f"Provider: {provider_name}. Service: {hcpcs_desc}. Location: {location}."
    return embedding_text

# Turn every DataFrame row into an (id, Document) pair, then split the pairs into the
# two parallel lists consumed by the upsert step.
# Each id is "<Rndrng_NPI>_<row index>"; the whole row is attached as metadata.
row_records = [
    (
        f"{row.get('Rndrng_NPI', 'unknown')}_{i}",
        Document(page_content=create_embedding_text(row), metadata=row.to_dict()),
    )
    for i, row in tqdm(sample_ny_data.iterrows(), total=len(sample_ny_data), desc="Processing rows")
]
doc_ids = [record_id for record_id, _ in row_records]
docs = [document for _, document in row_records]

# Batch upsert the documents into the vectorstore.
batch_size = 10000
# Ceiling division: exactly as many batches as needed. A plain `// batch_size + 1`
# would schedule a trailing empty batch whenever len(docs) is an exact multiple of
# batch_size (and one batch for an empty list), inflating the progress bar.
num_batches = (len(docs) + batch_size - 1) // batch_size

for batch_idx in tqdm(range(num_batches), desc="Batch Upserting"):
    start = batch_idx * batch_size
    end = start + batch_size
    # Documents and ids are sliced with the same bounds so each document keeps its id.
    batch_docs = docs[start:end]
    batch_ids = doc_ids[start:end]
    vectorstore.add_documents(documents=batch_docs, ids=batch_ids)


Processing rows: 100%|██████████| 500/500 [00:00<00:00, 17849.32it/s]
Batch Upserting: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it]


In [None]:
file_path = 'data/insurance/medicare/2022/sample_ny_data.csv'

if not os.path.exists(file_path):
    # NOTE(review): `ny_data` is not defined in any cell visible in this notebook —
    # presumably the full New York frame; confirm it is loaded before this cell runs.
    sample_ny_data = ny_data.sample(n=50000, random_state=42)  # random_state for reproducibility
    # to_csv does not create missing parent folders, so make sure the target directory exists.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    sample_ny_data.to_csv(file_path, index=False)
    print(f"Created new sample file: {file_path}")
else:
    # If file exists, read it instead of creating new sample
    sample_ny_data = pd.read_csv(file_path)
    print(f"Loading existing sample file: {file_path} with size: {len(sample_ny_data)}")

Loading existing sample file: data/insurance/medicare/2022/sample_ny_data.csv with size: 50000


In [None]:
from tqdm import tqdm
from langchain.vectorstores import Chroma as LC_Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

# Directory intended for the persisted Chroma index, so LangChain can reuse it.
# (Alternatively, the existing collection could be wrapped directly, but LangChain's
# Chroma class offers a higher-level interface.)
# NOTE(review): PERSIST_DIR is assigned here but not passed to the client created below.
PERSIST_DIR = "./chroma_db"

# Instantiate a Chroma client with default Settings.
client = chromadb.Client(Settings())

# Create or retrieve the "new_york_medicare" collection with the specified embedding function.
# NOTE(review): `local_embedding` is not defined in any cell visible in this notebook;
# on a fresh kernel this raises NameError — confirm where it is supposed to come from.
collection = client.get_or_create_collection(
    name="new_york_medicare",
    embedding_function=local_embedding,
)



Processing rows:   8%|▊         | 50000/622553 [00:01<00:22, 25244.61it/s]
Batch Upserting: 100%|██████████| 6/6 [13:16<00:00, 132.73s/it]
Building documents: 100%|██████████| 50000/50000 [00:00<00:00, 174720.88it/s]


AttributeError: 'ONNXMiniLM_L6_V2' object has no attribute 'embed_documents'

In [None]:
# Rebuild the LangChain Chroma vectorstore with the new documents.
# LangChain calls `embed_documents` on the embedding object, so it must be a LangChain
# embeddings instance: the HuggingFaceEmbeddings model (`embedding_model`) is used here
# rather than a raw chromadb embedding function, which lacks that method
# (AttributeError: ... has no attribute 'embed_documents').
# NOTE(review): no ids are passed, so documents already stored in this collection
# under explicit ids may end up duplicated — confirm that is intended.
vectorstore = LC_Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=PERSIST_DIR,
    collection_name="new_york_medicare"
)

In [None]:
### Standalone example of talking to the ChromaDB client directly (no LangChain involved).
# NOTE(review): this cell rebinds the notebook-level names `client`, `collection`,
# `documents`, `doc_ids` and `metadatas`.

# Instantiate a Chroma client.
client = chromadb.Client(Settings())

# Create or retrieve a collection with the specified embedding function.
# NOTE(review): `default_ef` is not defined in any cell visible in this notebook.
collection = client.get_or_create_collection(name="example_collection", embedding_function=default_ef)

# Example records: (id, document text, metadata).
example_records = [
    (
        "doc1",
        "Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.",
        {"category": "AI"},
    ),
    (
        "doc2",
        "Deep learning is a subset of machine learning that uses neural networks with many layers.",
        {"category": "ML"},
    ),
    (
        "doc3",
        "Natural Language Processing involves the interaction between computers and human language.",
        {"category": "NLP"},
    ),
]
documents = [text for _, text, _ in example_records]
doc_ids = [record_id for record_id, _, _ in example_records]
metadatas = [metadata for _, _, metadata in example_records]

# Add the documents to the collection; only texts are supplied, so the embeddings come
# from the collection's embedding function.
collection.add(documents=documents, metadatas=metadatas, ids=doc_ids)

# Query the collection for the two closest documents.
query_text = "What is deep learning?"
results = collection.query(query_texts=[query_text], n_results=2)

# Print out the query results.
print("Query Results:", results, sep="\n")

Query Results:
{'ids': [['doc2', 'doc1']], 'embeddings': None, 'documents': [['Deep learning is a subset of machine learning that uses neural networks with many layers.', 'Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.']], 'uris': None, 'data': None, 'metadatas': [[{'category': 'ML'}, {'category': 'AI'}]], 'distances': [[0.3444952964782715, 0.961942732334137]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [None]:
import getpass
import os
from langchain_ollama import ChatOllama
# from langchain.llms import LlamaCpp
from langchain_openai import ChatOpenAI

# Create a retriever (adjust k, the number of documents returned per query, as needed)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

MODEL_CHOICE = "deepseek"  # Change to "llama2" or "openai" as desired


def _make_openai_llm():
    """Return a gpt-4o chat model, prompting for the API key when the environment lacks one."""
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
    # api_key / base_url / organization can also be passed here directly instead of
    # relying on environment variables.
    return ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )


# One zero-argument factory per supported option, so only the selected model is built.
_LLM_FACTORIES = {
    "deepseek": lambda: ChatOllama(model="deepseek-r1", temperature=0.0),
    "llama2": lambda: ChatOllama(model="llama2", temperature=0.0),
    "openai": _make_openai_llm,
}

if MODEL_CHOICE not in _LLM_FACTORIES:
    raise ValueError("Unsupported MODEL_CHOICE. Choose from 'deepseek', 'llama2', or 'openai'.")
llm = _LLM_FACTORIES[MODEL_CHOICE]()

# RetrievalQA retrieves the relevant documents and places them into a prompt for the LLM.
# chain_type="stuff" simply concatenates the retrieved documents; for long contexts
# consider "map_reduce".
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


Query: What services did Filamer Kabigting provide?
Answer: <think>
Okay, so I need to figure out what services Filamer Kabigting provided based on the context given. Let me start by reading through all the information carefully.

Looking at each provider and their services:

1. James Kalchbrenner: Offers therapy using exercise for strength, endurance, range of motion, and flexibility, each 15 minutes in Scarsdale, NY.
2. Raymond Kruk: Provides an annual wellness visit with a personalized prevention plan (pps) followed by another visit, located in Wappingers Falls, NY.
3. Mark Krasner: Offers therapy using functional activities in Brooklyn, NY.
4. Arthur Kornblit: Similar to Raymond, offers an annual wellness visit with pps and subsequent visit in Baldwin, NY.
5. Robert Krinsky: Provides office or outpatient visits for evaluating and managing established patients without needing a healthcare professional present, located in Woodmere, NY.
6. Gemma Kaunert: Offers chronic care management

In [None]:
query = "What are some physician services that provider White,Devon of 2015 Grand Concourse offers his patients?"
# `invoke` replaces the deprecated `run`: RetrievalQA takes the question under the
# "query" key and returns the generated answer under "result".
answer = qa_chain.invoke({"query": query})["result"]
print("Query:", query)
print("Answer:", answer)
# Retrieve and print the top relevant documents for the query
# (`retriever.invoke` replaces the deprecated `get_relevant_documents`).
retrieved_docs = retriever.invoke(query)
print("Retrieved contexts:")
for idx, doc in enumerate(retrieved_docs, start=1):
    print(f"Document {idx}:")
    print(doc.page_content)
    print("-" * 50)


Query: What are some physician services that provider White,Devon of 2015 Grand Concourse offers his patients?
Answer: <think>
Okay, so I need to figure out what services Provider White, Devon of 2015 Grand Concourse offers his patients based on the context provided. Let me start by looking through each piece of data given.

First, there are several entries from Devon White. Each one mentions a service and sometimes a location in the Bronx or Cooperstown, NY. The services vary: some say "Established patient office or other outpatient visit" with different time ranges like 10-19 minutes, 30-39 minutes, etc., and another entry is an emergency department visit for high severity.

Looking at the other providers, Robert Christopher offers an emergency service in Grand Island, NY. Brian White also has an established patient office or outpatient visit taking 30-39 minutes in Cooperstown. Michael White does the same but in Geneva. Pascale White mentions a new patient office with a longer time 

In [6]:
# import time
# from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2

# ef = ONNXMiniLM_L6_V2(preferred_providers=['CUDAExecutionProvider'])

# docs = []
# for i in range(1000):
#     docs.append(f"this is a document with id {i}")

# start_time = time.perf_counter()
# embeddings = ef(docs)
# end_time = time.perf_counter()
# print(f"Elapsed time: {end_time - start_time} seconds")

In [19]:
import onnxruntime

# Show which ONNX Runtime execution providers this environment reports as available.
available_providers = onnxruntime.get_available_providers()
print(available_providers)

['AzureExecutionProvider', 'CPUExecutionProvider']
