In [4]:
# Load libraries
import os
import chromadb
from dotenv import load_dotenv
from chromadb.config import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from scripts.data_loader import load_medicare_data
# If you plan to use a HuggingFace local model, import the relevant embedding function.
# from chromadb.utils.embedding_functions import HuggingFaceEmbeddingFunction

In [5]:
# Environment variables and dataset

# Tunable inputs for this cell, named instead of being inline literals:
#   SECRETS_FILE - dotenv file that holds the API keys.
#   SAMPLE_ROWS  - number of leading rows of the dataset kept for the demo.
SECRETS_FILE = "secrets.env"
SAMPLE_ROWS = 1000

# Load environment variables from the secrets file.
load_dotenv(SECRETS_FILE)

# Retrieve API keys from environment variables (None when a key is not set).
openai_api_key = os.getenv("OPENAI_API_KEY")
huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Load the dataset and keep the first SAMPLE_ROWS rows via .head().
data = load_medicare_data().head(SAMPLE_ROWS)

In [7]:
# Only a cell's last expression is displayed, so a bare `len(data)` placed
# above `data` is silently discarded. Print the row count explicitly, then
# leave `data` as the final expression so the table is still rendered.
print(f"Rows loaded: {len(data)}")
data

Unnamed: 0,Rndrng_NPI,Rndrng_Prvdr_Last_Org_Name,Rndrng_Prvdr_First_Name,Rndrng_Prvdr_MI,Rndrng_Prvdr_Crdntls,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Ent_Cd,Rndrng_Prvdr_St1,Rndrng_Prvdr_St2,Rndrng_Prvdr_City,...,HCPCS_Desc,HCPCS_Drug_Ind,Place_Of_Srvc,Tot_Benes,Tot_Srvcs,Tot_Bene_Day_Srvcs,Avg_Sbmtd_Chrg,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Stdzd_Amt
0,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,6410 Rockledge Dr Ste 304,,Bethesda,...,Hospital observation care on day of discharge,N,F,42,44,44,288.93477273,76.932045455,58.619772727,53.307954545
1,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,6410 Rockledge Dr Ste 304,,Bethesda,...,"Initial hospital observation care per day, typ...",N,F,17,17,17,424.80411765,144.92,109.15529412,97.278823529
2,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,6410 Rockledge Dr Ste 304,,Bethesda,...,"Initial hospital observation care per day, typ...",N,F,35,35,35,686.56428571,189.99885714,151.59685714,140.73314286
3,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,6410 Rockledge Dr Ste 304,,Bethesda,...,"Initial hospital inpatient care per day, typic...",N,F,16,16,16,894.99125,100.009375,79.264375,78.499375
4,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,6410 Rockledge Dr Ste 304,,Bethesda,...,"Initial hospital inpatient care per day, typic...",N,F,12,12,12,511.915,144.20166667,112.94666667,103.72416667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1003008095,House,Robin,R,APRN NP,F,I,1406 W 5th St,Suite 201,London,...,"Injection, vitamin b-12 cyanocobalamin, up to ...",Y,O,51,96,96,4,1.6235416667,1.0775,1.1276041667
996,1003008111,Easterlin,Marie,O,M.D.,F,I,418 Eh Ct,Unit 4b,Brunswick,...,"Urinalysis, manual test",N,O,12,12,12,10,3.425,3.425,3.41
997,1003008111,Easterlin,Marie,O,M.D.,F,I,418 Eh Ct,Unit 4b,Brunswick,...,"New patient office or other outpatient visit, ...",N,O,34,34,34,271,106.71617647,70.672647059,75.081470588
998,1003008111,Easterlin,Marie,O,M.D.,F,I,418 Eh Ct,Unit 4b,Brunswick,...,Established patient office or other outpatient...,N,O,34,40,40,183,86.22075,63.75675,67.17275


In [3]:
# Embedding function
#
# Alternative 1 (disabled): OpenAI embeddings, using the API key read from secrets.env.
# embedding_function = OpenAIEmbeddingFunction(
#     api_key=openai_api_key,
#     model_name="text-embedding-ada-002",  # can be changed to any supported model
# )
#
# Alternative 2 (disabled): a local sentence-transformers model.
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L12-v2")

# Active choice: the default embedding function shipped with chromadb.
import chromadb.utils.embedding_functions as embedding_functions

default_ef = embedding_functions.DefaultEmbeddingFunction()


In [4]:
# Instantiate a Chroma client using default Settings().
client = chromadb.Client(Settings())

# Create or retrieve a collection with the specified embedding function.
collection = client.get_or_create_collection(
    name="example_collection",
    embedding_function=default_ef,
)

# Example documents, with one ID and one metadata record per document
# (the three lists are aligned by position).
documents = [
    "Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers.",
    "Natural Language Processing involves the interaction between computers and human language.",
]
doc_ids = ["doc1", "doc2", "doc3"]
metadatas = [
    {"category": "AI"},
    {"category": "ML"},
    {"category": "NLP"},
]

# Write the documents with upsert instead of add: the IDs are fixed, so
# re-running this cell against the same collection would otherwise try to add
# IDs that already exist. Upsert inserts new IDs and overwrites existing ones,
# which keeps the cell idempotent and matches the upsert used for the sample
# records later in the notebook. Embeddings are created by the collection's
# embedding function.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=doc_ids,
)

# Query the collection for the documents most relevant to a question.
query_text = "What is deep learning?"
results = collection.query(
    query_texts=[query_text],
    n_results=2,  # Number of top results to return.
)

# Print out the raw query results.
print("Query Results:")
print(results)

Query Results:
{'ids': [['doc2', 'doc1']], 'embeddings': None, 'documents': [['Deep learning is a subset of machine learning that uses neural networks with many layers.', 'Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.']], 'uris': None, 'data': None, 'metadatas': [[{'category': 'ML'}, {'category': 'AI'}]], 'distances': [[0.3444952964782715, 0.961942732334137]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [5]:

# Sample records (swap in your own documents here). Each record is a dict with
# a string "id" ("1".."5") and a "text" field.
# NOTE(review): this rebinds `data`, which the dataset-loading cell assigned
# from load_medicare_data(); after this cell `data` is a list of dicts.
data = [
    {"id": str(position), "text": sentence}
    for position, sentence in enumerate(
        (
            "Natural language processing is revolutionizing how we interact with computers.",
            "Machine learning algorithms can identify patterns in large datasets.",
            "Deep learning models have achieved remarkable results in computer vision.",
            "Vector embeddings help computers understand semantic relationships between words.",
            "Artificial intelligence is transforming industries across the globe.",
        ),
        start=1,
    )
]

# Upsert the records into `collection`: the text becomes the document, and the
# record id is used both as the Chroma ID and as an "id" metadata field.
collection.upsert(
    ids=[entry["id"] for entry in data],
    documents=[entry["text"] for entry in data],
    metadatas=[{"id": entry["id"]} for entry in data],
)


In [None]:
from langchain.vectorstores import Chroma as LC_Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

# Build a LangChain Chroma vector store persisted under PERSIST_DIR, using
# LangChain's higher-level Chroma wrapper rather than the raw collection.
# NOTE(review): the `client` created earlier is chromadb.Client(Settings())
# with no persist directory, so this store presumably does NOT share data with
# that collection even though the collection name matches - confirm.
PERSIST_DIR = "./chroma_db"
local_embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Wrap each sample record as a LangChain Document, keeping its id as metadata.
docs = [
    Document(page_content=record["text"], metadata={"id": record["id"]})
    for record in data
]

# Pass stable ids taken from the records. Without explicit ids the store
# generates fresh ones on every call, so each re-run of this cell would append
# another copy of every document to the persisted collection and the retriever
# would start returning duplicates.
vectorstore = LC_Chroma.from_documents(
    documents=docs,
    embedding=local_embedding,
    ids=[record["id"] for record in data],
    persist_directory=PERSIST_DIR,
    collection_name="example_collection",
)

  local_embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import getpass
import os
from langchain_ollama import ChatOllama
# from langchain.llms import LlamaCpp
from langchain_openai import ChatOpenAI

# Create a retriever over the vector store (adjust "k", the number of
# retrieved documents, as needed).
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Which chat model to use: one of "deepseek", "llama2", or "openai".
MODEL_CHOICE = "openai"

# Example setups for each supported option.
if MODEL_CHOICE == "deepseek":
    # DeepSeek served through Ollama.
    llm = ChatOllama(model="deepseek-r1", temperature=0.0)
elif MODEL_CHOICE == "llama2":
    # Llama 2 served through Ollama.
    llm = ChatOllama(model="llama2", temperature=0.0)
elif MODEL_CHOICE == "openai":
    # Prompt for the key only when it is not already in the environment.
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

    llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        # api_key="...",  # if you prefer to pass the api key in directly instead of using env vars
        # base_url="...",
        # organization="...",
        # other params...
    )
else:
    # The message lists exactly the branches handled above (it previously
    # advertised an unsupported 'lightweight' option and omitted 'llama2').
    raise ValueError("Unsupported MODEL_CHOICE. Choose from 'deepseek', 'llama2', or 'openai'.")


# LangChain's RetrievalQA chain retrieves relevant documents through the
# retriever and stuffs them into a prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" simply concatenates the retrieved documents; for long contexts consider "map_reduce"
    retriever=retriever,
)


# Run one example question through the chain and print the raw response.
if __name__ == "__main__":
    query = "What is deep learning and semantic search?"
    answer = qa_chain.invoke(query)
    print("Query:", query)
    print("Answer:", answer)


Query: What is deep learning and semantic search?
Answer: {'query': 'What is deep learning and semantic search?', 'result': 'Deep learning is a subset of machine learning that involves neural networks with many layers (hence "deep") to model complex patterns in data. It has achieved remarkable results in fields like computer vision, natural language processing, and speech recognition by automatically learning representations from large amounts of data.\n\nSemantic search, on the other hand, refers to search techniques that aim to improve search accuracy by understanding the meaning and context of search queries rather than relying solely on keyword matching. It often involves the use of vector embeddings, which help computers understand semantic relationships between words, allowing for more relevant and context-aware search results.'}
