In [1]:
# Load libraries
import os

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

from scripts.data_loader import load_medicare_data
# If you plan to use a HuggingFace local model, import the relevant embedding function.
# from chromadb.utils.embedding_functions import HuggingFaceEmbeddingFunction

In [2]:
# Environment variables

# Tunable settings for this cell, named so they are easy to find and change.
ENV_FILE = "secrets.env"  # dotenv file the API keys are read from
MAX_ROWS = 1000           # row cap passed to .head() on the loaded dataset

# Load environment variables from the secrets file into the process environment.
load_dotenv(ENV_FILE)

# Retrieve API keys from environment variables.
# os.getenv returns None when a variable is not set, so either key may be None.
openai_api_key = os.getenv("OPENAI_API_KEY")
huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Load dataset, truncated via .head(MAX_ROWS).
# NOTE(review): presumably a pandas DataFrame capped to keep experiments fast — confirm.
data = load_medicare_data().head(MAX_ROWS)

In [7]:
# Embedding Function

# Alternative (hosted, currently disabled): OpenAI embeddings.
# embedding_function = OpenAIEmbeddingFunction(
#     api_key=openai_api_key,  # Uses the API key from secrets.env
#     model_name="text-embedding-ada-002"  # You can change this to any supported model.
# )

# Alternative (local, currently disabled): a sentence-transformers model.
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L12-v2")

# Active choice: Chroma's default embedding function.
# `embedding_functions` is imported with the other libraries in the first cell,
# so this cell no longer carries its own import.
default_ef = embedding_functions.DefaultEmbeddingFunction()


In [8]:
# Instantiate a Chroma client with default settings.
client = chromadb.Client(Settings())

# Create or retrieve a collection with the specified embedding function.
# Because the collection may already exist (e.g. when this cell is re-run in the
# same kernel), the write below must be safe to repeat.
collection = client.get_or_create_collection(
    name="example_collection",
    embedding_function=default_ef
)

# Define some example documents along with optional IDs and metadata.
documents = [
    "Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers.",
    "Natural Language Processing involves the interaction between computers and human language."
]
doc_ids = ["doc1", "doc2", "doc3"]
metadatas = [
    {"category": "AI"},
    {"category": "ML"},
    {"category": "NLP"}
]

# The three lists are passed side by side, one entry per document, so fail fast
# if they ever drift out of sync while editing the examples.
assert len(documents) == len(doc_ids) == len(metadatas), (
    "documents, doc_ids and metadatas must have the same length"
)

# Upsert (instead of add) so re-running this cell is idempotent: existing IDs are
# updated in place and new IDs are inserted. The embedding function attached to
# the collection creates the embeddings automatically.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=doc_ids
)

# Define a query to search for relevant documents.
query_text = "What is deep learning?"
results = collection.query(
    query_texts=[query_text],
    n_results=2  # Number of top results to return.
)

# Print out the query results.
print("Query Results:")
print(results)

C:\Users\educa\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 40.9MiB/s]


Query Results:
{'ids': [['doc2', 'doc1']], 'embeddings': None, 'documents': [['Deep learning is a subset of machine learning that uses neural networks with many layers.', 'Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.']], 'uris': None, 'data': None, 'metadatas': [[{'category': 'ML'}, {'category': 'AI'}]], 'distances': [[0.3444952964782715, 0.961942732334137]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
