##### Datasets
1. Insurance
2. Health
3. Legal
4. Finance

#### ChromaDB

In [2]:
import os
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
# If you plan to use a HuggingFace local model, import the relevant embedding function.
# from chromadb.utils.embedding_functions import HuggingFaceEmbeddingFunction

# Load environment variables from the secrets.env file.
load_dotenv("secrets.env")

# Retrieve API keys from environment variables.
openai_api_key = os.getenv("OPENAI_API_KEY")
huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Choose your embedding function.
# In this example, we default to OpenAI. If you want to use a HuggingFace model,
# you could add logic here to choose based on a configuration variable.
embedding_function = OpenAIEmbeddingFunction(
    api_key=openai_api_key,  # Uses the API key from secrets.env
    model_name="text-embedding-ada-002"  # You can change this to any supported model.
)

# Instantiate a Chroma client.
client = chromadb.Client(Settings(
    chroma_api_impl="rest",
    chroma_server_host="localhost",
    chroma_server_http_port="8000"
))

# Create or retrieve a collection with the specified embedding function.
collection = client.get_or_create_collection(
    name="example_collection",
    embedding_function=embedding_function
)

# Define some example documents along with optional IDs and metadata.
documents = [
    "Machine learning is a field of artificial intelligence that uses statistical techniques to give computers the ability to learn.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers.",
    "Natural Language Processing involves the interaction between computers and human language."
]
doc_ids = ["doc1", "doc2", "doc3"]
metadatas = [
    {"category": "AI"},
    {"category": "ML"},
    {"category": "NLP"}
]

# Add the documents to the collection. The embedding function automatically creates embeddings.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=doc_ids
)

# Define a query to search for relevant documents.
query_text = "What is deep learning?"
results = collection.query(
    query_texts=[query_text],
    n_results=2  # Number of top results to return.
)

# Print out the query results.
print("Query Results:")
print(results)

ValueError: Unsupported Chroma API implementation rest

##### PineconeDB

In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Load environment variables from the secrets.env file.
load_dotenv("secrets.env")

# Retrieve API keys from environment variables.
# openai_api_key = os.getenv("OPENAI_API_KEY")
# huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(pinecone_index_name)

In [None]:
index.upsert(
    vectors=[
        {
            "id": "vec1", 
            "values": [1.0, 1.5], 
            "metadata": {"genre": "drama"}
        }, {
            "id": "vec2",
            "values": [2.0, 1.0], 
            "metadata": {"genre": "action"}
        }, {
            "id": "vec3",
            "values": [0.1, 0.3], 
            "metadata": {"genre": "drama"}
        }, {
            "id": "vec4", 
            "values": [1.0, -2.5], 
            "metadata": {"genre": "action"}
        }
    ],
    namespace= "ns1"
)

In [None]:
response = index.query(
    namespace="ns1",
    vector=[0.1, 0.3],
    top_k=2,
    include_values=True,
    include_metadata=True,
    filter={"genre": {"$eq": "action"}}
)
    
print(response)

#### PGVector

In [None]:
import psycopg
from pgvector.psycopg import register_vector
from sentence_transformers import SentenceTransformer

conn = psycopg.connect(dbname='pgvector_example', autocommit=True)

conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)

conn.execute('DROP TABLE IF EXISTS documents')
conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))')

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

input = [
    'The dog is barking',
    'The cat is purring',
    'The bear is growling'
]
embeddings = model.encode(input)
for content, embedding in zip(input, embeddings):
    conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding))

query = 'forest'
query_embedding = model.encode(query)
result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (query_embedding,)).fetchall()
for row in result:
    print(row[0])

#### Azure CosmoDB

#### Azure Databricks