## Chromadb

In [18]:
%pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [19]:
import chromadb

client = chromadb.Client()

collection = client.create_collection("test")

Using embedded DuckDB without persistence: data will be transient
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [20]:
collection.add(
    embeddings=[
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
    ],
    metadatas=[
        {"uri": "img1.png", "style": "style1"},
        {"uri": "img2.png", "style": "style2"},
        {"uri": "img3.png", "style": "style1"},
        {"uri": "img4.png", "style": "style1"},
        {"uri": "img5.png", "style": "style1"},
        {"uri": "img6.png", "style": "style1"},
        {"uri": "img7.png", "style": "style1"},
        {"uri": "img8.png", "style": "style1"},
    ],
    documents=["doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7", "doc8"],
    ids=["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
)

query_result = collection.query(
        query_embeddings=[[1.1, 2.3, 3.2], [5.1, 4.3, 2.2]],
        n_results=2,
    )

query_result


{'ids': [['id1', 'id5'], ['id2', 'id4']],
 'embeddings': None,
 'documents': [['doc1', 'doc5'], ['doc2', 'doc4']],
 'metadatas': [[{'uri': 'img1.png', 'style': 'style1'},
   {'uri': 'img5.png', 'style': 'style1'}],
  [{'uri': 'img2.png', 'style': 'style2'},
   {'uri': 'img4.png', 'style': 'style1'}]],
 'distances': [[0.0, 0.0], [11.959999084472656, 11.959999084472656]]}

In [21]:
import chromadb
chroma_client = chromadb.Client()

Using embedded DuckDB without persistence: data will be transient


In [22]:
collection = chroma_client.create_collection(name="my_collection")

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [23]:
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)

collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)

In [24]:
results = collection.query(
    query_texts=["This is a query document"],
    n_results=1
)
results

{'ids': [['id1']],
 'embeddings': None,
 'documents': [['This is a document']],
 'metadatas': [[{'source': 'my_source'}]],
 'distances': [[0.7111217379570007]]}

In [25]:
import chromadb

from chromadb.config import Settings
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=".chromadb" # Optional, defaults to .chromadb/ in the current directory
))


Using embedded DuckDB with persistence: data will be stored in: .chromadb


In [26]:
client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.
client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

True

In [27]:
collection = client.create_collection(name="my_collection")

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [28]:
collection.add(
    documents=["lorem ipsum...", "doc2", "doc3"],
    metadatas=[
        {"chapter": "3", "verse": "16"},
        {"chapter": "3", "verse": "5"}, 
        {"chapter": "29", "verse": "11"}],
    ids=["id1", "id2", "id3"]
)



In [29]:
collection.query(
    query_texts=["This is a query document"],
    n_results=1
)


{'ids': [['id3']],
 'embeddings': None,
 'documents': [['doc3']],
 'metadatas': [[{'chapter': '29', 'verse': '11'}]],
 'distances': [[1.2974470853805542]]}

In [30]:
import chromadb

from chromadb.config import Settings

client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=".chromadb" # Optional, defaults to .chromadb/ in the current directory
))

ollection = client.create_collection(name="my_collection")

collection.query(
    query_texts=["This is a query document"],
    n_results=1
)


Using embedded DuckDB with persistence: data will be stored in: .chromadb
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


{'ids': [['id3']],
 'embeddings': None,
 'documents': [['doc3']],
 'metadatas': [[{'chapter': '29', 'verse': '11'}]],
 'distances': [[1.2974470853805542]]}

In [38]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(chroma_api_impl="rest",
                                  chroma_server_host="localhost",
                                  chroma_server_http_port="8000"))



In [40]:
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)


In [41]:
collection = client.get_collection(name="my_collection")

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [42]:
collection.count()

2

In [47]:
collection.get(
    include=["documents"]
)


{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This is a document', 'This is another document'],
 'metadatas': None}