### ChromaDB

In [1]:
import chromadb

#### Embeddings function

In [2]:
from chromadb.utils import embedding_functions
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [3]:
default_ef

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x7f4a606b0ac0>

In [4]:
len(default_ef(["foo"])[0])

384

#### ChromaDB Clients

In [6]:
# conenctando a um servidor
client = chromadb.HttpClient(host='127.0.0.1', port=8000)
# conectando a um db em arquivo
# client = chromadb.PersistentClient(path="./db")

In [7]:
# testando a conexão
client.heartbeat()

1719577482752217737

#### Collections

In [9]:
collection = client.get_or_create_collection(name="my_collection")

In [10]:
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ],
    ids=["id1", "id2"]
)

In [11]:
results = collection.query(
    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
print(results)

{'ids': [['id1', 'id3']], 'distances': [[1.5565170243896997, 1.5629557393651845]], 'embeddings': None, 'metadatas': [[{'chapter': '3', 'verse': '16'}, {'chapter': '29', 'verse': '11'}]], 'documents': [['doc1', 'doc3']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [12]:
collection.peek() # returns a list of the first 10 items in the collection

{'ids': ['id1', 'id2', 'id3'],
 'embeddings': [[-0.08946740627288818,
   0.026901664212346077,
   0.0714292973279953,
   0.0018527907086536288,
   0.028905678540468216,
   -0.07897716760635376,
   0.056619081646203995,
   0.1030322015285492,
   0.0002788240963127464,
   -0.007082213182002306,
   -0.026516541838645935,
   0.00714935502037406,
   0.029475053772330284,
   0.004517349414527416,
   -0.04392479360103607,
   0.0019748948980122805,
   -0.07796674966812134,
   -0.030047820881009102,
   -0.03155547007918358,
   0.06856383383274078,
   -0.07460123300552368,
   0.12591703236103058,
   -0.03232401981949806,
   -0.014193093404173851,
   0.017825551331043243,
   -0.008171051740646362,
   -0.09954645484685898,
   -0.0004128795408178121,
   -0.03543565049767494,
   -0.09052635729312897,
   0.05686790868639946,
   0.037192098796367645,
   0.03478348255157471,
   0.029278822243213654,
   0.002904640743508935,
   0.002846348797902465,
   0.049111392349004745,
   0.007007570005953312,
   -

In [13]:
collection.count() # returns the number of items in the collection

3

In [14]:
collection.modify(name="my_collection2") # Rename the collection

In [15]:
collection = client.create_collection(
        name="my_collection",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )

In [16]:
collection.add(
    documents=["doc1", "doc2", "doc3"],
    embeddings=default_ef(["doc1", "doc2", "doc3"]),
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    ids=["id1", "id2", "id3"]
)

In [28]:
collection.query(
    query_texts=["doc1"],
    n_results=1,
    #query_embeddings=default_ef(["doc1"]),
    #where={"verse": "16"}, # pesquisa nos metadatas
    #where_document={"$contains":"doc2"} # procura nos documentos
)

{'ids': [['id1']],
 'distances': [[-2.220446049250313e-16]],
 'embeddings': None,
 'metadatas': [[{'chapter': '3', 'verse': '16'}]],
 'documents': [['doc1']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [45]:
collections = client.list_collections()

for c in collections:
    print("-----------------")
    print("id:", c.id)
    print("name:", c.name)
    print("db:", c.database)
    print("tenant:", c.tenant)
    print("model:", c.get_model())
    print("-----------------")
    print('\n')


-----------------
id: 122d5ca2-1ca7-460c-962e-1a79665b7750
name: my_collection2
db: default_database
tenant: default_tenant
model: {'id': '122d5ca2-1ca7-460c-962e-1a79665b7750', 'name': 'my_collection2', 'metadata': None, 'dimension': 384, 'tenant': 'default_tenant', 'database': 'default_database', 'version': 0}
-----------------


-----------------
id: 6319326c-98e3-429b-b104-de21ec077fb1
name: my_collection
db: default_database
tenant: default_tenant
model: {'id': '6319326c-98e3-429b-b104-de21ec077fb1', 'name': 'my_collection', 'metadata': {'hnsw:space': 'cosine'}, 'dimension': 384, 'tenant': 'default_tenant', 'database': 'default_database', 'version': 0}
-----------------




In [47]:
client.delete_collection(name = "my_collection")
client.delete_collection(name = "my_collection2")

In [46]:
# para usar isso tem que setar allow_reset p/ true em settings
# client.reset()

#### (Opcional) Semantic Search Models

In [49]:
# VAI BAIXAR VÁRIAS DEPENDENCIA (torch, nvidia, ...)
# não esta disponível no conda-forge e default channels
!pip install sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence_transformers)
  Downloading transformers-4.42.2-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m166.2 kB/s[0m eta [36m0:00:00[0m1m397.1 kB/s[0m eta [36m0:00:01[0m
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.3.1-cp39-cp39-manylinux1_x86_64.whl.metadata (26 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m274.2 kB/s[0m eta [36m0:00:00[0m31m2.3 MB/s[0m

In [63]:
# https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [66]:
# ver https://docs.trychroma.com/guides/embeddings

query_embedding = sentence_transformer_ef._model.encode("How big is London")
passage_embeddings = sentence_transformer_ef._model.encode([
    "London is known for its finacial district",
    "London has 9,787,426 inhabitants at the 2011 census",
    "The United Kingdom is the fourth largest exporter of goods in the world",
])

similarity = sentence_transformer_ef._model.similarity(query_embedding, passage_embeddings)
# => tensor([[0.4659, 0.6142, 0.2697]])
similarity

tensor([[0.4823, 0.6286, 0.2375]])

In [71]:
# TODO entender o método embed_with_retries e se está correto trabalhar dessa forma p/ fazer busca semantica
query_embedding = sentence_transformer_ef.embed_with_retries("How big is London")
passage_embeddings = sentence_transformer_ef.embed_with_retries([
    "London is known for its finacial district",
    "London has 9,787,426 inhabitants at the 2011 census",
    "The United Kingdom is the fourth largest exporter of goods in the world",
])

similarity = sentence_transformer_ef._model.similarity(query_embedding, passage_embeddings)
similarity # TODO entender a diferença em relação ao resultado acima

tensor([[-0.0045, -0.0334,  0.0374],
        [-0.0124, -0.0082,  0.0014],
        [-0.0010, -0.0470,  0.1020],
        [-0.0114, -0.0378, -0.0064],
        [-0.0154, -0.0400,  0.0501],
        [-0.0245, -0.0373,  0.0172],
        [ 0.0100, -0.0560,  0.0703],
        [-0.0114, -0.0378, -0.0064],
        [-0.0245, -0.0373,  0.0172],
        [ 0.0575,  0.0319,  0.0429],
        [-0.0114, -0.0378, -0.0064],
        [ 0.1327,  0.1027,  0.0205],
        [-0.0124, -0.0082,  0.0014],
        [ 0.0038,  0.0446,  0.1053],
        [-0.0101, -0.0673,  0.0584],
        [-0.0124, -0.0082,  0.0014],
        [ 0.0038,  0.0446,  0.1053]])