In [2]:
import chromadb

Chroma Client

In [3]:
# Create the default Chroma client. A later section of this notebook contrasts it
# with chromadb.PersistentClient, which is given an on-disk path.
client = chromadb.Client()

In [4]:
# Create the "news" collection on the default client. The bare name on the last
# line makes the notebook display the collection's repr.
collection = client.create_collection(
    name="news",
)
collection

Collection(name=news)

In [5]:
# Store two sentences that share the word "Apple" but differ in meaning
# (company vs. fruit). No embeddings are supplied, so they are computed by the
# collection's embedding function when the documents are added.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "Apple reported its quarterly earnings today",
        "Apple has a lot of vitamin A",
    ],
)

/Users/swathiradhakrishnan/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:14<00:00, 5.71MiB/s]


Querying Collection

In [6]:
# Technology-flavoured query: ask for both stored documents, ordered by distance.
collection.query(query_texts=["New iphone will launch in september"], n_results=2)

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['Apple reported its quarterly earnings today',
   'Apple has a lot of vitamin A']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.4136261940002441, 1.7049864530563354]]}

In [7]:
# Food/health-flavoured query against the same two documents, to compare the
# ranking with the technology-flavoured query.
collection.query(query_texts=["Doctor suggested my mother to eat citric fruits"], n_results=2)

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['Apple has a lot of vitamin A',
   'Apple reported its quarterly earnings today']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.3456089496612549, 1.7572938203811646]]}

In [9]:
# Inspect the raw embedding vector stored for the first record returned by peek().
collection.peek()['embeddings'][0]

array([ 3.43889669e-02,  4.53302488e-02,  3.35943699e-02,  3.27745005e-02,
       -1.51062908e-03, -5.00667058e-02,  1.25939725e-02, -4.37261127e-02,
        4.92361709e-02,  4.62551340e-02,  6.32305741e-02,  2.45131515e-02,
       -4.13523689e-02,  1.53688043e-02, -1.27223106e-02, -1.66666750e-02,
       -9.36637912e-03, -4.49326150e-02,  1.57926995e-02, -1.62688289e-02,
       -4.21716422e-02,  4.30162717e-03,  6.12715185e-02,  1.32648405e-02,
        1.54016256e-01,  4.45452258e-02, -7.05872988e-03,  1.30078550e-02,
       -3.38342972e-02, -4.22298610e-02, -1.04623906e-01,  1.58267934e-02,
        8.00960362e-02,  2.73248460e-02,  9.12597054e-04, -7.03276172e-02,
        4.54225987e-02, -3.57186049e-02,  2.72022141e-03, -1.96272656e-02,
       -4.00674120e-02,  1.31088505e-02,  1.54401241e-02,  6.17499985e-02,
       -7.88003877e-02, -3.99145000e-02,  8.68621934e-03,  1.88715551e-02,
        2.48146299e-02,  1.28597453e-01,  1.90437119e-03, -4.67963777e-02,
        7.34508559e-02, -

Persistent Client

In [10]:
# Persistent client rooted at the relative path ./news_db.
client_p = chromadb.PersistentClient(path='./news_db')

Embedding functions (Default: all-MiniLM-L6-v2)

In [11]:
from chromadb.utils import embedding_functions

# Replace the default embedding model with a SentenceTransformers model.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-distilroberta-v1"
)

# client_p persists collections under ./news_db, so a plain create_collection()
# fails the second time the notebook is run because "news_vector" already exists.
# get_or_create_collection() keeps this cell re-runnable (Restart & Run All).
collection = client_p.get_or_create_collection(
    name='news_vector',
    embedding_function=ef,
)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Add the same two "Apple" sentences to the news_vector collection so results
# can be compared with the earlier collection that used the default embeddings.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "Apple reported its quarterly earnings today",
        "Apple has a lot of vitamin A",
    ],
)

In [18]:
# Repeat the technology-flavoured query, this time against the collection that
# uses the all-distilroberta-v1 embedding function.
collection.query(query_texts=["New iphone will launch in september"], n_results=2)

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['Apple reported its quarterly earnings today',
   'Apple has a lot of vitamin A']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.4540897607803345, 1.6653056144714355]]}

In [19]:
# Liveness check on the persistent client. It returns a large integer, which
# appears to be a nanosecond epoch timestamp (TODO confirm against the Chroma docs).
client_p.heartbeat()

1749031194073652000

Add, Update, Delete

In [20]:
# Separate client handle used for the add / update / delete walkthrough below.
client_r = chromadb.Client()

In [21]:
from datetime import datetime

# Create the product_reviews collection with collection-level metadata:
# a free-text description and the creation time rendered as a string.
collection = client_r.create_collection(
    name="product_reviews",
    metadata={
        "description": "Reviews of products",
        "created": str(datetime.now()),
    },
)

In [22]:
# One row per review: (id, review text, product id, category, rating).
# Keeping the fields side by side makes it harder for the three parallel lists
# below to drift out of alignment.
review_rows = [
    ("doc1", "This phone has an amazing battery life.", "P123", "Electronics", 5),
    ("doc2", "The screen quality is poor and not worth the price.", "P123", "Electronics", 2),
    ("doc3", "Excellent sound clarity and build quality.", "P456", "Audio", 4),
    ("doc4", "Battery drains quickly, not suitable for travel.", "P123", "Electronics", 1),
    ("doc5", "Highly durable and water-resistant smartwatch.", "P789", "Wearables", 5),
    ("doc6", "The audio is muffled and not loud enough.", "P456", "Audio", 2),
    ("doc7", "Camera quality exceeds expectations in low light.", "P321", "Cameras", 4),
    ("doc8", "Display is vibrant, great for watching movies.", "P123", "Electronics", 4),
]

# Parallel lists in the shape collection.add() expects; later cells reuse
# `ids`, `metadatas` and `documents` by name.
documents = [text for _, text, _, _, _ in review_rows]

metadatas = [
    {"product_id": product_id, "category": category, "rating": rating}
    for _, _, product_id, category, rating in review_rows
]

ids = [doc_id for doc_id, _, _, _, _ in review_rows]

collection.add(ids=ids, metadatas=metadatas, documents=documents)

In [27]:
# Unfiltered semantic search for the three closest reviews. query_texts is passed
# as a bare string here rather than a list; the result still comes back as
# nested lists, like the list-form queries elsewhere in the notebook.
collection.query(query_texts="sound quality", n_results=3)

{'ids': [['doc3', 'doc6', 'doc2']],
 'embeddings': None,
 'documents': [['Excellent sound clarity and build quality.',
   'The audio is muffled and not loud enough.',
   'The screen quality is poor and not worth the price.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'rating': 4, 'product_id': 'P456', 'category': 'Audio'},
   {'rating': 2, 'category': 'Audio', 'product_id': 'P456'},
   {'category': 'Electronics', 'product_id': 'P123', 'rating': 2}]],
 'distances': [[0.5910329222679138, 1.016019582748413, 1.1162805557250977]]}

In [28]:
# Same search restricted by metadata to category == "Audio". Only two reviews
# carry that category, so fewer than n_results=3 hits come back.
collection.query(query_texts="sound quality", n_results=3, where={"category":"Audio"})

{'ids': [['doc3', 'doc6']],
 'embeddings': None,
 'documents': [['Excellent sound clarity and build quality.',
   'The audio is muffled and not loud enough.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'rating': 4, 'product_id': 'P456', 'category': 'Audio'},
   {'rating': 2, 'product_id': 'P456', 'category': 'Audio'}]],
 'distances': [[0.5910329222679138, 1.016019582748413]]}

In [33]:
# Rewrite doc6's text and metadata. Only category and rating are supplied here;
# the collection.get() output for doc6 further down still shows product_id, so
# metadata keys not listed in the update are kept.
collection.update(
    ids=['doc6'],
    documents=[
        'There is a setting to be done for the louder audio output. Sound quality is great'
    ],
    metadatas=[{"category": "Audio", "rating": 4}],
)

In [34]:
# Look at the stored records (ids, embeddings, documents) after the update.
collection.peek()

{'ids': ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8'],
 'embeddings': array([[-0.0665448 ,  0.0415802 ,  0.02805798, ...,  0.00156581,
         -0.0013664 ,  0.04444058],
        [-0.03157043,  0.00038221,  0.02809887, ..., -0.02083447,
          0.01370098,  0.09151227],
        [-0.04998122, -0.00628815,  0.0270207 , ...,  0.07026031,
         -0.01461971,  0.05089299],
        ...,
        [ 0.06710169, -0.10688053, -0.04933071, ...,  0.08523181,
          0.045284  ,  0.05633426],
        [ 0.03185945,  0.0110309 ,  0.04394806, ..., -0.06210862,
         -0.06594026,  0.09912495],
        [ 0.01330459, -0.0233338 , -0.02343411, ...,  0.0179272 ,
          0.01875084,  0.08142737]], shape=(8, 384)),
 'documents': ['This phone has an amazing battery life.',
  'The screen quality is poor and not worth the price.',
  'Excellent sound clarity and build quality.',
  'Battery drains quickly, not suitable for travel.',
  'Highly durable and water-resistant smartwatch.',


In [35]:
# Fetch doc6 by id to confirm the updated text and metadata.
collection.get(ids='doc6')

{'ids': ['doc6'],
 'embeddings': None,
 'documents': ['There is a setting to be done for the louder audio output. Sound quality is great'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'product_id': 'P456', 'category': 'Audio', 'rating': 4}]}

In [36]:
# Remove two records from the collection by id.
collection.delete(ids=['doc3','doc4'])

In [37]:
# Confirm the deletion: doc3 and doc4 should no longer be listed.
collection.peek()

{'ids': ['doc1', 'doc2', 'doc5', 'doc6', 'doc7', 'doc8'],
 'embeddings': array([[-0.0665448 ,  0.0415802 ,  0.02805798, ...,  0.00156581,
         -0.0013664 ,  0.04444058],
        [-0.03157043,  0.00038221,  0.02809887, ..., -0.02083447,
          0.01370098,  0.09151227],
        [-0.10404313,  0.06058613,  0.11677103, ..., -0.09714488,
          0.0675921 ,  0.14357704],
        [ 0.06710169, -0.10688053, -0.04933071, ...,  0.08523181,
          0.045284  ,  0.05633426],
        [ 0.03185945,  0.0110309 ,  0.04394806, ..., -0.06210862,
         -0.06594026,  0.09912495],
        [ 0.01330459, -0.0233338 , -0.02343411, ...,  0.0179272 ,
          0.01875084,  0.08142737]], shape=(6, 384)),
 'documents': ['This phone has an amazing battery life.',
  'The screen quality is poor and not worth the price.',
  'Highly durable and water-resistant smartwatch.',
  'There is a setting to be done for the louder audio output. Sound quality is great',
  'Camera quality exceeds expectations in lo

In [38]:
# Drop the whole collection through the client handle that created it (client_r).
# The call previously went through `client`, the handle from the first section,
# which never created "product_reviews".
client_r.delete_collection(name='product_reviews')

In [40]:
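# Left disabled on purpose: looking up 'product_reviews' right after deleting it
# presumably raises, which would stop a Restart & Run All (TODO confirm).
# client.get_collection('product_reviews')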

Metadata filtering

In [41]:
# Recreate product_reviews from scratch and reload the original, unmodified
# reviews from the ids / metadatas / documents lists defined in an earlier cell.
collection = client_r.create_collection(
    name="product_reviews",
    metadata={
        "description": "Reviews of products",
        "created": str(datetime.now()),
    },
)
collection.add(documents=documents, metadatas=metadatas, ids=ids)

In [43]:
# $in: keep only reviews whose rating is 1, 2 or 3, then return the single
# closest match to the query text.
collection.query(
    query_texts=['sound quality'],
    where={'rating': {'$in': [1, 2, 3]}},
    n_results=1,
)

{'ids': [['doc6']],
 'embeddings': None,
 'documents': [['The audio is muffled and not loud enough.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'product_id': 'P456', 'category': 'Audio', 'rating': 2}]],
 'distances': [[1.016019582748413]]}

In [44]:
# $nin: the complement of the previous filter. Drop reviews rated 1, 2 or 3
# and return the single closest match among the rest.
collection.query(
    query_texts=['sound quality'],
    where={'rating': {'$nin': [1, 2, 3]}},
    n_results=1,
)

{'ids': [['doc3']],
 'embeddings': None,
 'documents': [['Excellent sound clarity and build quality.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'product_id': 'P456', 'rating': 4, 'category': 'Audio'}]],
 'distances': [[0.5910329222679138]]}

In [46]:
# $gt: numeric comparison on metadata. Only reviews with rating > 3 are
# candidates; return the two closest.
collection.query(
    query_texts=['sound quality'],
    where={'rating': {'$gt': 3}},
    n_results=2,
)

{'ids': [['doc3', 'doc7']],
 'embeddings': None,
 'documents': [['Excellent sound clarity and build quality.',
   'Camera quality exceeds expectations in low light.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'rating': 4, 'product_id': 'P456', 'category': 'Audio'},
   {'rating': 4, 'product_id': 'P321', 'category': 'Cameras'}]],
 'distances': [[0.5910329222679138, 1.339406132698059]]}

Full text search

In [47]:
# where_document filters on the document text itself rather than on metadata:
# only reviews whose text contains "quality" are candidates.
collection.query(
    query_texts=['sound quality'],
    where_document={'$contains': 'quality'},
    n_results=1,
)

{'ids': [['doc3']],
 'embeddings': None,
 'documents': [['Excellent sound clarity and build quality.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'category': 'Audio', 'product_id': 'P456', 'rating': 4}]],
 'distances': [[0.5910329222679138]]}

In [51]:
# Fetch every record but ask only for embeddings; documents and metadatas come
# back as None because they are not listed in `include`.
collection.get(include=['embeddings'])

{'ids': ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8'],
 'embeddings': array([[-0.0665448 ,  0.0415802 ,  0.02805798, ...,  0.00156581,
         -0.0013664 ,  0.04444058],
        [-0.03157043,  0.00038221,  0.02809887, ..., -0.02083447,
          0.01370098,  0.09151227],
        [-0.04998122, -0.00628815,  0.0270207 , ...,  0.07026031,
         -0.01461971,  0.05089299],
        ...,
        [ 0.03927212, -0.04580301, -0.01739653, ...,  0.04727325,
          0.02423911,  0.08448103],
        [ 0.03185945,  0.0110309 ,  0.04394806, ..., -0.06210862,
         -0.06594026,  0.09912495],
        [ 0.01330459, -0.0233338 , -0.02343411, ...,  0.0179272 ,
          0.01875084,  0.08142737]], shape=(8, 384)),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}