In [3]:
import chromadb
from chromadb.utils import embedding_functions

In [4]:
# Embedding function built on the sentence-transformers model named below;
# a later cell passes it to the collection through its configuration.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


### **Chroma Fields — Short Description, Function & Example**

`ids` → unique identifier (function: locate each record)
example: ["u2", "u4"]

`documents` → stored text (function: used for embeddings & search)
example: ["ingin dekat transportasi umum", "butuh unit baru full furnished"]

`metadatas` (optional) → structured fields (function: enable filtering)
example:
{"city": "Jakarta", "budget": 1200, "name": "Budi"}

`embeddings` → numeric vector for each document (function: similarity matching; returned only when explicitly requested)
example: None (or a list of floating-point numbers if included)

`uris` → external document location (function: used only if documents are stored elsewhere)
example: None

`included` → fields returned in output (function: shows which components are included)
example: ["metadatas", "documents"]

`data` → extra payload (function: rarely used)
example: None

In [5]:
# Create a Collection
#
# get_or_create_collection keeps this cell re-runnable: the in-memory client
# holds on to its collections for the life of the kernel, so calling
# create_collection a second time with the same name raises because
# "filter_demo" already exists.
# NOTE: when the collection already exists it is returned with whatever
# records it currently holds -- restart the kernel for a clean slate.

client = chromadb.Client()
collection = client.get_or_create_collection(
    name="filter_demo",
    metadata={"description": "Used to demo filtering in ChromaDB"},
    configuration={
        # Embedding function used for this collection's documents.
        "embedding_function": ef
    }
)
print(f"Collection created: {collection.name}")

Collection created: filter_demo


In [6]:
# Adding Documents to Collections

# One tuple per record: (id, document text, metadata). Keeping the three
# values side by side makes it obvious which metadata belongs to which text;
# the parallel lists that collection.add() takes are derived below.
framework_records = [
    ("id1", "This is a document about LangChain", {"source": "langchain.com", "version": 0.1}),
    ("id2", "This is a reading about LlamaIndex", {"source": "llamaindex.ai", "version": 0.2}),
    ("id3", "This is a book about Python", {"source": "python.org", "version": 0.3}),
    ("id4", "This is a document about pandas", {"source": "pandas.pydata.org", "version": 0.4}),
    ("id5", "This is another document about LangChain", {"source": "langchain.com", "version": 0.5}),
]
framework_ids, framework_texts, framework_metadatas = (
    list(column) for column in zip(*framework_records)
)

collection.add(
    ids=framework_ids,
    documents=framework_texts,
    metadatas=framework_metadatas,
)

### **Metadata filtering**

Used to filter records based on structured fields stored in the `metadata` object.  
This works like a SQL `WHERE` clause and supports comparisons such as equality, inequality, greater/less than, and list membership; several conditions can be combined with `$and` / `$or`.  
It is ideal for numeric values, categories, or exact text fields (e.g., city, price, status, role).

`$eq` (equal to) → `{"city": {"$eq": "Jakarta"}}`

`$ne` (not equal to) → `{"city": {"$ne": "Jakarta"}}`

`$gt` (greater than) → `{"budget": {"$gt": 1000}}`

`$gte` (greater than or equal to) → `{"budget": {"$gte": 1000}}`

`$lt` (less than) → `{"budget": {"$lt": 800}}`

`$lte` (less than or equal to) → `{"budget": {"$lte": 800}}`

`$in` (value exists in list) → `{"city": {"$in": ["Jakarta", "Bandung"]}}`

`$nin` (value not in list) → `{"city": {"$nin": ["Jakarta", "Bandung"]}}`


In [9]:
# Filter using Metadata

# Keep only the records whose "source" metadata equals "langchain.com".
source_filter = {"source": {"$eq": "langchain.com"}}
collection.get(where=source_filter)

{'ids': ['id1', 'id5'],
 'embeddings': None,
 'documents': ['This is a document about LangChain',
  'This is another document about LangChain'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'langchain.com', 'version': 0.1},
  {'version': 0.5, 'source': 'langchain.com'}]}

In [11]:
# Records whose "version" metadata is strictly below 0.3.
version_filter = {"version": {"$lt": 0.3}}
collection.get(where=version_filter)

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This is a document about LangChain',
  'This is a reading about LlamaIndex'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'langchain.com', 'version': 0.1},
  {'source': 'llamaindex.ai', 'version': 0.2}]}

In [12]:
# Both conditions must hold: source is langchain.com AND version < 0.3.
from_langchain = {"source": {"$eq": "langchain.com"}}
below_version_03 = {"version": {"$lt": 0.3}}
collection.get(where={"$and": [from_langchain, below_version_03]})

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['This is a document about LangChain'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'version': 0.1, 'source': 'langchain.com'}]}

### **Document filtering**

Used to filter records based on the contents of the text stored in the `documents` field.  
This behaves like a simple full-text search: it checks whether a keyword or phrase occurs in the document text (matching is case-sensitive, so `"jakarta"` and `"Jakarta"` are different keywords).  
It supports operators for text matching and logical combination of multiple keyword searches.

`$contains` (text contains keyword) → `{"$contains": "apartemen"}`

`$not_contains` (text does NOT contain keyword) → `{"$not_contains": "furnished"}`

`$and` (combine multiple document filters — all must match) →  
`{"$and": [ {"$contains": "jakarta"}, {"$contains": "murah"} ]}`

`$or` (combine multiple document filters — at least one must match) →  
`{"$or": [ {"$contains": "furnished"}, {"$contains": "dekat kantor"} ]}`

In [13]:
# Filter using Document Content

# Match on the stored text itself rather than on metadata.
contains_pandas = {"$contains": "pandas"}
collection.get(where_document=contains_pandas)

{'ids': ['id4'],
 'embeddings': None,
 'documents': ['This is a document about pandas'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'pandas.pydata.org', 'version': 0.4}]}

In [15]:
# Records whose text does not include the string "document".
without_document = {"$not_contains": "document"}
collection.get(where_document=without_document)

{'ids': ['id2', 'id3'],
 'embeddings': None,
 'documents': ['This is a reading about LlamaIndex',
  'This is a book about Python'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'llamaindex.ai', 'version': 0.2},
  {'version': 0.3, 'source': 'python.org'}]}

In [16]:
# Combine Metadata and Document Content Filters

# Metadata side: version strictly greater than 0.1.
newer_than_01 = {"version": {"$gt": 0.1}}
# Document side: text mentions any one of the listed keywords.
any_keyword = {"$or": [{"$contains": keyword} for keyword in ("LangChain", "Python")]}

collection.get(where=newer_than_01, where_document=any_keyword)

{'ids': ['id3', 'id5'],
 'embeddings': None,
 'documents': ['This is a book about Python',
  'This is another document about LangChain'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'version': 0.3, 'source': 'python.org'},
  {'version': 0.5, 'source': 'langchain.com'}]}

In [17]:
# Second batch of records (ids u1-u4) carrying name / city / budget / needs
# metadata. Each tuple is (id, document text, metadata); the parallel lists
# for collection.add() are pulled out of it column by column.
customer_records = [
    ("u1", "suka lingkungan tenang dan aman",
     {"name": "Anton", "city": "Surabaya", "budget": 800, "needs": "rumah cluster"}),
    ("u2", "ingin dekat transportasi umum",
     {"name": "Budi", "city": "Jakarta", "budget": 1200, "needs": "apartemen dekat kantor"}),
    ("u3", "inginnya dekat sekolah",
     {"name": "Citra", "city": "Bandung", "budget": 600, "needs": "rumah minimalis"}),
    ("u4", "butuh unit baru full furnished",
     {"name": "Dewi", "city": "Jakarta", "budget": 1500, "needs": "apartemen"}),
]

collection.add(
    ids=[record[0] for record in customer_records],
    documents=[record[1] for record in customer_records],
    metadatas=[record[2] for record in customer_records],
)


In [18]:
# A bare value is the short form of an equality match on that metadata key.
in_jakarta = {"city": "Jakarta"}
collection.get(where=in_jakarta)

{'ids': ['u2', 'u4'],
 'embeddings': None,
 'documents': ['ingin dekat transportasi umum',
  'butuh unit baru full furnished'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'city': 'Jakarta',
   'needs': 'apartemen dekat kantor',
   'budget': 1200,
   'name': 'Budi'},
  {'budget': 1500, 'city': 'Jakarta', 'name': 'Dewi', 'needs': 'apartemen'}]}

In [19]:
# Budget between 600 and 1200 (both ends inclusive), restricted to records
# whose text contains "suka".
budget_at_least_600 = {"budget": {"$gte": 600}}
budget_at_most_1200 = {"budget": {"$lte": 1200}}

collection.get(
    where={"$and": [budget_at_least_600, budget_at_most_1200]},
    where_document={"$contains": "suka"},
)

{'ids': ['u1'],
 'embeddings': None,
 'documents': ['suka lingkungan tenang dan aman'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'budget': 800,
   'city': 'Surabaya',
   'needs': 'rumah cluster',
   'name': 'Anton'}]}