Skip to content

Commit 1d70abf

Browse files
authored
feat: add tags filtering and q description fix for list documents API (#468)
* feat: add Pydantic AI integration to CI, release pipeline, and docs
  - Add test-pydantic-ai-integration job to CI (test.yml)
  - Add build, publish, and artifact steps to release workflow (release.yml)
  - Add hindsight-integrations/pydantic-ai to release.sh version bumping
  - Add Pydantic AI documentation page (sdks/integrations/pydantic-ai.md)
  - Add Pydantic AI entry to sidebar with icon
* docs: remove Requirements section from pydantic-ai integration page
* feat: add tags filtering and fix offset pagination docs for list documents API
  - Add `tags` and `tags_match` query params to GET /banks/{bank_id}/documents
  - Supports any, all, any_strict, all_strict matching modes (default: any_strict)
  - Fix `q` param description — it's a case-insensitive substring match on document ID only
  - Add tests for offset pagination and all tags_match modes
  - Regenerate OpenAPI spec and Python/TypeScript/Go clients
  - Document the new filtering options in docs/developer/api/documents.mdx
* fix(cli): pass new tags/tags_match args to list_documents
1 parent ecf609c commit 1d70abf

File tree

13 files changed

+490
-17
lines changed

13 files changed

+490
-17
lines changed

hindsight-api/hindsight_api/api/http.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3064,7 +3064,13 @@ async def api_delete_directive(
30643064
)
30653065
async def api_list_documents(
30663066
bank_id: str,
3067-
q: str | None = None,
3067+
q: str | None = Query(
3068+
None, description="Case-insensitive substring filter on document ID (e.g. 'report' matches 'report-2024')"
3069+
),
3070+
tags: list[str] | None = Query(None, description="Filter documents by tags"),
3071+
tags_match: str = Query(
3072+
"any_strict", description="How to match tags: 'any', 'all', 'any_strict', 'all_strict'"
3073+
),
30683074
limit: int = 100,
30693075
offset: int = 0,
30703076
request_context: RequestContext = Depends(get_request_context),
@@ -3074,13 +3080,21 @@ async def api_list_documents(
30743080
30753081
Args:
30763082
bank_id: Memory Bank ID (from path)
3077-
q: Search query (searches document ID and metadata)
3083+
q: Case-insensitive substring filter on document ID
3084+
tags: Filter documents by tags
3085+
tags_match: How to match tags (any, all, any_strict, all_strict)
30783086
limit: Maximum number of results (default: 100)
30793087
offset: Offset for pagination (default: 0)
30803088
"""
30813089
try:
30823090
data = await app.state.memory.list_documents(
3083-
bank_id=bank_id, search_query=q, limit=limit, offset=offset, request_context=request_context
3091+
bank_id=bank_id,
3092+
search_query=q,
3093+
tags=tags,
3094+
tags_match=tags_match,
3095+
limit=limit,
3096+
offset=offset,
3097+
request_context=request_context,
30843098
)
30853099
return data
30863100
except OperationValidationError as e:

hindsight-api/hindsight_api/engine/interface.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
if TYPE_CHECKING:
1313
from hindsight_api.engine.memory_engine import Budget
1414
from hindsight_api.engine.response_models import RecallResult, ReflectResult
15+
from hindsight_api.engine.search.tags import TagsMatch
1516
from hindsight_api.models import RequestContext
1617

1718

@@ -337,6 +338,8 @@ async def list_documents(
337338
bank_id: str,
338339
*,
339340
search_query: str | None = None,
341+
tags: list[str] | None = None,
342+
tags_match: "TagsMatch" = "any_strict",
340343
limit: int = 100,
341344
offset: int = 0,
342345
request_context: "RequestContext",
@@ -346,7 +349,9 @@ async def list_documents(
346349
347350
Args:
348351
bank_id: The memory bank ID.
349-
search_query: Search query.
352+
search_query: Case-insensitive substring filter on document ID.
353+
tags: Filter by tags.
354+
tags_match: How to match tags (any, all, any_strict, all_strict).
350355
limit: Maximum results.
351356
offset: Pagination offset.
352357
request_context: Request context for authentication.

hindsight-api/hindsight_api/engine/memory_engine.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def validate_sql_schema(sql: str) -> None:
184184
from .retain.types import RetainContentDict
185185
from .search import think_utils
186186
from .search.reranking import CrossEncoderReranker
187-
from .search.tags import TagsMatch
187+
from .search.tags import TagsMatch, build_tags_where_clause
188188
from .task_backend import BrokerTaskBackend, SyncTaskBackend, TaskBackend
189189

190190

@@ -4142,6 +4142,8 @@ async def list_documents(
41424142
bank_id: str,
41434143
*,
41444144
search_query: str | None = None,
4145+
tags: list[str] | None = None,
4146+
tags_match: "TagsMatch" = "any_strict",
41454147
limit: int = 100,
41464148
offset: int = 0,
41474149
request_context: "RequestContext",
@@ -4152,6 +4154,8 @@ async def list_documents(
41524154
Args:
41534155
bank_id: bank ID (required)
41544156
search_query: Search in document ID
4157+
tags: Filter by tags
4158+
tags_match: How to match tags (any, all, any_strict, all_strict)
41554159
limit: Maximum number of results
41564160
offset: Offset for pagination
41574161
request_context: Request context for authentication.
@@ -4182,7 +4186,16 @@ async def list_documents(
41824186
query_conditions.append(f"id ILIKE ${param_count}")
41834187
query_params.append(f"%{search_query}%")
41844188

4189+
tags_clause, tags_params, next_param = build_tags_where_clause(
4190+
tags, param_offset=param_count + 1, match=tags_match
4191+
)
4192+
query_params.extend(tags_params)
4193+
param_count = next_param - 1 # next_param is next available; convert to last used
4194+
41854195
where_clause = "WHERE " + " AND ".join(query_conditions) if query_conditions else ""
4196+
if tags_clause:
4197+
# tags_clause starts with "AND", append after WHERE conditions
4198+
where_clause = where_clause + " " + tags_clause if where_clause else "WHERE " + tags_clause[4:].lstrip()
41864199

41874200
# Get total count
41884201
count_query = f"""
@@ -6038,8 +6051,6 @@ async def list_directives(
60386051

60396052
async with acquire_with_retry(pool) as conn:
60406053
# Build filters
6041-
from .search.tags import build_tags_where_clause
6042-
60436054
filters = ["bank_id = $1"]
60446055
params: list[Any] = [bank_id]
60456056
param_idx = 2
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
"""
2+
Tests for list_documents pagination and tags filtering.
3+
"""
4+
from datetime import datetime, timezone
5+
6+
import pytest
7+
8+
9+
async def _retain_doc(memory, bank_id, document_id, tags, request_context):
    """Store a single document in ``bank_id`` under ``document_id`` with optional tags.

    The content is deliberately meaningless text: the intent is to avoid LLM fact
    extraction, relying on documents being persisted even when no facts are produced.
    A falsy ``tags`` value (e.g. an empty list) is forwarded as ``None``.
    """
    payload = {"content": f"xyzabc123 !@# $$$ {document_id}"}
    document_tags = tags if tags else None
    await memory.retain_batch_async(
        bank_id=bank_id,
        contents=[payload],
        document_id=document_id,
        document_tags=document_tags,
        request_context=request_context,
    )
19+
20+
21+
@pytest.mark.asyncio
async def test_list_documents_offset_pagination(memory, request_context):
    """An offset skips that many documents while ``total`` keeps reporting the full count."""
    bank_id = f"test_list_docs_offset_{datetime.now(timezone.utc).timestamp()}"

    try:
        for index in range(4):
            await _retain_doc(memory, bank_id, f"doc-{index:02d}", [], request_context)

        # Baseline: fetch everything in one page so later pages can be compared against it.
        full_listing = await memory.list_documents(
            bank_id=bank_id, limit=10, offset=0, request_context=request_context
        )
        assert full_listing["total"] == 4
        assert len(full_listing["items"]) == 4
        baseline_ids = [doc["id"] for doc in full_listing["items"]]

        # Skipping two of the four documents must leave exactly the tail of the baseline.
        second_page = await memory.list_documents(
            bank_id=bank_id, limit=10, offset=2, request_context=request_context
        )
        assert second_page["total"] == 4  # total is independent of the offset
        assert len(second_page["items"]) == 2
        assert [doc["id"] for doc in second_page["items"]] == baseline_ids[2:]

        # An offset past the end yields no items, but the total is still reported.
        past_end = await memory.list_documents(
            bank_id=bank_id, limit=10, offset=10, request_context=request_context
        )
        assert past_end["total"] == 4
        assert past_end["items"] == []

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)
55+
56+
57+
@pytest.mark.asyncio
async def test_list_documents_tags_filter_any_strict(memory, request_context):
    """``any_strict`` keeps only tagged documents carrying at least one requested tag."""
    bank_id = f"test_list_docs_tags_{datetime.now(timezone.utc).timestamp()}"
    seeded_documents = (
        ("doc-alpha", ["team-a"]),
        ("doc-beta", ["team-b"]),
        ("doc-both", ["team-a", "team-b"]),
        ("doc-untagged", []),
    )

    try:
        for doc_id, doc_tags in seeded_documents:
            await _retain_doc(memory, bank_id, doc_id, doc_tags, request_context)

        # Strict matching: untagged documents must not appear in the result.
        listing = await memory.list_documents(
            bank_id=bank_id,
            tags=["team-a"],
            tags_match="any_strict",
            request_context=request_context,
        )
        returned_ids = {doc["id"] for doc in listing["items"]}
        assert returned_ids == {"doc-alpha", "doc-both"}
        assert listing["total"] == 2

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)
81+
82+
83+
@pytest.mark.asyncio
async def test_list_documents_tags_filter_any_includes_untagged(memory, request_context):
    """The non-strict ``any`` mode returns matching documents plus untagged ones."""
    bank_id = f"test_list_docs_tags_any_{datetime.now(timezone.utc).timestamp()}"
    seeded_documents = (
        ("doc-tagged", ["team-a"]),
        ("doc-other", ["team-b"]),
        ("doc-untagged", []),
    )

    try:
        for doc_id, doc_tags in seeded_documents:
            await _retain_doc(memory, bank_id, doc_id, doc_tags, request_context)

        listing = await memory.list_documents(
            bank_id=bank_id,
            tags=["team-a"],
            tags_match="any",
            request_context=request_context,
        )
        returned_ids = {doc["id"] for doc in listing["items"]}
        # Matching and untagged documents pass; a document tagged only with another tag does not.
        assert "doc-tagged" in returned_ids
        assert "doc-untagged" in returned_ids
        assert "doc-other" not in returned_ids

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)
107+
108+
109+
@pytest.mark.asyncio
async def test_list_documents_tags_filter_all_strict(memory, request_context):
    """``all_strict`` returns only documents that carry every requested tag."""
    bank_id = f"test_list_docs_tags_all_{datetime.now(timezone.utc).timestamp()}"
    seeded_documents = (
        ("doc-a-only", ["team-a"]),
        ("doc-a-and-b", ["team-a", "team-b"]),
        ("doc-untagged", []),
    )

    try:
        for doc_id, doc_tags in seeded_documents:
            await _retain_doc(memory, bank_id, doc_id, doc_tags, request_context)

        listing = await memory.list_documents(
            bank_id=bank_id,
            tags=["team-a", "team-b"],
            tags_match="all_strict",
            request_context=request_context,
        )
        returned_ids = {doc["id"] for doc in listing["items"]}
        # Only the document holding both tags qualifies.
        assert returned_ids == {"doc-a-and-b"}

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)
130+
131+
132+
@pytest.mark.asyncio
async def test_list_documents_no_tags_filter_returns_all(memory, request_context):
    """Passing ``tags=None`` applies no tag filtering, so every document is listed."""
    bank_id = f"test_list_docs_no_tags_{datetime.now(timezone.utc).timestamp()}"
    seeded_documents = (
        ("doc-tagged", ["team-a"]),
        ("doc-untagged", []),
    )

    try:
        for doc_id, doc_tags in seeded_documents:
            await _retain_doc(memory, bank_id, doc_id, doc_tags, request_context)

        listing = await memory.list_documents(
            bank_id=bank_id,
            tags=None,
            request_context=request_context,
        )
        returned_ids = {doc["id"] for doc in listing["items"]}
        # Tagged and untagged documents are both expected.
        assert returned_ids == {"doc-tagged", "doc-untagged"}

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)
151+
152+
153+
@pytest.mark.asyncio
async def test_list_documents_tags_and_search_query_combined(memory, request_context):
    """A document must satisfy both the ID search and the tag filter to be listed."""
    bank_id = f"test_list_docs_tags_q_{datetime.now(timezone.utc).timestamp()}"
    seeded_documents = (
        ("report-2024", ["team-a"]),
        ("report-2025", ["team-b"]),
        ("summary-2024", ["team-a"]),
    )

    try:
        for doc_id, doc_tags in seeded_documents:
            await _retain_doc(memory, bank_id, doc_id, doc_tags, request_context)

        listing = await memory.list_documents(
            bank_id=bank_id,
            search_query="report",
            tags=["team-a"],
            tags_match="any_strict",
            request_context=request_context,
        )
        returned_ids = {doc["id"] for doc in listing["items"]}
        # "report-2025" fails the tag filter and "summary-2024" fails the ID search.
        assert returned_ids == {"report-2024"}

    finally:
        # Always drop the throwaway bank, even when an assertion fails.
        await memory.delete_bank(bank_id, request_context=request_context)

hindsight-cli/src/api.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ impl ApiClient {
300300
offset.map(|o| o as i64),
301301
q,
302302
None,
303+
None,
304+
None,
303305
).await?;
304306
Ok(response.into_inner())
305307
})

hindsight-clients/go/api/openapi.yaml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1173,14 +1173,38 @@ paths:
11731173
title: Bank Id
11741174
type: string
11751175
style: simple
1176-
- explode: true
1176+
- description: Case-insensitive substring filter on document ID (e.g. 'report'
1177+
matches 'report-2024')
1178+
explode: true
11771179
in: query
11781180
name: q
11791181
required: false
11801182
schema:
11811183
nullable: true
11821184
type: string
11831185
style: form
1186+
- description: Filter documents by tags
1187+
explode: true
1188+
in: query
1189+
name: tags
1190+
required: false
1191+
schema:
1192+
items:
1193+
type: string
1194+
nullable: true
1195+
type: array
1196+
style: form
1197+
- description: "How to match tags: 'any', 'all', 'any_strict', 'all_strict'"
1198+
explode: true
1199+
in: query
1200+
name: tags_match
1201+
required: false
1202+
schema:
1203+
default: any_strict
1204+
description: "How to match tags: 'any', 'all', 'any_strict', 'all_strict'"
1205+
title: Tags Match
1206+
type: string
1207+
style: form
11841208
- explode: true
11851209
in: query
11861210
name: limit

0 commit comments

Comments
 (0)