Skip to content

Commit ef9d3a1

Browse files
slayoffer and claude authored
fix: sanitize null bytes from text fields before PostgreSQL insertion (#238)
* fix: sanitize null bytes from text fields before PostgreSQL insertion

  Fixes 'invalid byte sequence for encoding UTF8: 0x00' error during batch retain

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: consolidate _sanitize_text into fact_extraction module

  Address review feedback: reuse existing _sanitize_text from fact_extraction instead of duplicating in fact_storage. The consolidated function now handles both:
  - Null bytes (\x00) for PostgreSQL compatibility
  - Unicode surrogates (U+D800-U+DFFF) for UTF-8/LLM API compatibility

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent d788a55 commit ef9d3a1

File tree

2 files changed

+18
-12
lines changed

2 files changed

+18
-12
lines changed

hindsight-api/hindsight_api/engine/retain/fact_extraction.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -57,21 +57,25 @@ def _infer_temporal_date(fact_text: str, event_date: datetime) -> str | None:
5757
return None
5858

5959

60-
def _sanitize_text(text: str) -> str:
60+
def _sanitize_text(text: str | None) -> str | None:
6161
"""
62-
Sanitize text by removing invalid Unicode surrogate characters.
62+
Sanitize text by removing characters that break downstream systems.
6363
64-
Surrogate characters (U+D800 to U+DFFF) are used in UTF-16 encoding
65-
but cannot be encoded in UTF-8. They can appear in Python strings
66-
from improperly decoded data (e.g., from JavaScript or broken files).
64+
Removes:
65+
- Null bytes (\\x00): Invalid in PostgreSQL UTF-8 encoding
66+
- Unicode surrogates (U+D800-U+DFFF): Invalid in UTF-8, break LLM APIs
6767
68-
This function removes unpaired surrogates to prevent UnicodeEncodeError
69-
when the text is sent to the LLM API.
68+
Surrogate characters are used in UTF-16 encoding but cannot be encoded
69+
in UTF-8. They can appear in Python strings from improperly decoded data
70+
(e.g., from JavaScript or broken files). Null bytes commonly appear in
71+
OCR output, PDF extraction, or copy-paste from binary sources.
7072
"""
73+
if text is None:
74+
return None
7175
if not text:
7276
return text
73-
# Remove surrogate characters (U+D800 to U+DFFF) using regex
74-
# These are invalid in UTF-8 and cause encoding errors
77+
# Remove null bytes and surrogate characters
78+
text = text.replace("\x00", "")
7579
return re.sub(r"[\ud800-\udfff]", "", text)
7680

7781

hindsight-api/hindsight_api/engine/retain/fact_storage.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import logging
99

1010
from ..memory_engine import fq_table
11+
from .fact_extraction import _sanitize_text
1112
from .types import ProcessedFact
1213

1314
logger = logging.getLogger(__name__)
@@ -47,7 +48,7 @@ async def insert_facts_batch(
4748
tags_list = []
4849

4950
for fact in facts:
50-
fact_texts.append(fact.fact_text)
51+
fact_texts.append(_sanitize_text(fact.fact_text))
5152
# Convert embedding to string for asyncpg vector type
5253
embeddings.append(str(fact.embedding))
5354
# event_date: Use occurred_start if available, otherwise use mentioned_at
@@ -56,7 +57,7 @@ async def insert_facts_batch(
5657
occurred_starts.append(fact.occurred_start)
5758
occurred_ends.append(fact.occurred_end)
5859
mentioned_ats.append(fact.mentioned_at)
59-
contexts.append(fact.context)
60+
contexts.append(_sanitize_text(fact.context))
6061
fact_types.append(fact.fact_type)
6162
# confidence_score is only for opinion facts
6263
confidence_scores.append(1.0 if fact.fact_type == "opinion" else None)
@@ -157,7 +158,8 @@ async def handle_document_tracking(
157158
"""
158159
import hashlib
159160

160-
# Calculate content hash
161+
# Sanitize and calculate content hash
162+
combined_content = _sanitize_text(combined_content) or ""
161163
content_hash = hashlib.sha256(combined_content.encode()).hexdigest()
162164

163165
# Always delete old document first if it exists (cascades to units and links)

0 commit comments

Comments
 (0)