fix: sanitize null bytes from text fields before PostgreSQL insertion (#238)

slayoffer · claude · web-flow · commit ef9d3a15cb05 · 2026-01-31T09:16:23.000+01:00
* fix: sanitize null bytes from text fields before PostgreSQL insertion

Fixes 'invalid byte sequence for encoding UTF8: 0x00' error during batch retain

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: consolidate _sanitize_text into fact_extraction module

Address review feedback: reuse existing _sanitize_text from fact_extraction
instead of duplicating in fact_storage.

The consolidated function now handles both:
- Null bytes (\x00) for PostgreSQL compatibility
- Unicode surrogates (U+D800-U+DFFF) for UTF-8/LLM API compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/hindsight-api/hindsight_api/engine/retain/fact_extraction.py b/hindsight-api/hindsight_api/engine/retain/fact_extraction.py
@@ -57,21 +57,25 @@ def _infer_temporal_date(fact_text: str, event_date: datetime) -> str | None:
     return None
 
 
-def _sanitize_text(text: str) -> str:
+def _sanitize_text(text: str | None) -> str | None:
     """
-    Sanitize text by removing invalid Unicode surrogate characters.
+    Sanitize text by removing characters that break downstream systems.
 
-    Surrogate characters (U+D800 to U+DFFF) are used in UTF-16 encoding
-    but cannot be encoded in UTF-8. They can appear in Python strings
-    from improperly decoded data (e.g., from JavaScript or broken files).
+    Removes:
+    - Null bytes (\\x00): Invalid in PostgreSQL UTF-8 encoding
+    - Unicode surrogates (U+D800-U+DFFF): Invalid in UTF-8, break LLM APIs
 
-    This function removes unpaired surrogates to prevent UnicodeEncodeError
-    when the text is sent to the LLM API.
+    Surrogate characters are used in UTF-16 encoding but cannot be encoded
+    in UTF-8. They can appear in Python strings from improperly decoded data
+    (e.g., from JavaScript or broken files). Null bytes commonly appear in
+    OCR output, PDF extraction, or copy-paste from binary sources.
     """
+    if text is None:
+        return None
     if not text:
         return text
-    # Remove surrogate characters (U+D800 to U+DFFF) using regex
-    # These are invalid in UTF-8 and cause encoding errors
+    # Remove null bytes and surrogate characters
+    text = text.replace("\x00", "")
     return re.sub(r"[\ud800-\udfff]", "", text)
 
 
diff --git a/hindsight-api/hindsight_api/engine/retain/fact_storage.py b/hindsight-api/hindsight_api/engine/retain/fact_storage.py
@@ -8,6 +8,7 @@
 import logging
 
 from ..memory_engine import fq_table
+from .fact_extraction import _sanitize_text
 from .types import ProcessedFact
 
 logger = logging.getLogger(__name__)
@@ -47,7 +48,7 @@ async def insert_facts_batch(
     tags_list = []
 
     for fact in facts:
-        fact_texts.append(fact.fact_text)
+        fact_texts.append(_sanitize_text(fact.fact_text))
         # Convert embedding to string for asyncpg vector type
         embeddings.append(str(fact.embedding))
         # event_date: Use occurred_start if available, otherwise use mentioned_at
@@ -56,7 +57,7 @@ async def insert_facts_batch(
         occurred_starts.append(fact.occurred_start)
         occurred_ends.append(fact.occurred_end)
         mentioned_ats.append(fact.mentioned_at)
-        contexts.append(fact.context)
+        contexts.append(_sanitize_text(fact.context))
         fact_types.append(fact.fact_type)
         # confidence_score is only for opinion facts
         confidence_scores.append(1.0 if fact.fact_type == "opinion" else None)
@@ -157,7 +158,8 @@ async def handle_document_tracking(
     """
     import hashlib
 
-    # Calculate content hash
+    # Sanitize and calculate content hash
+    combined_content = _sanitize_text(combined_content) or ""
     content_hash = hashlib.sha256(combined_content.encode()).hexdigest()
 
     # Always delete old document first if it exists (cascades to units and links)