Skip to content

Commit 87d4a36

Browse files
authored
fix: memories are sometimes extracted in the wrong language (#184)
1 parent 0bf85a3 commit 87d4a36

File tree

2 files changed

+160
-1
lines changed

2 files changed

+160
-1
lines changed

hindsight-api/hindsight_api/engine/retain/fact_extraction.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ def _chunk_conversation(turns: list[dict], max_chars: int) -> list[str]:
441441
# Concise extraction prompt (default) - selective, high-quality facts
442442
CONCISE_FACT_EXTRACTION_PROMPT = """Extract SIGNIFICANT facts from text. Be SELECTIVE - only extract facts worth remembering long-term.
443443
444-
LANGUAGE RULE (CRITICAL): Output facts in the EXACT SAME language as the input text. If input is Japanese, output Japanese. If input is Chinese, output Chinese. NEVER translate to English. Preserve original language completely.
444+
LANGUAGE REQUIREMENT: Detect the language of the input text. All extracted facts, entity names, descriptions, and other output MUST be in the SAME language as the input. Do not translate to another language.
445445
446446
{fact_types_instruction}
447447

hindsight-api/tests/test_multilingual.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,165 @@ async def test_retain_japanese_content(memory, request_context):
275275
pass
276276

277277

278+
@pytest.mark.asyncio
async def test_english_content_stays_english(memory, request_context):
    """
    Test that English content is NOT incorrectly translated to Japanese or Chinese.

    This test specifically catches the bug where the language instruction in the
    CONCISE extraction prompt mentioned Japanese/Chinese explicitly, which primed
    the LLM to sometimes output facts in those languages even for English input.

    The check is a per-fact character ratio: kana plus CJK ideographs must make up
    less than 10% of each recalled fact's text.

    See: https://github.com/vectorize-io/hindsight/issues/181
    """
    bank_id = f"test_english_retain_{datetime.now(timezone.utc).timestamp()}"

    try:
        # English content about a developer
        english_content = """
John Smith is a software engineer at TechCorp in Seattle.
He specializes in machine learning and has been working on
recommendation systems for the past three years.
Last month, he launched a new feature that improved click-through rates by 25%.
He prefers working in Python and uses PyTorch for model training.
"""

        unit_ids = await memory.retain_async(
            bank_id=bank_id,
            content=english_content,
            context="Team profile",
            event_date=datetime(2024, 1, 15, tzinfo=timezone.utc),
            request_context=request_context,
        )

        logger.info(f"Retained {len(unit_ids)} facts from English content")
        assert len(unit_ids) > 0, "Should have extracted facts from English content"

        # Recall with English query
        result = await memory.recall_async(
            bank_id=bank_id,
            query="Tell me about John Smith",
            budget=Budget.MID,
            max_tokens=1000,
            fact_type=["world"],
            request_context=request_context,
        )

        assert len(result.results) > 0, "Should recall facts about John Smith"

        # Verify facts are NOT in Japanese or Chinese
        for fact in result.results:
            logger.info(f"Fact: {fact.text}")

            # Count Japanese kana (hiragana U+3040-U+309F, katakana U+30A0-U+30FF)
            japanese_chars = sum(
                1 for char in fact.text
                if ("\u3040" <= char <= "\u309f") or ("\u30a0" <= char <= "\u30ff")
            )

            # Count CJK Unified Ideographs (U+4E00-U+9FFF).
            # Note: this range is shared by Chinese hanzi and Japanese kanji.
            cjk_chars = sum(1 for char in fact.text if "\u4e00" <= char <= "\u9fff")

            # For English input, there should be minimal CJK characters.
            # Allow for occasional edge cases (e.g., proper nouns) but not full translation.
            # Kana MUST be part of the ratio: a Japanese rendering of this content can be
            # written almost entirely in katakana/hiragana (transliterated names and
            # loanwords) with few or no ideographs, which an ideograph-only ratio
            # would score as 0% and let through.
            total_chars = len(fact.text)
            cjk_ratio = (cjk_chars + japanese_chars) / max(total_chars, 1)

            assert cjk_ratio < 0.1, (
                f"English content was incorrectly translated to CJK language! "
                f"CJK ratio: {cjk_ratio:.1%}, Japanese chars: {japanese_chars}, CJK chars: {cjk_chars}. "
                f"Fact: {fact.text}"
            )

        logger.info("English content test passed - facts stayed in English")

    finally:
        # Always remove the per-run bank so repeated runs do not accumulate data
        await memory.delete_bank(bank_id, request_context=request_context)
353+
354+
355+
@pytest.mark.asyncio
async def test_italian_content_stays_italian(memory, request_context):
    """
    Verify that Italian input does not come back as Japanese or Chinese facts.

    Companion to the English test: it guards against non-CJK languages being
    translated because of a biased language instruction in the prompt.
    Each recalled fact must contain less than 10% kana + CJK ideographs, and
    the combined fact text must contain at least one expected Italian or
    English keyword (an English rendering is tolerated here).

    See: https://github.com/vectorize-io/hindsight/issues/181
    """

    def _is_ideograph(ch):
        # CJK Unified Ideographs block
        return "\u4e00" <= ch <= "\u9fff"

    def _is_kana(ch):
        # Hiragana block or Katakana block
        return ("\u3040" <= ch <= "\u309f") or ("\u30a0" <= ch <= "\u30ff")

    bank_id = f"test_italian_retain_{datetime.now(timezone.utc).timestamp()}"

    try:
        # Short Italian biography of a chef
        italian_content = """
Marco Rossi è uno chef italiano che lavora in un ristorante a Milano.
È specializzato nella cucina toscana e ha vinto tre premi gastronomici.
Il mese scorso ha aperto un nuovo ristorante nel centro della città.
Preferisce usare ingredienti freschi e locali per i suoi piatti.
"""

        retain_kwargs = dict(
            bank_id=bank_id,
            content=italian_content,
            context="Profilo dello chef",
            event_date=datetime(2024, 1, 15, tzinfo=timezone.utc),
            request_context=request_context,
        )
        unit_ids = await memory.retain_async(**retain_kwargs)

        logger.info(f"Retained {len(unit_ids)} facts from Italian content")
        assert len(unit_ids) > 0, "Should have extracted facts from Italian content"

        # Query in Italian as well
        recall_kwargs = dict(
            bank_id=bank_id,
            query="Dimmi di Marco Rossi",  # "Tell me about Marco Rossi"
            budget=Budget.MID,
            max_tokens=1000,
            fact_type=["world"],
            request_context=request_context,
        )
        result = await memory.recall_async(**recall_kwargs)

        assert len(result.results) > 0, "Should recall facts about Marco Rossi"

        # No fact may be predominantly Japanese or Chinese script
        for fact in result.results:
            logger.info(f"Fact: {fact.text}")

            ideograph_count = sum(map(_is_ideograph, fact.text))
            kana_count = sum(map(_is_kana, fact.text))

            length = len(fact.text)
            cjk_ratio = (ideograph_count + kana_count) / max(length, 1)

            assert cjk_ratio < 0.1, (
                f"Italian content was incorrectly translated to CJK language! "
                f"CJK ratio: {cjk_ratio:.1%}. Fact: {fact.text}"
            )

        # Basic sanity check on vocabulary across all recalled facts
        all_text = " ".join(item.text for item in result.results).lower()

        italian_indicators = ["marco", "rossi", "chef", "ristorante", "milano", "cucina", "italiano", "italiana"]
        has_italian = any(keyword in all_text for keyword in italian_indicators)

        # An English translation is accepted (not ideal, but not the bug under test)
        english_indicators = ["chef", "restaurant", "milan", "italian", "cooking"]
        has_english = any(keyword in all_text for keyword in english_indicators)

        assert has_italian or has_english, (
            f"Expected facts to be in Italian or English, but got neither. Facts: {all_text}"
        )

        logger.info("Italian content test passed - facts not translated to CJK")

    finally:
        # Clean up the per-run bank regardless of the outcome
        await memory.delete_bank(bank_id, request_context=request_context)
435+
436+
278437
@pytest.mark.asyncio
279438
async def test_mixed_language_entities(memory, request_context):
280439
"""

0 commit comments

Comments
 (0)