@@ -275,6 +275,165 @@ async def test_retain_japanese_content(memory, request_context):
275275 pass
276276
277277
@pytest.mark.asyncio
async def test_english_content_stays_english(memory, request_context):
    """
    Test that English content is NOT incorrectly translated to Japanese or Chinese.

    This test specifically catches the bug where the language instruction in the
    CONCISE extraction prompt mentioned Japanese/Chinese explicitly, which primed
    the LLM to sometimes output facts in those languages even for English input.

    See: https://github.com/vectorize-io/hindsight/issues/181
    """
    bank_id = f"test_english_retain_{datetime.now(timezone.utc).timestamp()}"

    try:
        # English content about a developer
        english_content = """
        John Smith is a software engineer at TechCorp in Seattle.
        He specializes in machine learning and has been working on
        recommendation systems for the past three years.
        Last month, he launched a new feature that improved click-through rates by 25%.
        He prefers working in Python and uses PyTorch for model training.
        """

        unit_ids = await memory.retain_async(
            bank_id=bank_id,
            content=english_content,
            context="Team profile",
            event_date=datetime(2024, 1, 15, tzinfo=timezone.utc),
            request_context=request_context,
        )

        logger.info(f"Retained {len(unit_ids)} facts from English content")
        assert len(unit_ids) > 0, "Should have extracted facts from English content"

        # Recall with English query
        result = await memory.recall_async(
            bank_id=bank_id,
            query="Tell me about John Smith",
            budget=Budget.MID,
            max_tokens=1000,
            fact_type=["world"],
            request_context=request_context,
        )

        assert len(result.results) > 0, "Should recall facts about John Smith"

        # Verify facts are NOT in Japanese or Chinese
        for fact in result.results:
            logger.info(f"Fact: {fact.text}")

            # Count Japanese kana (hiragana U+3040-U+309F, katakana U+30A0-U+30FF)
            japanese_chars = sum(
                1
                for char in fact.text
                if ("\u3040" <= char <= "\u309f") or ("\u30a0" <= char <= "\u30ff")
            )

            # Count CJK unified ideographs (U+4E00-U+9FFF).
            # Note: kanji/CJK ideographs overlap between Chinese and Japanese,
            # so this range covers both languages.
            cjk_chars = sum(1 for char in fact.text if "\u4e00" <= char <= "\u9fff")

            # For English input, there should be minimal CJK characters.
            # Allow for occasional edge cases (e.g., proper nouns) but not full
            # translation. Kana must count toward the ratio too — otherwise a
            # kana-only (Japanese) translation would evade this check entirely.
            total_chars = len(fact.text)
            cjk_ratio = (cjk_chars + japanese_chars) / max(total_chars, 1)

            assert cjk_ratio < 0.1, (
                f"English content was incorrectly translated to CJK language! "
                f"CJK ratio: {cjk_ratio:.1%}, Japanese chars: {japanese_chars}, CJK chars: {cjk_chars}. "
                f"Fact: {fact.text}"
            )

        logger.info("English content test passed - facts stayed in English")

    finally:
        await memory.delete_bank(bank_id, request_context=request_context)
353+
354+
@pytest.mark.asyncio
async def test_italian_content_stays_italian(memory, request_context):
    """
    Test that Italian content is NOT incorrectly translated to Japanese or Chinese.

    Similar to the English test, this catches the bug where non-CJK languages
    could be incorrectly translated due to biased language instruction.

    See: https://github.com/vectorize-io/hindsight/issues/181
    """
    bank_id = f"test_italian_retain_{datetime.now(timezone.utc).timestamp()}"

    try:
        # Italian content about a chef
        italian_content = """
        Marco Rossi è uno chef italiano che lavora in un ristorante a Milano.
        È specializzato nella cucina toscana e ha vinto tre premi gastronomici.
        Il mese scorso ha aperto un nuovo ristorante nel centro della città.
        Preferisce usare ingredienti freschi e locali per i suoi piatti.
        """

        unit_ids = await memory.retain_async(
            bank_id=bank_id,
            content=italian_content,
            context="Profilo dello chef",
            event_date=datetime(2024, 1, 15, tzinfo=timezone.utc),
            request_context=request_context,
        )

        logger.info(f"Retained {len(unit_ids)} facts from Italian content")
        assert len(unit_ids) > 0, "Should have extracted facts from Italian content"

        # Recall with Italian query
        result = await memory.recall_async(
            bank_id=bank_id,
            query="Dimmi di Marco Rossi",  # "Tell me about Marco Rossi"
            budget=Budget.MID,
            max_tokens=1000,
            fact_type=["world"],
            request_context=request_context,
        )

        assert len(result.results) > 0, "Should recall facts about Marco Rossi"

        # Verify facts are NOT in Japanese or Chinese - should stay in Italian
        for fact in result.results:
            logger.info(f"Fact: {fact.text}")

            # Count CJK ideographs (U+4E00-U+9FFF) and Japanese kana
            # (hiragana U+3040-U+309F, katakana U+30A0-U+30FF)
            cjk_chars = sum(1 for char in fact.text if "\u4e00" <= char <= "\u9fff")
            japanese_chars = sum(
                1
                for char in fact.text
                if ("\u3040" <= char <= "\u309f") or ("\u30a0" <= char <= "\u30ff")
            )

            total_chars = len(fact.text)
            cjk_ratio = (cjk_chars + japanese_chars) / max(total_chars, 1)

            assert cjk_ratio < 0.1, (
                f"Italian content was incorrectly translated to CJK language! "
                f"CJK ratio: {cjk_ratio:.1%}. Fact: {fact.text}"
            )

        # Verify facts contain Italian words (basic sanity check)
        all_text = " ".join(f.text for f in result.results).lower()
        italian_indicators = ["marco", "rossi", "chef", "ristorante", "milano", "cucina", "italiano", "italiana"]
        has_italian = any(word in all_text for word in italian_indicators)

        # Allow English translation as acceptable (not ideal but not the bug)
        english_indicators = ["chef", "restaurant", "milan", "italian", "cooking"]
        has_english = any(word in all_text for word in english_indicators)

        assert has_italian or has_english, (
            f"Expected facts to be in Italian or English, but got neither. Facts: {all_text}"
        )

        logger.info("Italian content test passed - facts not translated to CJK")

    finally:
        await memory.delete_bank(bank_id, request_context=request_context)
435+
436+
278437@pytest .mark .asyncio
279438async def test_mixed_language_entities (memory , request_context ):
280439 """
0 commit comments