In [1]:
import xml.etree.ElementTree as ET
import shutil
import os
import random

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from pathlib import Path
from tqdm import tqdm

In [2]:
def parse_tmx(path):
    """Extract two-language segment pairs from a TMX file.

    Every ``<tu>`` element in the document is inspected. For each of its direct
    ``<tuv>`` children the ``xml:lang`` attribute and the text of the ``<seg>``
    child are collected; a ``<tuv>`` without a ``<seg>``, or whose ``<seg>`` has
    no text, is ignored.

    Args:
        path: Location of the TMX file; handed to ``ET.parse``.

    Returns:
        A list with one entry per translation unit that produced exactly two
        usable segments. Each entry is a two-element list of
        ``{"lang": ..., "text": ...}`` dicts in document order. ``lang`` is
        ``None`` when the ``xml:lang`` attribute is missing. Units with any
        other number of usable segments are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    xml_lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"
    root = ET.parse(path).getroot()
    result = []

    for tu in root.findall(".//tu"):
        # Look the variants up once and reuse them for both the check and the loop.
        tuvs = tu.findall("tuv")
        if len(tuvs) != 2:
            # Include the file so the warning can be traced back to its source.
            print(f"Warning: {len(tuvs)} tuvs in {path}")

        content = []
        for tuv in tuvs:
            seg = tuv.find("seg")
            # NOTE: seg.text only holds the text that precedes the first child
            # element of <seg>, so segments with inline markup are truncated there.
            if seg is not None and seg.text is not None:
                content.append({"lang": tuv.attrib.get(xml_lang_attr), "text": seg.text})

        # Only complete source/target pairs are useful downstream.
        if len(content) == 2:
            result.append(content)

    return result

In [3]:
def create_documents_from_xml(root_dir):
    """Walk ``root_dir`` and turn every TMX segment pair into a document dict.

    All files whose name ends in ``.tmx`` (case-insensitive) are parsed with
    ``parse_tmx``. The first segment of each pair becomes the page content and
    the second one is stored as its translation.

    Args:
        root_dir: Directory that is searched recursively for TMX files.

    Returns:
        A list of dicts shaped like::

            {
                "page_content": <text of the first segment>,
                "metadata": {
                    "lang": <language of the first segment>,
                    "file_path": <path of the TMX file>,
                    "translation_lang": <language of the second segment>,
                    "translation_text": <text of the second segment>,
                    "lang_dir": <first folder below root_dir, "" if none>,
                    "folders": <remaining folders joined with ",">,
                },
            }

        Files that are not well-formed XML are reported with ``print`` and
        skipped; a pair that cannot be converted is reported and skipped too.
    """
    documents = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting dirnames in place fixes the order in which os.walk descends,
        # so the resulting document order does not depend on the filesystem.
        dirnames.sort()
        tmx_files = sorted(f for f in filenames if f.lower().endswith('.tmx'))
        if not tmx_files:
            continue

        rel_path = os.path.relpath(dirpath, root_dir)
        folder_hierarchy = [] if rel_path == "." else rel_path.split(os.sep)

        # Files located directly in root_dir have no enclosing language folder.
        # Indexing folder_hierarchy[0] there raised IndexError for every pair,
        # which silently dropped the whole file; use an empty marker instead.
        lang_dir = folder_hierarchy[0] if folder_hierarchy else ""
        folders = ",".join(folder_hierarchy[1:])

        for tmx_file in tmx_files:
            file_path = os.path.join(dirpath, tmx_file)
            try:
                content_list = parse_tmx(file_path)
            except ET.ParseError as pe:
                print(f"Failed to process '{file_path}': {pe}\n")
                continue

            for content in content_list:
                # Best effort: one malformed pair must not abort the whole import.
                try:
                    source, target = content[0], content[1]
                    documents.append({
                        "page_content": source["text"],
                        "metadata": {
                            "lang": source["lang"],
                            "file_path": file_path,
                            "translation_lang": target["lang"],
                            "translation_text": target["text"],
                            "lang_dir": lang_dir,
                            "folders": folders,
                        },
                    })
                except Exception as e:
                    print(f"Failed to create doc\nFile: {file_path}\n with content\n: {content}")
                    print(f"Error: {e}\n\n")

    return documents

In [4]:
# Parse every TMX file below ./documents into plain dicts (page_content + metadata).
doc_list = create_documents_from_xml(root_dir="documents")


In [None]:
# Sanity check: inspect very short source segments and their translations.
MAX_SHORT_LEN = 4    # segments with fewer characters than this count as "short"
PREVIEW_LIMIT = 50   # upper bound on printed pairs so the cell output stays readable

docs_filtered = [d for d in doc_list if len(d["page_content"]) < MAX_SHORT_LEN]
print(f"{len(docs_filtered)} documents with fewer than {MAX_SHORT_LEN} characters")

# Bounded preview instead of dumping the complete list into the notebook output.
print([(d['page_content'], d['metadata']['translation_text']) for d in docs_filtered[:PREVIEW_LIMIT]])

if docs_filtered:
    # random.choice is the direct way to draw a single element.
    doc = random.choice(docs_filtered)
    print(f"{doc['page_content']}\n{doc['metadata']}")

In [5]:
# Wrap each plain dict in a Document, copying the metadata fields explicitly
# so that only the known keys end up in the vector store.
documents = [
    Document(
        page_content=item["page_content"],
        metadata={
            key: item["metadata"][key]
            for key in (
                "lang",
                "file_path",
                "translation_lang",
                "translation_text",
                "lang_dir",
                "folders",
            )
        },
    )
    for item in doc_list
]

print(f"Created {len(documents)} documents.")

Created 231377 documents.


In [None]:
# Spot check: show the document at index 200_000 (prints nothing if the list is shorter).
for sample_doc in documents[200_000:200_001]:
    print(sample_doc.page_content, sample_doc.metadata, "\n", sep="\n")

In [6]:
# Split the documents into chunks; each chunk keeps its document's metadata.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_documents(documents)
print(f"From all {len(documents)} documents {len(chunks)} chunks created.")

From all 231377 documents 232588 chunks created.


In [7]:
# Embedding function passed to the Chroma vector stores created further down.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
)

def get_batches(chunks, batch_size=512):
    """Yield consecutive slices of ``chunks`` with at most ``batch_size`` items.

    The last slice is shorter when ``len(chunks)`` is not a multiple of
    ``batch_size``; an empty ``chunks`` yields nothing.
    """
    starts = range(0, len(chunks), batch_size)
    yield from (chunks[start:start + batch_size] for start in starts)

In [None]:
# In-memory vector store for experiments, limited to the first IN_MEMORY_LIMIT chunks.
# NOTE: this cell and the persistent-db cell both bind the name `vectorstore`;
# whichever one ran last is the store that later cells query.
IN_MEMORY_LIMIT = 20_000
IN_MEMORY_BATCH_SIZE = 512

vectorstore = Chroma(
    collection_name="in_memory_docs",
    embedding_function=embeddings,
    persist_directory=None
)

if vectorstore._collection.count() == 0:
    experiment_chunks = chunks[:IN_MEMORY_LIMIT]
    # A generator has no length, so tell tqdm how many batches to expect;
    # otherwise it cannot show a percentage or a remaining-time estimate.
    n_batches = -(-len(experiment_chunks) // IN_MEMORY_BATCH_SIZE)  # ceiling division
    for batch in tqdm(
        get_batches(experiment_chunks, IN_MEMORY_BATCH_SIZE),
        total=n_batches,
        desc="Adding documents",
        unit="batch",
    ):
        vectorstore.add_documents(batch)

total_docs = vectorstore._collection.count()
print(f"Created In-Memory vectorstore. Found {total_docs} documents.")

In [8]:
# Persistent vector db; adding 232588 chunks takes ~30 minutes on a MacBook Pro M4.
# The store is only populated when it is empty, so re-running this cell is cheap.

db_name = 'translation_db'
db_path = Path(db_name)
PERSIST_BATCH_SIZE = 512

vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

total_docs = vectorstore._collection.count()
if total_docs == 0:
    # A generator has no length, so tell tqdm how many batches to expect;
    # otherwise it cannot show a percentage or a remaining-time estimate.
    n_batches = -(-len(chunks) // PERSIST_BATCH_SIZE)  # ceiling division
    for batch in tqdm(
        get_batches(chunks, PERSIST_BATCH_SIZE),
        total=n_batches,
        desc="Adding documents",
        unit="batch",
    ):
        vectorstore.add_documents(batch)
    print(f"Added {vectorstore._collection.count()} documents.")
elif total_docs != len(chunks):
    # An interrupted earlier run leaves a partially filled store that the
    # emptiness check above would otherwise accept without a word.
    print(
        f"Warning: `{db_name}` holds {total_docs} documents but {len(chunks)} chunks "
        "were prepared; the store may be incomplete or stale."
    )

total_docs = vectorstore._collection.count()
print(f"Found {total_docs} documents in Chroma vectorstore `{db_name}`")

Adding documents: 455batch [24:15,  3.20s/batch]

Added 232588 documents.
Found 232588 documents in Chroma vectorstore `translation_db`





In [None]:
# Destructive maintenance cell. After destroying the db, the kernel has to be
# restarted before a new db can be created at the same path.
# The call is guarded so that "Run All" does not wipe the freshly built database.
DESTROY_PERSISTENT_DB = False  # set to True, run this cell once, then set it back

def destroy_persistent_vector_store():
    """Remove the directory ``db_name`` and everything in it.

    Errors raised during removal (including a missing directory) are ignored.
    """
    shutil.rmtree(db_name, ignore_errors=True)

if DESTROY_PERSISTENT_DB:
    destroy_persistent_vector_store()

In [10]:
def metadata_to_string(metadata):
    """Render a metadata mapping as ``key: value`` lines ordered by key.

    Every entry is terminated by a newline, so an empty mapping gives ``""``.
    """
    lines = [f"{name}: {metadata[name]}\n" for name in sorted(metadata.keys())]
    return "".join(lines)

In [13]:
# Example lookup: the three stored Russian segments closest to the query.
query = "Документы должны быть засекречены третьими лицами"
results = vectorstore.similarity_search_with_score(
    query=query,
    k=3,
    filter={"lang": "ru"}
)

# NOTE(review): the score looks like a distance (smaller = closer) — confirm
# against the Chroma collection's configured metric.
for doc, score in results:
    # `.3f` limits the score to three decimals; the former `3f` only set a
    # minimum width of 3 and therefore had no visible effect.
    print(f"[Score={score:.3f}]\n{doc.page_content}\n{metadata_to_string(doc.metadata)}\n\n")
    

[Score=0.170698]
Уставом общества может быть запрещено отчуждение долей третьим лицам.
file_path: documents/RU-DE/special language/official-business/Economics/Инвестиции в Смоленской области ОД  .tmx
folders: special language,official-business,Economics
lang: ru
lang_dir: RU-DE
translation_lang: de
translation_text: Das Recht zur Übertragung (einschließlich des Verkaufs) von Anteilen an dritte Personen kann durch die Satzung ausgeschlossen werden.



[Score=0.171978]
Такому должностному лицу (лицам) может быть предписано частично или полностью возместить потери Организации Объединенных Наций.
file_path: documents/RU-DE/special language/official-business/UN/Финансовые положения и правила Организации Объединенных Наций ОД С.tmx
folders: special language,official-business,UN
lang: ru
lang_dir: RU-DE
translation_lang: de
translation_text: Ist dies der Fall, so kann von dem Betroffenen verlangt werden, den Vereinten Nationen den Verlust teilweise oder in voller Höhe zu erstatten.



[Scor