In [4]:
import shutil
import json

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from pathlib import Path
from tqdm import tqdm

In [5]:
with open("data/docs.json", "r") as f:
    loaded_docs_json = json.load(f)

chunks = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in loaded_docs_json
]

print(f"Restored chunks from json: {len(chunks):_}")

Restored chunks from json: 279


In [6]:
embeddings = OllamaEmbeddings(model="nomic-embed-text-v2-moe")

def get_batches(chunks, batch_size=1024):
    for i in range(0, len(chunks), batch_size):
        yield chunks[i:i+batch_size]

In [8]:
print(chunks[100])

page_content='включить учителей и других работников учебных заведений в группы, подлежащие первоочередной вакцинации против covid-19 и ревакцинации;' metadata={'lang': 'ru', 'tr_lang': 'de', 'tr_text': 'lehrer und anderes schulisches personal zu vorrangigen gruppen für erst- und auffrischungsimpfungen gegen covid-19 machen.', 'file_id': 1}


In [9]:
# In memory vector store for experiments, limited chunks[:20_000]

vectorstore = Chroma(
    collection_name="in_memory_docs",  
    embedding_function=embeddings,
    persist_directory=None
)

if vectorstore._collection.count() == 0:
    for batch in tqdm(get_batches(chunks[:20_000]), desc="Adding documents", unit="batch"):
        vectorstore.add_documents(batch)

total_docs = vectorstore._collection.count()
print(f"Created In-Memory vectorstore. Found {total_docs} documents.")

Adding documents: 1batch [00:03,  3.65s/batch]

Created In-Memory vectorstore. Found 279 documents.





In [10]:
# Persistent vector db, it takes ~ 30 minutes (on my MacBook Pro M4) to add 232588 chunks

db_name = 'translation_db'
db_path = Path(db_name)

vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

total_docs = vectorstore._collection.count()
if total_docs == 0:
    for batch in tqdm(get_batches(chunks), desc="Adding documents", unit="batch"):
        vectorstore.add_documents(batch)
    print(f"Added {vectorstore._collection.count()} documents.")

total_docs = vectorstore._collection.count()
print(f"Found {total_docs} documents in Chroma vectorstore `{db_name}`")

Found 461506 documents in Chroma vectorstore `translation_db`


In [11]:
# After destroying it is required to restart the Kernel to create a new db with the same path
db_name = 'translation_db'
def destroy_persistent_vector_store():
    shutil.rmtree(db_name, ignore_errors=True)
# destroy_persistent_vector_store()

In [12]:
def metadata_to_string(metadata):
    s = ""
    for key in sorted(metadata.keys()):
        s += f"{key}: {metadata[key]}\n"
    return s

In [13]:
with open("data/file_mapping.json", "r") as f:
    file_mapping = json.load(f)

In [14]:
query = "Когда же восходящее солнце пробило туман и осветило мертвеца"
results = vectorstore.similarity_search_with_score(
    query=query, 
    k=3
    # filter={"lang":'ru'}
)

for doc, score in results:
    print(f"""[Score={score:3f}]\n{doc.page_content}\n{metadata_to_string(doc.metadata)}""")
    file_path = [f["path"] for f in file_mapping if f["id"] == doc.metadata["file_id"]]
    print(f"source: {file_path}\n\n")
    

[Score=0.193380]
Когда же восходящее солнце пробило туман и осветило мертвеца, ему стало не по себе.
file_id: 19
lang: ru
tr_lang: de
tr_text: Als die aufgehende Sonne durch den Nebel brach und den Toten beschien, war ihm das unangenehm.

source: []


[Score=0.771460]
Она исчезла, как исчезает туман под лучами восходящего солнца.
file_id: 25207
lang: ru
tr_lang: de
tr_text: Verflüchtigt, wie Nebel in der steigenden Sonne.

source: []


[Score=0.794044]
Улетучилась, как утренний туман под лучами восходящего солнца.
file_id: 101995
lang: ru
tr_lang: de
tr_text: Verflüchtigt, wie Nebel in der steigenden Sonne.

source: []


