In [48]:
import chromadb
import pymupdf4llm
from langchain_text_splitters import RecursiveCharacterTextSplitter

from util.pdf_to_text import clean_text

text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500,
                                               separators=["\n\n", "\n", ". ", " ", ""], )
client = chromadb.Client()

client.delete_collection("Standardni-operativni-postupci-u-zdravstvenoj-njezi-1")
collection = client.get_or_create_collection("Standardni-operativni-postupci-u-zdravstvenoj-njezi-1",
                                             metadata={"hnsw:space": "cosine"})
# Convert PDF to Markdown
pages = pymupdf4llm.to_markdown("Standardni-operativni-postupci-u-zdravstvenoj-njezi-1.pdf")
# Clean the text using the clean_text function
full_text = clean_text(pages)
##full_text = pages

# Create chunks using MarkdownTextSplitter on the entire cleaned full_text
chunks = text_splitter.create_documents([full_text])

print(len(chunks))

Processing Standardni-operativni-postupci-u-zdravstvenoj-njezi-1.pdf...
[Document(metadata={}, page_content='HRVATSKA KOMORA MEDICINSKIH SESTARA Zagreb, prosinac, 2022. Izdavač: HRVATSKA KOMORA MEDICINSKIH SESTARA STANDARDNI OPERATIVNI POSTUPCI U ZDRAVSTVENOJ NJEZI Urednik: Mario Gazić, mag. med. techn. Autori: Željka Benceković, uni. mag. admin. sanit, mag. med techn. Ivica Benko, dipl. med.techn., ECDSAP Marija Bukvić, prof. reh., mag. med. techn. Doc. dr. sc. Sonja Kalauz, prof. v. š. Dr. sc. Vesna Konjevoda, mag. med. techn. Marija Milić, univ. mag. admin. sanit., dipl. med. techn. Recenzenti: Doc. dr. sc. Sandra Bošković, prof. reh., bacc.med.teh., izv.prof.dr.sc. Robert Lovrić, mag. med. techn. Izv.prof.dr.sc. Marijana Neuberg, mag. med. techn. Lektori: Zrinka Šućur, profesorica hrvatskog jezika i književnosti Za izdavača: Mario Gazić, mag. med. techn. Izdanje: II izdanje (izmjene i dopune: Standardizirani postupci u zdravstvenoj njezi. Zagreb: Hrvatska komora medicinskih sestara

In [53]:
print(len(chunks))

149


In [52]:
# Required imports
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer

token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0,
    tokens_per_chunk=384,
    model_name="sentence-transformers/all-mpnet-base-v2"
)

ids = []
token_split_texts = []
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Process each text chunk
for page_index, text in enumerate(chunks):
    # Split the text content of each page
    split_texts = token_splitter.split_text(text.page_content)
    token_split_texts.extend(split_texts)  # Add split texts to the main list
    ids.extend([f"{page_index}_{i}" for i in range(len(split_texts))])  # Unique IDs for each chunk

# Generate embeddings for each split text chunk
embeddings = model.encode(token_split_texts)

# Add split texts, ids, and embeddings to the Chroma collection
collection.add(
    ids=ids,  # Unique identifiers for each chunk
    documents=token_split_texts,  # The actual text chunks
    embeddings=embeddings.tolist()  # Generated embeddings for each chunk
)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Batches: 100%|██████████| 25/25 [03:20<00:00,  8.00s/it]


In [60]:
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import \
    SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-mpnet-base-v2")
embed_query = embedding_function("Upis i identifikacija bolesnika prilikom prijma u bolnicu")
result = collection.query(embed_query, n_results=1)

print(result)


Batches: 100%|██████████| 2/2 [00:00<00:00, 13.61it/s]


{'ids': [['46_2'], ['126_0'], ['19_5'], ['9_1'], ['36_4'], ['19_5'], ['36_4'], ['19_5'], ['25_2'], ['36_4'], ['25_3'], ['65_3'], ['19_5'], ['46_2'], ['19_5'], ['147_1'], ['25_2'], ['25_2'], ['19_5'], ['1_8'], ['25_2'], ['36_4'], ['7_2'], ['117_1'], ['9_2'], ['36_4'], ['9_1'], ['25_3'], ['19_5'], ['147_1'], ['25_2'], ['36_4'], ['126_0'], ['70_2'], ['19_5'], ['9_2'], ['19_5'], ['147_1'], ['117_1'], ['109_2'], ['36_4'], ['126_0'], ['70_2'], ['19_5'], ['1_8'], ['109_2'], ['25_2'], ['36_4'], ['46_2'], ['36_4'], ['7_2'], ['117_1'], ['9_2'], ['25_3'], ['19_5'], ['25_2'], ['46_2']], 'embeddings': None, 'documents': [['##stvo za masazu treba nanijeti na trticu i gluteuse te ga laganim polukruznim ili kruznim _ _ pokretima utrljavati u kozu. _ • ukloniti zastitnu kompresu ili rucnik ispod bolesnika i vratiti bolesnika u prvobitan polozaj • prekriti bolesnika plahtom, a ruke ostaviti slobodne • staviti zastitnu kompresu ispod jedne ruke • nanijeti sredstvo za masazu na ruku i lakat te ga laganim 