In [113]:
import json
import os
import re
from typing import List, Tuple

from dotenv import load_dotenv
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore


# Load settings via python-dotenv before the environment lookups below
# (presumably from a local .env file — confirm where it lives for this notebook).
load_dotenv()
# Both collection names are mandatory: indexing os.environ raises KeyError
# right here if either variable is missing.
child_collection_name = os.environ["CHILD_COLLECTION_NAME"]
parent_collection_name = os.environ["PARENT_COLLECTION_NAME"]

"""
This file is responsible for loading a PDF file from Hugging Face, transforming it into an embedded version, and inserting it as a new collection into the Qdrant database.
"""

# [m for m in NVIDIAEmbeddings.get_available_models() if "embed" in m.id]


def ReadPdf(path_to_pdf: str) -> List[Document]:
    """
    Load a PDF with PyPDFLoader in "page" mode and tag each loaded page with
    the articles it covers.

    Every loaded page becomes a Document whose ``article_number`` metadata is
    a list of the numbers (as strings) captured from "Art. <number>" headings
    found in the page text. The match is case-sensitive, so lowercase in-text
    references such as "art. 77" are ignored, and only the digits are captured
    ("Art. 397a" yields "397").

    The last article seen on earlier pages is placed first in the list when
    the page does not mention it itself, because that article's text normally
    continues at the top of the following page. A page with no heading at all
    is therefore attributed to that last article only. Pages that come before
    the first heading in the file get an empty list.

    Args:
        path_to_pdf: Path to the PDF file to load.

    Returns:
        The tagged documents in loader order, or an empty list if a
        ValueError is raised while loading (the error is printed).
    """
    article_heading = re.compile(r"\bArt\.\s*(\d+)")
    try:
        loader = PyPDFLoader(file_path=path_to_pdf, mode="page")
        print("📄 Loading PDF file...")
        raw_docs = loader.load()

        documents = []
        # Number of the most recent article heading seen so far; None until
        # the first heading is found (e.g. while on a title page).
        last_article = None
        for doc in raw_docs:
            text = doc.page_content
            article_number = article_heading.findall(text)

            # The article that was still open when the previous page ended
            # continues onto this page, so list it first unless the page
            # already names it.
            if last_article is not None and last_article not in article_number:
                article_number.insert(0, last_article)

            # Only advance the marker when there is something to carry over;
            # this keeps leading pages without any heading from crashing.
            if article_number:
                last_article = article_number[-1]

            documents.append(Document(
                page_content=text,
                metadata={"article_number": article_number}
            ))

        return documents
    except ValueError as e:
        print(
            "\033[91mValueError in ReadPdf: Provided path does not lead to a file: \033[0m",
            e,
        )
        return []

In [114]:
# ReadPdf loads the PDF in "page" mode and tags each loaded page with article numbers;
# despite the variable name, nothing has been split into smaller chunks yet.
doc_splitted = ReadPdf("./data/prawod_wodne.pdf")

📄 Loading PDF file...


In [129]:
# NOTE(review): `retriver` is not defined in any cell visible here, and the
# execution count jumps from 114 to 129 — this cell relies on kernel state
# created elsewhere; confirm it survives Restart Kernel -> Run All.
# Looks up the raw value stored under one hard-coded document id.
retriver.docstore.store.mget(['8e8e52d1-3083-4e20-b200-7713ebd6e493'])

[b'{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "document", "Document"], "kwargs": {"metadata": {"article_number": ["397", "398"]}, "page_content": "\\u00a9Kancelaria Sejmu    s. 350/444 \\n \\n2024-01-24 \\n \\n6. Wniosek o  wydanie pozwolenia wodnoprawnego, oceny wodnoprawnej, \\ndecyzji, o  kt\\u00f3rych mowa w  art. 77 ust. 3 i  8 oraz w  art. 176 ust. 4, a  tak\\u017ce \\nzg\\u0142oszenie wodnoprawne sk\\u0142ada si\\u0119 do nadzoru wodnego w\\u0142a\\u015bciwego miejscowo \\nalbo najbli\\u017cszego dla zamierzonego kor zystania z  us\\u0142ug wodnych lub \\nwykonywania urz\\u0105dze\\u0144 wodnych, lub innej dzia\\u0142alno\\u015bci wymagaj\\u0105cej zgody \\nwodnoprawnej. \\nArt. 397a.  1. Wniosek o wydanie pozwolenia wodnoprawnego, oceny \\nwodnoprawnej, decyzji, o kt\\u00f3rych mowa w art. 77  ust. 3 i 8 oraz w art. 176 ust . 4, \\noraz o udzielenie przyrzeczenia wydania pozwolenia wodnoprawnego, a  tak\\u017ce \\nzg\\u0142oszenie wodnoprawne sk\\u0142ada si