In [None]:
!pip install langchain[all]
!pip install sentence-transformers
!pip install chromadb
!pip install streamlit
!pip install openai
!pip install pypdf
!pip install python-dotenv
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

from transformers import AutoTokenizer
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.vectorstores.pgvector import PGVector

# PDF files to ingest (absolute path; looks like a Colab Google Drive mount — adjust for other environments).
SOURCE_DOCUMENTS = ["/content/drive/MyDrive/GarageSber/documents/osnovy_prava_isakov_vb.pdf"]
# Name of the Chroma collection the chunks are stored in.
COLLECTION_NAME = "doc_index"
# Model name passed to HuggingFaceEmbeddings when building the index.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"


def main():
    """Run the ingestion pipeline: chunk SOURCE_DOCUMENTS, then embed and persist the chunks."""
    print("Ingesting...")
    chunks = ingest_docs(SOURCE_DOCUMENTS)

    print("Persisting...")
    # The returned vector-store handle is not used after indexing.
    generate_embed_index(chunks)

    print("Done.")


def ingest_docs(source_documents):
    """Split every PDF in ``source_documents`` into chunks and collect them.

    Args:
        source_documents: Iterable of PDF file paths.

    Returns:
        A single flat list with the chunks of all PDFs, in input order.
        An empty input yields an empty list.
    """
    all_docs = []
    for source_doc in source_documents:
        # Progress output: show which file is being processed.
        print(source_doc)
        # Extend in place instead of rebuilding the list with `+` each
        # iteration, which would copy all previously collected chunks.
        all_docs.extend(pdf_to_chunks(source_doc))
    return all_docs


def pdf_to_chunks(pdf_file, chunk_size=256, chunk_overlap=0):
    """Load a PDF and split it into token-bounded chunks.

    Args:
        pdf_file: Path of the PDF to load.
        chunk_size: Maximum chunk length, measured in tokens of the embedding
            model's tokenizer. Defaults to 256 because all-MiniLM-L6-v2
            truncates input beyond 256 word pieces, so larger chunks would be
            only partially embedded.
        chunk_overlap: Number of tokens shared between consecutive chunks.

    Returns:
        The list of chunk documents produced by ``PyPDFLoader.load_and_split``.
    """
    # Derive the tokenizer from EMBEDDING_MODEL so chunk lengths are measured
    # with the same vocabulary the embedder uses. Bare names are resolved
    # under the sentence-transformers organisation; full repo ids are kept.
    if "/" in EMBEDDING_MODEL:
        tokenizer_name = EMBEDDING_MODEL
    else:
        tokenizer_name = f"sentence-transformers/{EMBEDDING_MODEL}"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        # Separators are tried in order, from coarsest to finest.
        separators=["\n \n", "\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    loader = PyPDFLoader(pdf_file)
    return loader.load_and_split(text_splitter)


def generate_embed_index(docs):
    """Embed ``docs`` with EMBEDDING_MODEL and index them in Chroma under ./DBdir.

    Returns the vector store produced by ``create_index_chroma``.
    """
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return create_index_chroma(docs, embedder, "./DBdir")


def create_index_chroma(docs, embeddings, persist_dir):
    """Build a Chroma collection from ``docs`` and persist it to disk.

    Args:
        docs: Chunk documents to index.
        embeddings: Embedding function used to vectorise the chunks.
        persist_dir: Directory in which Chroma stores the collection.

    Returns:
        The Chroma vector store holding the indexed documents.
    """
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=persist_dir,
    )
    # Explicit persist() is only needed for Chroma < 0.4; newer versions write
    # automatically and the method is deprecated and slated for removal.
    # Call it only when the installed wrapper still provides it.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()
    return db

In [None]:
# Run the ingestion pipeline defined in the previous cell.
main()

Ingesting...
/content/drive/MyDrive/GarageSber/documents/osnovy_prava_isakov_vb.pdf
Persisting...
Done.


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.vectorstores.pgvector import PGVector

# Name of the Chroma collection to open from the persisted store.
COLLECTION_NAME = "doc_index"
# Model name passed to HuggingFaceEmbeddings; the query must be embedded with
# the same model that was used to create the persisted index.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"


def main():
    """Query the persisted index with a sample question and print every hit.

    NOTE: this reuses the name ``main`` from the ingestion cell, so running
    this cell rebinds ``main`` to the search routine.
    """
    # Must be the same embedding model that built the persisted index.
    store = get_embed_db(HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL))

    # Sample query for the similarity search.
    prompt = "Какие норма права действуют при задержании"
    print(f"Finding document matches for '{prompt}'")

    # Print the score, metadata and text of each matched chunk.
    for hit, score in store.similarity_search_with_score(prompt):
        print(f"\nSimilarity score (lower is better): {score}")
        print(hit.metadata)
        print(hit.page_content)


def get_embed_db(embeddings):
    """Open the Chroma store kept under ./DBdir using ``embeddings`` for queries."""
    return get_chroma_db(embeddings, "./DBdir")



def get_chroma_db(embeddings, persist_dir):
    """Return a Chroma handle for COLLECTION_NAME stored in ``persist_dir``.

    ``embeddings`` is the embedding function the store uses for queries.
    """
    store_options = {
        "embedding_function": embeddings,
        "collection_name": COLLECTION_NAME,
        "persist_directory": persist_dir,
    }
    return Chroma(**store_options)

In [None]:
# Run the similarity-search demo defined in the previous cell.
main()

Finding document matches for 'Какие норма права действуют при задержании'

Similarity score (lower is better): 0.6336705684661865
{'author': 'V2', 'creationdate': '2014-08-12T13:30:01+04:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2017-02-09T10:03:36+03:00', 'page': 207, 'page_label': '208', 'producer': 'Acrobat Distiller 8.3.1 (Windows)', 'source': '/content/drive/MyDrive/GarageSber/documents/osnovy_prava_isakov_vb.pdf', 'title': '<313334325FC8F1E0EAEEE25FCEF1EDEEE2FB20EFF0E0E2E05F556368652E2E2E>', 'total_pages': 480}
норм права
Правоотношения
Признаки правоотношений
Интеллектуальнаякарта11Д01

Similarity score (lower is better): 0.6420127153396606
{'author': 'V2', 'creationdate': '2014-08-12T13:30:01+04:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2017-02-09T10:03:36+03:00', 'page': 305, 'page_label': '306', 'producer': 'Acrobat Distiller 8.3.1 (Windows)', 'source': '/content/drive/MyDrive/GarageSber/documents/osnovy_prava_isakov_vb.pdf', 'title': '<3133343