In [1]:
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TypedDict

import torch
from dotenv import load_dotenv
from FlagEmbedding import BGEM3FlagModel
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFacePipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.constants import END, START
from langgraph.graph import StateGraph
from qdrant_client import QdrantClient
from qdrant_client import models as qdrant_models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv(dotenv_path="../.env.dev", override=True)
QDRANT_API_KEY = os.getenv("QDRANT__SERVICE__API_KEY", "")

In [11]:
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"


@dataclass
class ChunkingConfig:
    chunk_size: int = 1024
    chunk_overlap: int = 256


@dataclass
class QdrantConfig:
    host: str = "localhost"
    port: int = 6333
    collection_name: str = "clinical_recs"
    rebuild_collection: bool = True
    vector_size: int = 1024
    distance = qdrant_models.Distance.COSINE
    api_key: str = QDRANT_API_KEY
    https: bool = False
    uploading_batch_size: int = 250


@dataclass
class EmbeddingConfig:
    model_name: str = "BAAI/bge-m3"
    batch_size: int = 128
    use_fp16: bool = True
    normalize_embeddings: bool = True
    max_length: int = 512
    return_dense: bool = True
    return_sparse: bool = False


@dataclass
class RetrievalConfig:
    top_k: int = 5
    overfetch_k: int = 5


@dataclass
class LLMConfig:
    model_name: str = "Qwen/Qwen2.5-3B-Instruct"
    dtype: torch.dtype = torch.float16
    max_new_tokens: int = 512
    temperature: float = 0.3
    top_p: float = 0.8
    do_sample: bool = True
    repetition_penalty: float = 1.2


@dataclass
class RAGConfig:
    name: str = "baseline"
    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    qdrant_config: QdrantConfig = field(default_factory=QdrantConfig)

In [12]:
def get_embedder(config: RAGConfig) -> BGEM3FlagModel:
    """
    Initializes and returns a model BGE-m3 (Boundary General Embedding m3) for extracting text
    embeddings.

    Args:
        config (RAGConfig): configuration dataclass.

    Returns:
        BGEM3FlagModel: Initialized embedder model, ready for use.
            - .encode(texts: list[str]) -> Embeddings vectors

    Example:
        >>> config = RAGConfig()
        >>> embedder = get_embedder(config)
        >>> texts = ["First document", "Second document"]
        >>> embeddings = embedder.encode(texts)

    Notes:
        - The BGE-m3 produces 1024 embeddings
        - Use FP16 for memory efficiency
        - Embedding normalization is required for similarity matching with cosine similarity
    """
    embedder = BGEM3FlagModel(
        model_name_or_path=config.embedding.model_name,
        use_fp16=config.embedding.use_fp16,
        normalize_embeddings=config.embedding.normalize_embeddings,
        batch_size=config.embedding.batch_size,
        devices=DEVICE,
        return_dense=config.embedding.return_dense,
        return_sparse=config.embedding.return_sparse,
    )
    embedder.model.model.to(DEVICE)
    return embedder


def get_llm_pipeline(config: RAGConfig):
    """
    Initializes and returns the LLM pipeline for text generation.

    Args:
        config (RAGConfig): configuration dataclass.

    Returns:
        HuggingFacePipeline: Initialized pipeline for text generation.
            - .invoke(prompt: str) -> str: Generates a response to a prompt
            - .batch([prompts]) -> list[str]: Batch-generated responses

    Example:
        >>> config = RAGConfig()
        >>> pipeline = get_llm_pipeline()
        >>> response = pipeline.invoke(
        ...     "What is machine learning?"
        ... )
    """
    return HuggingFacePipeline.from_model_id(
        model_id=config.llm.model_name,
        task="text-generation",
        model_kwargs={
            "dtype": config.llm.dtype,
            "device_map": DEVICE,
            "trust_remote_code": False,
            "attn_implementation": "eager",
        },
        pipeline_kwargs={
            "max_new_tokens": config.llm.max_new_tokens,
            "temperature": config.llm.temperature,
            "top_p": config.llm.top_p,
            "do_sample": config.llm.do_sample,
            "repetition_penalty": config.llm.repetition_penalty,
        },
    )

In [None]:
class QdrantManager:
    """
    Manages vector database operations with Qdrant for RAG systems.

    This class handles the complete lifecycle of managing documents in a Qdrant vector database:
    - Connection management and collection creation
    - Document loading, chunking, and embedding
    - Batch indexing of embeddings into Qdrant
    - Semantic search and document retrieval

    Notes:
        - Vectors are stored on disk (not in RAM)
        - Batch processing prevents OOM errors during indexing
        - Uses cosine distance metric for similarity search
    """

    def __init__(self, config: RAGConfig, embedder: BGEM3FlagModel):
        """
        Initialize the QdrantManager with configuration and embedder.

            Args:
                config (RAGConfig): Main RAG configuration dataclass
                    - config.chunking: Document chunking parameters (size, overlap)
                    - config.retrieval: Retrieval parameters (top_k)
                    - config.qdrant_config: Qdrant connection settings:
                        * host (str): Qdrant server host (default: "localhost")
                        * port (int): Qdrant server port (default: 6333)
                        * api_key (Optional[str]): API key for authentication
                        * https (bool): Use HTTPS for connection
                        * collection_name (str): Name of collection to create
                        * vector_size (int): Embedding dimension (1024 for BGE-m3)
                        * distance (str): Distance metric ("Cosine", "Euclidean", or "Manhattan")
                        * rebuild_collection (bool): Delete and recreate collection if exists
                        * uploading_batch_size (int): Batch size multiplier (default: 1)
                    - config.embedding: Embedding settings (max_length, batch_size)

                embedder (BGEM3FlagModel): Initialized BGE-m3 embedder model for converting texts
                    to embeddings. Must be already loaded via get_embedder().

            Notes:
                - Does NOT connect to Qdrant immediately (lazy connection)
                - Does NOT create collection (call _create_collection() or setup_qdrant())
                - Embedder must be already initialized
        """
        self.chunking_cfg = config.chunking
        self.retrieval_cfg = config.retrieval
        self.qdrant_cfg = config.qdrant_config
        self.qdrant_client = QdrantClient(
            host=self.qdrant_cfg.host,
            port=self.qdrant_cfg.port,
            api_key=self.qdrant_cfg.api_key,
            https=self.qdrant_cfg.https,
        )

        self.embedder = embedder
        self.max_length = config.embedding.max_length

        self.uploading_batch_size = config.qdrant_config.uploading_batch_size

    @staticmethod
    def sanitize_text(text):
        """Prevent encoding errors."""
        return text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    def _create_collection(self):
        """
        Create or verify a Qdrant collection for storing embeddings.

        Performs the following operations:
        1. Verifies connection to Qdrant server
        2. Optionally deletes existing collection (if rebuild_collection=True)
        3. Creates new collection with configured parameters
        4. Verifies collection was created successfully

        Args:
            None

        Returns:
            None

        Notes:
            - Collection name comes from config.qdrant_config.collection_name
            - If rebuild_collection=True and collection exists, it will be deleted
            - Vectors stored on disk to handle large document sets
        """
        # try qdrant connection
        try:
            health = self.qdrant_client.get_collections()
            print(f"✓ Qdrant is healthy. Collections: {len(health.collections)}")
        except Exception as e:
            raise RuntimeError(f"Cannot connect to Qdrant: {e}")  # noqa: B904

        # delete existing collection if it exists and rebuild_collection = True
        if (
            self.qdrant_client.collection_exists(self.qdrant_cfg.collection_name)
            and self.qdrant_cfg.rebuild_collection
        ):
            self.qdrant_client.delete_collection(self.qdrant_cfg.collection_name)
            print(f"Deleted existing collection '{self.qdrant_cfg.collection_name}'")

        # Create new collection if does not exist
        if not self.qdrant_client.collection_exists(self.qdrant_cfg.collection_name):
            self.qdrant_client.create_collection(
                collection_name=self.qdrant_cfg.collection_name,
                vectors_config=qdrant_models.VectorParams(
                    size=self.qdrant_cfg.vector_size,
                    distance=self.qdrant_cfg.distance,
                    on_disk=True,  # Store collection on SSD
                ),
            )

        print(f"✓ Created collection '{self.qdrant_cfg.collection_name}'")

    def __create_chunks(self, data_dir: Path):
        """
        Load PDF documents from directory and split them into chunks.

        Private method that handles document loading and chunking:
        1. Loads all PDF files from directory using PyPDFDirectoryLoader
        2. Splits documents using RecursiveCharacterTextSplitter
        3. Preserves metadata (source filename) for each chunk

        Args:
            data_dir (Path): Directory containing PDF files to index.
                All PDFs in directory and subdirectories will be processed.

        Returns:
            List[Document]: List of LangChain Document objects, each containing:
                - page_content (str): Actual text chunk
                - metadata (dict): Source filename and page number

        Notes:
            - Errors in individual PDFs are silently skipped (silent_errors=True)
            - Uses "layout" extraction mode for better structure preservation
            - Metadata includes source filename for citation purposes
        """
        loader = PyPDFDirectoryLoader(
            path=data_dir, mode="single", silent_errors=True, extraction_mode="layout"
        )

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunking_cfg.chunk_size, chunk_overlap=self.chunking_cfg.chunk_overlap
        )

        documents = loader.load()
        print(f"Loaded {len(documents)} documents")
        return splitter.split_documents(documents)

    def _index_chunks(self, data_dir: Path):
        """
        Load chunks from PDFs and index them into Qdrant using batch processing.

        Complete indexing pipeline:
        1. Creates chunks from PDF files in data_dir
        2. Embeds chunks using BGE-m3 in batches
        3. Creates Qdrant points with embeddings and metadata
        4. Uploads points to Qdrant in batches
        5. Clears GPU memory after completion

        Batch Processing Strategy:
            - Large batches (embedder.batch_size * uploading_batch_size)
            - Prevents OOM errors during indexing

        Args:
            data_dir (Path): Directory containing PDF files to index.
                Must contain at least one PDF file.

        Returns:
            None

        Notes:
            - Chunking happens in __create_chunks() (private)
            - Embeddings are computed once and reused
            - Each point gets sequential ID from 0 to num_chunks
            - Metadata stored: text content and source filename
        """
        chunks = self.__create_chunks(data_dir)
        print(f"Indexing {len(chunks)} chunks into Qdrant...")

        texts = [self.sanitize_text(chunk.page_content) for chunk in chunks]

        batch_size = self.embedder.batch_size * self.uploading_batch_size
        num_batches = len(texts) // batch_size + 1
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, len(texts))

            try:
                embeddings = self.embedder.encode(
                    sentences=texts[start_idx:end_idx], max_length=self.max_length
                )["dense_vecs"]
            except TypeError as e:
                print(f"Error via embed chunks: {e}")

            points = []
            for i in range(len(embeddings)):  # type: ignore
                points.append(
                    qdrant_models.PointStruct(
                        id=start_idx + i,
                        vector=embeddings[i],  # type: ignore
                        payload={
                            "text": chunks[start_idx + i].page_content,
                            "source": chunks[start_idx + i].metadata["source"],
                        },
                    )
                )

            self.qdrant_client.upload_points(
                collection_name=self.qdrant_cfg.collection_name,
                batch_size=256,
                points=points,
                wait=True,
            )

            print(f"Indexed {end_idx} chunks")

        torch.mps.empty_cache()
        print("Indexed all chunks into Qdrant")

    def setup_qdrant(self, data_dir: Path):
        """
        Complete setup pipeline: create collection and index all documents.

        High-level method that orchestrates the entire setup process:
        1. Creates Qdrant collection with configured parameters
        2. Loads and chunks documents from directory
        3. Embeds chunks and uploads to Qdrant
        4. Verifies setup is complete

        Args:
            data_dir (Path): Directory containing PDF documents to index.
                Should contain clinical guideline PDFs or similar documents.
                Example: Path("documents/clinical_guidelines/")

        Returns:
            None

        Example:
            >>> manager = QdrantManager(config, embedder)
            >>> manager.setup_qdrant(Path("clinical_docs/"))

        Notes:
            - This is typically called once during RAG system initialization
            - Subsequent calls with rebuild_collection=False will skip creation

        Related Methods:
            - _create_collection(): Creates Qdrant collection
            - _index_chunks(): Embeds and indexes documents
            - retrieve_documents(): Query indexed documents
        """
        if self.qdrant_cfg.rebuild_collection:
            self._create_collection()
            self._index_chunks(data_dir)
        print("Qdrant database is ready for working with RAG")

    def retrieve_documents(self, query: str) -> list[Document]:
        """
        Search for relevant documents matching the query using semantic similarity.

        Retrieval Pipeline:
        1. Embeds the query using BGE-m3
        2. Performs vector similarity search in Qdrant
        3. Retrieves top-k most similar documents with metadata
        4. Converts results to LangChain Document objects

        Args:
            query (str): Natural language question or search query.
                Can be in any language supported by BGE-m3 (111 languages).

        Returns:
            List[Document]: List of most relevant documents (up to top_k).
                Each Document contains:
                - page_content (str): Text chunk from indexed documents
                - metadata (dict): Source filename and similarity score
                Sorted by relevance (highest score first)

        Example:
            >>> query = "What is the recommended treatment for hypertension?"
            >>> docs = manager.retrieve_documents(query)
            >>> for doc in docs:
            >>>     print(f"Score: {doc.metadata['score']:.4f}")
            >>>     print(f"Source: {doc.metadata['source']}")
            >>>     print(f"Content: {doc.page_content[:100]}...")

        Notes:
            - Results limited to config.retrieval.top_k (default: 5)
            - Scores stored in metadata for ranking/filtering
        """
        query_vec = self.embedder.encode_queries(query)["dense_vecs"]
        search_results = self.qdrant_client.query_points(
            collection_name="clinical_recs",
            query=query_vec,  # type: ignore
            with_payload=["text", "source"],
            limit=self.retrieval_cfg.top_k,
        )

        # Convert to LangChain Documents and return
        return [
            Document(
                page_content=res.payload["text"],  # type: ignore
                metadata={
                    "source": res.payload["source"],  # type: ignore
                    "score": res.score,
                },
            )
            for res in search_results.points
        ]

In [None]:
class RAGState(TypedDict):
    """State for LangGraph"""

    query: str
    retrieved_docs: list
    context: str
    answer: str


class RAGPipeline:
    """
    End-to-end Retrieval-Augmented Generation (RAG) pipeline using LangGraph.

    This class orchestrates a complete RAG system that:
    1. Retrieves relevant documents from Qdrant based on user query
    2. Prepares context by combining retrieved documents
    3. Generates an answer using LLM with retrieved context
    4. Returns final answer with document citations

    Pipeline Flow:
        Query
          ↓
        [RETRIEVE] → Search Qdrant for relevant documents
          ↓
        [CONTEXT] → Combine retrieved docs into context
          ↓
        [GENERATE] → Generate answer using LLM + context
          ↓
        Answer (with document citations)

    Attributes:
        llm_pipe (HuggingFacePipeline): Language model pipeline for answer generation.
            Typically Qwen 2.5 3B Instruct or similar.
        qdrant_manager (QdrantManager): Qdrant vector database manager for document retrieval.
        _grapg (StateGraph): already compiled graph instance

    Notes:
        - Graph is rebuilt on each run (can be optimized)
        - Answer generation includes system prompt in Russian
        - Document citations are included in output
    """

    def __init__(self, llm_pipe: HuggingFacePipeline, qdrant_manager: QdrantManager):
        """
        Initialize RAG pipeline with LLM and vector database.

        Creates an instance of RAGPipeline with required components.
        Does NOT build the LangGraph immediately (lazy initialization).

        Args:
            llm_pipe (HuggingFacePipeline): Initialized language model pipeline.
                Should be text-generation model capable of following instructions.
                Must support:
                    - .invoke(prompt: str) → str
                    - Instruction-following capability
            qdrant_manager (QdrantManager): Initialized Qdrant manager with indexed documents.
                Must have:
                    - setup_qdrant() already called
                    - Collection populated with document embeddings
                    - BGE-m3 embedder configured
                    - Ability to retrieve_documents(query: str)

        Notes:
            - Both components must be pre-initialized
            - Graph is built dynamically on first run() call
        """
        self.llm_pipe = llm_pipe
        self.qdrant_manager = qdrant_manager
        self._graph = None

    def retrieve_node(self, state: RAGState) -> RAGState:
        """
        Retrieve relevant documents from Qdrant based on query.

        LangGraph node that performs semantic search:
        1. Extracts query from state
        2. Searches Qdrant for relevant documents
        3. Logs retrieval results
        4. Stores retrieved documents in state

        Args:
            state (RAGState): Current pipeline state containing:
                - query (str): User's question or search query
                - retrieved_docs (list): Previously retrieved docs (empty on first call)
                - context (str): Prepared context (empty on first call)
                - answer (str): Generated answer (empty on first call)

        Returns:
            RAGState: Updated state with:
                - retrieved_docs: List of Document objects from Qdrant

        Notes:
            - Retrieval uses cosine similarity (BGE-m3)
            - Number of docs returned: config.retrieval.top_k (default: 5)
        """
        print(f"[RETRIEVE] Query: {state['query']}")

        docs = self.qdrant_manager.retrieve_documents(state["query"])

        print(f"Retrieved {len(docs)} documents")
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get("source", "unknown")
            print(f"  {i}. {source}")

        state["retrieved_docs"] = docs
        return state

    def context_node(self, state: RAGState) -> RAGState:
        """
        Prepare context by combining retrieved documents.

        LangGraph node that processes retrieved documents:
        1. Iterates through retrieved documents
        2. Formats each document with source information
        3. Combines all documents into single context string
        4. Stores context in state for LLM

        Args:
            state (RAGState): Pipeline state containing:
                - query: Unchanged from user
                - retrieved_docs: Documents from retrieve_node
                - context: Will be overwritten
                - answer: Unchanged (not used yet)

        Returns:
            RAGState: Updated state with:
                - context: Formatted string combining all documents

        Notes:
            - Documents separated by "---" delimiter
            - Source filename included in document header
            - Original formatting preserved from documents
        """
        print(f"\n[CONTEXT] Preparing context from {len(state['retrieved_docs'])} documents")

        # Объединяем документы в контекст
        context_parts = []
        for i, doc in enumerate(state["retrieved_docs"], 1):
            context_parts.append(f"[Document {i} - {doc.metadata['source']}]\n{doc.page_content}")

        context = "\n\n---\n\n".join(context_parts)
        state["context"] = context

        print(f"Context prepared ({len(context)} chars)")

        return state

    def generate_node(self, state: RAGState) -> RAGState:
        """
        Generate answer using LLM with retrieved context.

        LangGraph node that performs answer generation:
        1. Creates system prompt (instructions for medical assistant)
        2. Constructs full prompt with context and question
        3. Calls LLM to generate answer
        4. Stores answer in state


        Args:
            state (RAGState): Pipeline state containing:
                - query: User's original question
                - context: Prepared context from context_node
                - retrieved_docs: Available for reference
                - answer: Will be overwritten with LLM output

        Returns:
            RAGState: Updated state with:
                - answer: Generated response from LLM

        Notes:
            - Generation language: Russian (system prompt)
        """
        print("[GENERATE] Generating answer...")

        system_message = """Вы — отзывчивый медицинский ассистент.
        На основании предоставленных медицинских документов ответьте на вопрос пользователя точно и
        кратко.
        Если информация отсутствует в документах, четко укажите это.
        Приведите ссылки на документы."""

        prompt = f"""{system_message}
        Documents:
        {state["context"]}
        Question: {state["query"]}
        Answer:"""

        state["answer"] = self.llm_pipe.invoke(prompt)

        print(f"Generated {len(state['answer'].split())} words")

        return state

    def _build_graph(self):
        """
        Build LangGraph workflow with nodes and edges.

        Constructs the computation graph that defines the RAG pipeline:
        1. Creates StateGraph with RAGState schema
        2. Adds three processing nodes (retrieve, context, generate)
        3. Defines edges connecting nodes
        4. Compiles graph for execution

        Graph Structure:
            START
              ↓
            [retrieve]  - Search documents
              ↓
            [context]   - Prepare context
              ↓
            [generate]  - Generate answer
              ↓
             END

        Args:
            None

        Returns:
            CompiledGraph: Compiled LangGraph ready for execution.
                - Immutable workflow definition
                - Callable with graph.invoke(state)

        Nodes:
            1. retrieve: retrieve_node(state) → state
               - Queries Qdrant for relevant documents
               - Populates: state["retrieved_docs"]
            2. context: context_node(state) → state
               - Combines retrieved documents
               - Populates: state["context"]
            3. generate: generate_node(state) → state
               - Generates answer with LLM
               - Populates: state["answer"]

        Edges:
            - START → retrieve: Entry point
            - retrieve → context: Sequential flow
            - context → generate: Sequential flow
            - generate → END: Exit point
        """
        print("\n[GRAPH] Building LangGraph...")

        workflow = StateGraph(RAGState)

        workflow.add_node("retrieve", self.retrieve_node)
        workflow.add_node("context", self.context_node)
        workflow.add_node("generate", self.generate_node)

        workflow.add_edge(START, "retrieve")
        workflow.add_edge("retrieve", "context")
        workflow.add_edge("context", "generate")
        workflow.add_edge("generate", END)

        graph = workflow.compile()

        print("✓ Graph built successfully")

        return graph

    def run(self, query: str):
        """
        Execute complete RAG pipeline for a given query.

        High-level method that orchestrates the entire RAG process:
        1. Displays user query
        2. Builds LangGraph workflow
        3. Creates initial state with query
        4. Executes pipeline (retrieve → context → generate)
        5. Returns final state with answer

        Args:
            query (str): User's question or search query.


        Returns:
            Dict[str, Any]: Final pipeline state containing:
                - query (str): Original user question
                - retrieved_docs (List[Document]): Retrieved documents
                    - Each Document has page_content and metadata
                    - Metadata includes: source, score
                - context (str): Formatted context passed to LLM
                - answer (str): Generated answer from LLM

        Example:
            >>> pipeline = RAGPipeline(llm, qdrant_manager)
            >>>
            >>> result = pipeline.run("What is hypertension?")
            >>>
            >>> print(f"Question: {result['query']}")
            >>> print(
            ...     f"Retrieved {len(result['retrieved_docs'])} documents"
            ... )
            >>> print(f"Answer: {result['answer']}")
        """
        if self._graph is None:
            self._graph = self._build_graph()
        graph = self._graph

        print(f"QUERY: {query}")

        graph = self._build_graph()

        initial_state = RAGState(query=query, retrieved_docs=[], context="", answer="")

        return graph.invoke(initial_state)

In [7]:
baseline_config = RAGConfig()

embedder = get_embedder(baseline_config)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 215830.39it/s]


In [8]:
qdrant_manager = QdrantManager(config=baseline_config, embedder=embedder)
qdrant_manager.setup_qdrant(data_dir=Path("../data"))

  self.qdrant_client = QdrantClient(


✓ Qdrant is healthy. Collections: 2
Deleted existing collection 'clinical_recs'
✓ Created collection 'clinical_recs'


Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 189 0 (offset 0)
Ignoring wrong pointing object 755 0 (offset 0)
Ignoring wrong pointing object 781 0 (offset 0)
Ignoring wrong pointing object 786 0 (offset 0)
Ignoring wrong pointing object 788 0 (offset 0)
Ignoring wrong pointing object 794 0 (offset 0)
Ignoring wrong pointing object 797 0 (offset 0)
Ignoring wrong pointing object 800 0 (offset 0)
Ignoring wrong pointing object 803 0 (offset 0)
Ignoring wrong pointing object 806 0 (offset 0)
Ignoring wrong pointing object 832 0 (offset 0)
Ignoring wrong pointing object 835 0 (offset 0)
Ignoring wrong pointing object 838 0 (offset 0)
Ignoring wrong pointing object 840 0 (offset 0)
Ignoring wrong pointing object 844 0 (offset 0)
Ignoring wrong pointing object 847 0 (offset 0)
Ignoring wrong pointing object 850 0 (offset 0)
Ignoring wrong pointing object 853 0 (offset 0)
Ignoring wrong pointing object 856 0 (offset 0)
Ignoring wrong pointing object 859 0 (off

Loaded 686 documents
Indexing 182981 chunks into Qdrant...


pre tokenize: 100%|██████████| 250/250 [00:02<00:00, 121.61it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 250/250 [09:02<00:00,  2.17s/it]


Indexed 32000 chunks


pre tokenize: 100%|██████████| 250/250 [00:02<00:00, 123.65it/s]
Inference Embeddings: 100%|██████████| 250/250 [08:53<00:00,  2.13s/it]


Indexed 64000 chunks


pre tokenize: 100%|██████████| 250/250 [00:01<00:00, 126.00it/s]
Inference Embeddings: 100%|██████████| 250/250 [08:29<00:00,  2.04s/it]
  self.qdrant_client.upload_points(
  self.qdrant_client.upload_points(
  self.qdrant_client.upload_points(


PydanticSerializationError: Error serializing to JSON: UnicodeEncodeError: 'utf-8' codec can't encode character '\ud835' in position 339: surrogates not allowed

In [13]:
torch.mps.empty_cache()
llm_pipe = get_llm_pipeline(baseline_config)

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.89s/it]
Device set to use mps


In [14]:
rag_pipeline = RAGPipeline(llm_pipe, qdrant_manager)

test_queries = [
    "Какие антигипертензивные препараты рекомендуются в качестве первой линии "
    "для пациента с впервые диагностированной артериальной гипертензией II степени "
    "и отсутствием сопутствующих заболеваний?",
    "Какие диагностические критерии используются для подтверждения диагноза "
    "сахарного диабета 2 типа? Какие значения глюкозы натощак и HbA1c считаются "
    "диагностическими?",
    "Пациент 65 лет с внебольничной пневмонией, без серьёзных сопутствующих "
    "заболеваний, поступил в амбулаторное отделение. Какие антибиотики "
    "рекомендуются для начальной терапии?",
    "Пациент поступил с подозрением на острый инфаркт миокарда (ОИМ) с подъёмом "
    "сегмента ST. Какие экстренные меры и препараты должны быть применены в "
    "первые 10 минут?",
    "Пациент 50 лет жалуется на изжогу 3-4 раза в неделю. Какие "
    "немедикаментозные меры и препараты первой линии рекомендуются для "
    "лечения ГЭРБ?",
    "У пациента диагностировано умеренное депрессивное расстройство. "
    "Какие препараты являются препаратами первой линии? Какова ожидаемая "
    "длительность лечения и критерии эффективности?",
    "Пациент с ХОБЛ III стадии поступил с признаками обострения "
    "(увеличение одышки, мокроты, изменение цвета мокроты). Какова "
    "схема лечения обострения?",
    "Пациентка 45 лет с симптомами усталости, набора веса и сухости кожи. "
    "Какие диагностические тесты необходимо провести для подтверждения "
    "гипотиреоза? Как начинать заместительную терапию?",
    "Пациент поступил с острым ишемическим инсультом. Время от начала "
    "симптомов - 2 часа. Какова должна быть схема лечения? Какие препараты "
    "и процедуры показаны?",
    "Пациентка с острым циститом (дизурия, полиурия, боль внизу живота) "
    "без системных симптомов и без беременности. Какова схема лечения "
    "неосложнённого цистита?",
]

# Запускаем RAG для каждого query
for query in test_queries:
    result = rag_pipeline.run(query)

    print(f"QUERY: {result['query']}")
    print()
    print(f"\nRETRIEVED DOCUMENTS ({len(result['retrieved_docs'])}):")
    for i, doc in enumerate(result["retrieved_docs"], 1):
        print(f"\n{i}. {doc.metadata['source']} (score: {doc.metadata['score']:.4f})")
        print(f"   {doc.page_content[:200]}...")
        print()

    print("ANSWER:")
    print(result["answer"])
    print("\n" + "=" * 60 + "\n")
    torch.mps.empty_cache()


[GRAPH] Building LangGraph...
✓ Graph built successfully
QUERY: Какие антигипертензивные препараты рекомендуются в качестве первой линии для пациента с впервые диагностированной артериальной гипертензией II степени и отсутствием сопутствующих заболеваний?

[GRAPH] Building LangGraph...
✓ Graph built successfully
[RETRIEVE] Query: Какие антигипертензивные препараты рекомендуются в качестве первой линии для пациента с впервые диагностированной артериальной гипертензией II степени и отсутствием сопутствующих заболеваний?
Retrieved 5 documents
  1. ../data/КР983.pdf
  2. ../data/КР881.pdf
  3. ../data/КР840.pdf
  4. ../data/КР62.pdf
  5. ../data/КР758.pdf

[CONTEXT] Preparing context from 5 documents
Context prepared (3993 chars)
[GENERATE] Generating answer...
Generated 624 words
QUERY: Какие антигипертензивные препараты рекомендуются в качестве первой линии для пациента с впервые диагностированной артериальной гипертензией II степени и отсутствием сопутствующих заболеваний?


RETRIEVED 