In [None]:
# LlamaCloud client library; the import cell below takes LlamaParse from it.
%pip install llama-cloud-services

In [None]:
# LlamaIndex core plus the embedding and LanceDB vector-store integrations
# that the import cell below relies on.
%pip install llama-index
%pip install llama-index-embeddings-huggingface
%pip install lancedb
%pip install llama-index-vector-stores-lancedb
%pip install llama-index-embeddings-gemini
# transformers is imported below together with BitsAndBytesConfig; accelerate and
# bitsandbytes are presumably needed for quantized model loading - TODO confirm.
%pip install -U transformers accelerate bitsandbytes
# NOTE(review): none of these are imported in the visible part of the notebook;
# presumably later cells use them - confirm, otherwise drop.
%pip install pymupdf rank_bm25 nltk seaborn wordcloud

In [None]:
import os
import json
import torch
import lancedb

from typing import List, Optional, Tuple, Dict
from abc import ABC, abstractmethod
from google import genai
from google.colab import userdata
from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.utils import ResultType
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.schema import BaseNode, TextNode
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceWindowNodeParser
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.gemini import GeminiEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [None]:
# Source PDFs, cached Markdown transcriptions, and cached chunk files.
PDF_DIR = "ad-papers-pdf"
MD_DIR = "ad-papers-md"
CHUNKS_ROOT_DIR = "ad-papers-chunked"

# Only the two cache directories are created here; PDF_DIR is not.
for _cache_dir in (MD_DIR, CHUNKS_ROOT_DIR):
    os.makedirs(_cache_dir, exist_ok=True)

In [None]:
class SecretManager:
    """Loads the API keys used by the notebook and exposes them on demand.

    Keys are read from Colab ``userdata`` first and from the process
    environment as a fallback. A missing key does not fail construction;
    the error is raised only by the getter of the key that is actually
    requested, so a notebook that uses only Gemini does not need a
    LlamaCloud key.
    """

    def __init__(self):
        self.google_api_key = self._read_secret("GOOGLE_API_KEY")
        self.llama_cloud_key = self._read_secret("LLAMA_CLOUD_API_KEY")

        # Mirror the keys into the environment for libraries that read them there.
        if self.google_api_key:
            os.environ["GOOGLE_API_KEY"] = self.google_api_key
        if self.llama_cloud_key:
            os.environ["LLAMA_CLOUD_API_KEY"] = self.llama_cloud_key

    @staticmethod
    def _read_secret(name: str) -> Optional[str]:
        """Return the secret ``name``, or ``None`` when it is not available.

        ``userdata.get`` raises instead of returning ``None`` when a secret is
        missing or the notebook has no access to it. Those errors are turned
        into an environment-variable lookup so the getters below can report
        the problem.
        """
        try:
            return userdata.get(name)
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
            return os.environ.get(name)

    def get_google_key(self):
        """Return the Google API key.

        Raises:
            ValueError: if no Google API key could be loaded.
        """
        if not self.google_api_key:
            raise ValueError("Google API Key not found in userdata.")
        return self.google_api_key

    def get_llama_key(self):
        """Return the LlamaCloud API key.

        Raises:
            ValueError: if no LlamaCloud API key could be loaded.
        """
        if not self.llama_cloud_key:
            raise ValueError("LlamaCloud API Key not found in userdata.")
        return self.llama_cloud_key

## Preprocessing Pipeline

In [None]:
class BaseParser(ABC):
    """Interface for components that turn one source file into Documents."""

    @abstractmethod
    def parse(self, file_path: str, metadata: dict = None) -> List[Document]:
        """Parse ``file_path`` and return the resulting documents.

        Args:
            file_path: Path of the file to parse.
            metadata: Optional metadata to attach to the produced documents.
        """


class GeminiParser(BaseParser):
    """Transcribes a PDF to Markdown with Gemini and caches the result in MD_DIR.

    The cache file is named after the input file (``<stem>.md``); when a
    non-empty cache file exists, no API call is made.
    """

    def __init__(self, secret_manager: SecretManager, model_name: str = "models/gemini-pro-latest"):
        self.client = genai.Client(api_key=secret_manager.get_google_key())
        self.model_name = model_name
        self.prompt = """
        The provided document is a scientific research paper.
        Your goal is to extract ALL text, tables, and formulas into Markdown format.
        1. Transcribe text STRICTLY VERBATIM. Do not summarize, shorten, or rephrase.
        2. Do not skip any sections, subsections, or paragraphs, even if they look dense.
        3. Maintain the reading order of the paragraphs and columns.
        4. Don't include the figures/images. Instead, provide a description of the content of the figure, along with the caption.
        5. Exclude the headers and footers of the pages.
        """

    def parse(self, file_path: str, metadata: dict = None) -> List[Document]:
        """Return a single Document holding the Markdown transcription of ``file_path``.

        Args:
            file_path: Path of the PDF to transcribe.
            metadata: Optional citation metadata. It is copied, never modified;
                the copy additionally receives a ``"file_path"`` entry.

        Raises:
            ValueError: if Gemini returns no text for the document.
        """
        # splitext replaces only the real extension, unlike str.replace, which
        # would also rewrite a ".pdf" occurring in the middle of the name.
        base_name = os.path.splitext(os.path.basename(file_path))[0] + ".md"
        cache_path = os.path.join(MD_DIR, base_name)

        # An empty cache file is a leftover of a failed run, not a valid result.
        if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
            print(f"Loading cached markdown for: {base_name}")
            with open(cache_path, "r", encoding="utf-8") as f:
                text_content = f.read()
        else:
            print(f"Parsing with Gemini (API Call): {file_path}...")
            file_ref = self.client.files.upload(file=file_path)
            try:
                response = self.client.models.generate_content(
                    model=self.model_name,
                    contents=[file_ref, self.prompt]
                )
                text_content = response.text
            finally:
                # Always remove the uploaded file, even when generation fails.
                self.client.files.delete(name=file_ref.name)

            # Validate before opening the cache file so a failed call cannot
            # leave an empty file that later runs would treat as a cache hit.
            if not text_content:
                raise ValueError(f"Gemini returned no text for {file_path}; nothing was cached.")

            with open(cache_path, "w", encoding="utf-8") as f:
                f.write(text_content)

        # Copy so the caller's dict is not mutated across repeated runs.
        doc_metadata = dict(metadata or {})
        doc_metadata["file_path"] = file_path

        return [Document(text=text_content, metadata=doc_metadata)]

In [None]:
class BaseChunker(ABC):
    """Interface for chunking strategies, with a shared on-disk chunk cache."""

    @abstractmethod
    def chunk(self, documents: List[Document]) -> List[BaseNode]:
        """Split ``documents`` into nodes."""

    @abstractmethod
    def get_strategy_name(self) -> str:
        """Return a name identifying the strategy; used as cache sub-directory."""

    def _chunk_with_cache(self, documents: List[Document], splitter) -> List[BaseNode]:
        """Chunk ``documents`` with ``splitter``, caching the nodes per source file.

        Nodes are cached as JSON under ``CHUNKS_ROOT_DIR/<strategy>/<stem>.json``,
        where the stem comes from the document's ``"file_path"`` metadata.
        Documents without that metadata are chunked every time. The cache key
        is the file name only, so a changed document text is not detected.

        All documents sharing a ``file_path`` are chunked and cached together
        (at the position of the first one). Caching them one by one would make
        the second document of a file load the first one's chunks.
        """
        strategy_name = self.get_strategy_name()
        strategy_dir = os.path.join(CHUNKS_ROOT_DIR, strategy_name)
        os.makedirs(strategy_dir, exist_ok=True)

        # Group by source file while keeping first-seen order.
        batches: List[Tuple[Optional[str], List[Document]]] = []
        docs_by_path: Dict[str, List[Document]] = {}
        for doc in documents:
            file_path = doc.metadata.get("file_path")
            if not file_path:
                batches.append((None, [doc]))
            elif file_path in docs_by_path:
                docs_by_path[file_path].append(doc)
            else:
                docs_by_path[file_path] = [doc]
                batches.append((file_path, docs_by_path[file_path]))

        all_nodes = []

        for file_path, docs in batches:
            if file_path is None:
                all_nodes.extend(splitter.get_nodes_from_documents(docs))
                continue

            base_name = os.path.splitext(os.path.basename(file_path))[0] + ".json"
            cache_path = os.path.join(strategy_dir, base_name)

            if os.path.exists(cache_path):
                print(f"Loading cached chunks for {base_name} ({strategy_name})...")
                with open(cache_path, "r", encoding="utf-8") as f:
                    nodes_data = json.load(f)
                all_nodes.extend([TextNode.from_dict(n) for n in nodes_data])
            else:
                print(f"Computing chunks for {base_name} ({strategy_name})...")
                nodes = splitter.get_nodes_from_documents(docs)

                # Write to a temporary file first so an interrupted run cannot
                # leave a truncated JSON file behind as a "valid" cache entry.
                tmp_path = cache_path + ".tmp"
                with open(tmp_path, "w", encoding="utf-8") as f:
                    json.dump([n.to_dict() for n in nodes], f)
                os.replace(tmp_path, cache_path)

                all_nodes.extend(nodes)

        return all_nodes


class SemanticChunker(BaseChunker):
    """Chunker built on SemanticSplitterNodeParser with a selectable embedding model."""

    def __init__(self,
                 secret_manager: SecretManager,
                 embed_model_type: str = "huggingface",
                 model_name: str = "BAAI/bge-m3",
                 breakpoint_percentile: int = 80,
                 device: str = "cpu"):
        """Create the embedding model and the semantic splitter.

        Args:
            secret_manager: Source of the Google API key (used for "gemini" only).
            embed_model_type: Either "huggingface" or "gemini".
            model_name: Name of the embedding model.
            breakpoint_percentile: Passed to the splitter as
                ``breakpoint_percentile_threshold``.
            device: Device for the HuggingFace model (not used for "gemini").

        Raises:
            ValueError: if ``embed_model_type`` is neither supported value.
        """
        self.percentile = breakpoint_percentile
        self.model_name = model_name

        if embed_model_type == "gemini":
            self.embed_model = GeminiEmbedding(
                model_name=model_name,
                api_key=secret_manager.get_google_key(),
            )
        elif embed_model_type == "huggingface":
            self.embed_model = HuggingFaceEmbedding(
                model_name=model_name,
                trust_remote_code=True,
                device=device,
            )
        else:
            raise ValueError(f"Unknown embedding type: {embed_model_type}")

        self.splitter = SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=breakpoint_percentile,
            embed_model=self.embed_model,
        )

    def get_strategy_name(self) -> str:
        """Return ``semantic_<percentile>_<model name with "/" and "-" as "_">``."""
        sanitized = self.model_name.translate(str.maketrans("/-", "__"))
        return f"semantic_{self.percentile}_{sanitized}"

    def chunk(self, documents: List[Document]) -> List[BaseNode]:
        """Chunk ``documents`` through the shared cache using the semantic splitter."""
        strategy = self.get_strategy_name()
        print(f"Chunking with SemanticSplitter ({strategy})...")
        return self._chunk_with_cache(documents, self.splitter)


class WindowChunker(BaseChunker):
    """Chunker built on SentenceWindowNodeParser.

    The surrounding window is stored under the "window" metadata key and the
    original text under "original_text".
    """

    def __init__(self, window_size: int = 5):
        self.window_size = window_size
        self.splitter = SentenceWindowNodeParser(
            window_size=window_size,
            window_metadata_key="window",
            original_text_metadata_key="original_text",
        )

    def get_strategy_name(self) -> str:
        """Return ``window_<window_size>``."""
        return "window_" + str(self.window_size)

    def chunk(self, documents: List[Document]) -> List[BaseNode]:
        """Chunk ``documents`` through the shared cache using the window splitter."""
        size = self.window_size
        print(f"Chunking with WindowSplitter (size={size})...")
        return self._chunk_with_cache(documents, self.splitter)

In [None]:
class LanceDBManager:
    """Owns a LanceDB connection and the embedding model used to index into it."""

    def __init__(self,
                 secret_manager: SecretManager,
                 db_uri: str = "./lancedb_data",
                 embed_model_type: str = "huggingface",
                 model_name: str = "BAAI/bge-m3",
                 device: str = "cpu"):
        """Connect to the database and build the embedding model.

        Args:
            secret_manager: Source of the Google API key (used for "gemini" only).
            db_uri: LanceDB location.
            embed_model_type: Either "huggingface" or "gemini".
            model_name: Name of the embedding model; also part of table names.
            device: Device for the HuggingFace model (not used for "gemini").

        Raises:
            ValueError: if ``embed_model_type`` is neither supported value.
        """
        self.db_uri = db_uri
        self.db = lancedb.connect(db_uri)

        if embed_model_type == "huggingface":
            self.embed_model = HuggingFaceEmbedding(
                model_name=model_name,
                trust_remote_code=True,
                device=device
            )
        elif embed_model_type == "gemini":
            self.embed_model = GeminiEmbedding(
                model_name=model_name,
                api_key=secret_manager.get_google_key()
            )
        else:
            # Fail here, as SemanticChunker does, instead of leaving
            # ``embed_model`` undefined until the first indexing call.
            raise ValueError(f"Unknown embedding type: {embed_model_type}")

        self.model_name_clean = model_name.replace("/", "_").replace("-", "_")

    def store_data(self, nodes: List[BaseNode], chunking_strategy_name: str) -> VectorStoreIndex:
        """Return the index for the table ``<strategy>_embed_<model>``.

        If the table already exists it is loaded and ``nodes`` is ignored;
        otherwise the table is created and ``nodes`` are embedded into it.
        """
        table_name = f"{chunking_strategy_name}_embed_{self.model_name_clean}"
        print(f"--- Accessing Table: {table_name} ---")

        existing_tables = self.db.list_tables().tables

        if table_name in existing_tables:
            print("Table exists. Loading index...")
            vector_store = LanceDBVectorStore(uri=self.db_uri, table_name=table_name)
            index = VectorStoreIndex.from_vector_store(
                vector_store=vector_store,
                embed_model=self.embed_model
            )
        else:
            print("Table not found. Creating and Indexing (this takes time)...")
            vector_store = LanceDBVectorStore(uri=self.db_uri, table_name=table_name, mode="overwrite")
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            index = VectorStoreIndex(
                nodes,
                storage_context=storage_context,
                embed_model=self.embed_model
            )
            print("Indexing complete.")

        return index

    def get_retriever(self, index: VectorStoreIndex, similarity_top_k: int = 5):
        """Return a retriever over ``index`` yielding ``similarity_top_k`` results."""
        return index.as_retriever(similarity_top_k=similarity_top_k)

In [None]:
class PreprocessingPipeline:
    """Runs parse -> chunk -> index for a list of files."""

    def __init__(self,
                 parser: BaseParser,
                 chunker: BaseChunker,
                 db_manager: LanceDBManager):
        self.parser = parser
        self.chunker = chunker
        self.db_manager = db_manager

    def run(self, file_paths: List[str], citation_metadata: List[dict]):
        """Parse every file, chunk the documents, and store them in the database.

        Args:
            file_paths: Files to process.
            citation_metadata: One metadata dict per file, in the same order.

        Returns:
            The index returned by ``db_manager.store_data``.

        Raises:
            ValueError: if the two lists differ in length.
        """
        file_paths = list(file_paths)
        citation_metadata = list(citation_metadata)
        # zip() would silently drop the unmatched tail, leaving files out of
        # the index without any message.
        if len(file_paths) != len(citation_metadata):
            raise ValueError(
                f"Got {len(file_paths)} file paths but {len(citation_metadata)} "
                "metadata entries; they must match one to one."
            )

        all_documents = []

        for path, meta in zip(file_paths, citation_metadata):
            print(f"Processing: {path}...")
            docs = self.parser.parse(path, metadata=meta)
            all_documents.extend(docs)

        nodes = self.chunker.chunk(all_documents)

        strategy_name = self.chunker.get_strategy_name()
        index = self.db_manager.store_data(nodes, strategy_name)

        return index

## Query Pipeline

In [None]:
class QueryRephraser:
    """Rewrites a user question into a form better suited for vector search."""

    def __init__(self, secret_manager: SecretManager, model_name: str = "models/gemini-pro-latest"):
        self.client = genai.Client(api_key=secret_manager.get_google_key())
        self.model_name = model_name

    def rephrase(self, query: str) -> str:
        """Return the rephrased query, or ``query`` itself if the model returns no text."""
        prompt = f"""
        You are an AI research assistant. The user is asking a question about "Isolation Forests" or anomaly detection.
        Rephrase the following question to be more specific and optimized for a vector search engine.
        - Keep the core intent.
        - Expand technical acronyms (e.g., "IF" -> "Isolation Forest").
        - If the query is a simple keyword, turn it into a full sentence.

        Original Query: {query}
        Rephrased Query:
        """
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=prompt
        )
        # ``response.text`` can be None or empty when the model returns no text;
        # searching with the original query is better than crashing on
        # ``.strip()`` or searching with an empty string.
        new_query = (response.text or "").strip()
        if not new_query:
            print(f"Rephrasing returned no text; keeping original query: '{query}'")
            return query
        print(f"Rephrased: '{query}' -> '{new_query}'")
        return new_query

In [None]:
class RetrieverModule:
    """Retriever bound to one existing LanceDB table.

    The table name is ``<chunking_strategy>_embed_<embed_model_name>`` with
    "/" and "-" in the model name replaced by "_".
    """

    def __init__(self,
                 db_manager: LanceDBManager,
                 chunking_strategy: str,
                 embed_model_name: str):
        self.db_manager = db_manager

        sanitized_model = embed_model_name.translate(str.maketrans("/-", "__"))
        self.table_name = "_embed_".join((chunking_strategy, sanitized_model))

        print(f"Connecting Retriever to table: {self.table_name}")

        # Queries are embedded with the db_manager's embedding model.
        self.index = VectorStoreIndex.from_vector_store(
            vector_store=LanceDBVectorStore(
                uri=db_manager.db_uri,
                table_name=self.table_name,
            ),
            embed_model=db_manager.embed_model,
        )

    def retrieve(self, query: str, top_k: int = 5) -> List[BaseNode]:
        """Return the ``top_k`` most similar chunks for ``query``."""
        hits = self.index.as_retriever(similarity_top_k=top_k).retrieve(query)
        print(f"Retrieved {len(hits)} raw chunks.")
        return hits

In [None]:
class GeminiReranker:
    """Asks Gemini to pick the most relevant chunks among retrieved candidates."""

    def __init__(self, secret_manager: SecretManager, model_name: str = "models/gemini-pro-latest"):
        self.client = genai.Client(api_key=secret_manager.get_google_key())
        self.model_name = model_name

    def rerank(self, query: str, nodes: List[BaseNode], top_n: int = 3) -> List[BaseNode]:
        """Return at most ``top_n`` of ``nodes``, in the order chosen by the model.

        Each candidate is shown to the model with its position as ID; the
        "window" metadata is used as content when present, else the node text.
        The answer is parsed as a comma-separated list of IDs. IDs out of
        range are dropped, repeated IDs are kept once, and the result is
        capped at ``top_n`` even if the model lists more. If parsing fails,
        the first ``top_n`` nodes are returned unchanged.
        """
        if not nodes:
            return []

        candidates_text = ""
        for i, node in enumerate(nodes):
            content_text = node.metadata.get("window", node.text)
            candidates_text += f"ID: {i}\nContent: {content_text}...\n\n"

        prompt = f"""
        You are a relevance ranking system.
        Query: "{query}"

        Below are candidate text chunks retrieved for this query.
        Rank them by relevance to the query.
        Return ONLY the IDs of the top {top_n} most relevant chunks, separated by commas.
        If a chunk is completely irrelevant, exclude it.

        Candidates:
        {candidates_text}

        Result IDs:
        """

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=prompt
        )

        try:
            indices_str = response.text.strip().replace("Result IDs:", "")

            selected_indices = []
            for token in indices_str.split(","):
                token = token.strip()
                if not token.isdigit():
                    continue
                idx = int(token)
                # Skip out-of-range IDs and repeats so no chunk is fed to the
                # generator twice.
                if idx < len(nodes) and idx not in selected_indices:
                    selected_indices.append(idx)

            # Enforce the requested limit; the prompt alone does not guarantee it.
            reranked_nodes = [nodes[i] for i in selected_indices[:top_n]]
            print(f"Reranked: Kept {len(reranked_nodes)}/{len(nodes)} chunks.")
            return reranked_nodes

        except Exception as e:
            # Best effort: an unusable model answer must not abort the query.
            print(f"Reranking failed ({e}), returning original top {top_n}.")
            return nodes[:top_n]

In [None]:
class RetrievalEvaluator:
    """
    Decides, from the retrieved chunks, whether a query can be answered.

    Possible statuses:
    - ANSWERABLE: the context contains the answer.
    - NO_DATA: the query is in the domain (Isolation Forest), but the
      context lacks the specific details.
    - UNRELATED: the query is outside the domain (e.g. cooking, weather).
    """
    def __init__(self, secret_manager: SecretManager, model_name: str = "models/gemini-pro-latest"):
        self.client = genai.Client(api_key=secret_manager.get_google_key())
        self.model_name = model_name

    def evaluate(self, query: str, nodes: List[BaseNode]) -> Tuple[str, str]:
        """Classify ``query`` against ``nodes`` and return ``(status, reason)``.

        For each node the "window" metadata is used as context when present,
        otherwise the node text. The model is asked for JSON output, which is
        parsed with ``json.loads``.
        """
        context_text = "\n\n".join(
            n.metadata.get("window", n.text) for n in nodes
        )

        prompt = f"""
        You are the "Gatekeeper" for a Research Assistant about Anomaly Detection (Isolation Forests).
        Your job is to classify the relationship between the USER QUERY and the RETRIEVED CONTEXT.

        USER QUERY: {query}

        RETRIEVED CONTEXT:
        {context_text}

        Task: Analyze the inputs and output ONE of the following JSON strings:

        1. If the query is completely unrelated to Computer Science/Anomaly Detection (e.g. "How to cook pasta", "What is the weather"):
           {{"status": "UNRELATED", "reason": "The user is asking about [Topic] which is outside the scope of this research assistant."}}

        2. If the query IS related to the domain, but the Retrieved Context DOES NOT contain the answer:
           {{"status": "NO_DATA", "reason": "The query is relevant, but the provided papers do not discuss this specific detail."}}

        3. If the Retrieved Context contains the answer:
           {{"status": "ANSWERABLE", "reason": "Context contains sufficient information."}}

        OUTPUT JSON ONLY:
        """

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=prompt,
            config={'response_mime_type': 'application/json'}
        )

        verdict = json.loads(response.text)
        status = verdict['status']
        print(f"Evaluation Status: {status}")
        return status, verdict['reason']

In [None]:
class ResponseGenerator:
    """Generates the final, cited answer from the selected chunks."""

    def __init__(self, secret_manager: SecretManager, model_name: str = "models/gemini-pro-latest"):
        self.client = genai.Client(api_key=secret_manager.get_google_key())
        self.model_name = model_name

    def generate(self, query: str, nodes: List[BaseNode]) -> str:
        """Answer ``query`` using only the content of ``nodes``.

        Each node becomes one numbered source section headed by a
        ``[title, authors, year]`` tag built from its metadata, with
        placeholders for missing fields. The "window" metadata is used as
        section body when present, otherwise the node text.
        """
        sections = []
        for position, node in enumerate(nodes, start=1):
            meta = node.metadata
            citation_tag = "[{}, {}, {}]".format(
                meta.get('title', 'Unknown Title'),
                meta.get('authors', 'Unknown Authors'),
                meta.get('year', 'n.d.'),
            )
            body = meta.get("window", node.text)
            sections.append(f"--- Source {position} {citation_tag} ---\n{body}\n\n")
        context_str = "".join(sections)

        prompt = f"""
        You are a Research Assistant. Answer the question using ONLY the provided context.

        Rules:
        1. Cite your sources using the format [Title, Author, Year] provided in the header of each source.
        2. Do not hallucinate information not present in the text.
        3. If the context has multiple papers, synthesize them.

        Context:
        {context_str}

        Question: {query}

        Answer:
        """

        return self.client.models.generate_content(
            model=self.model_name,
            contents=prompt,
        ).text

In [None]:
class QueryPipeline:
    """Runs rephrase -> retrieve -> rerank -> evaluate -> generate for one query."""

    def __init__(self,
                 rephraser: Optional[QueryRephraser],
                 retriever: RetrieverModule,
                 reranker: Optional[GeminiReranker],
                 evaluator: RetrievalEvaluator,
                 generator: ResponseGenerator):

        self.rephraser = rephraser
        self.retriever = retriever
        self.reranker = reranker
        self.evaluator = evaluator
        self.generator = generator

    def run(self, user_query: str, use_rephrasing: bool = True, use_reranking: bool = True):
        """Answer ``user_query`` or return a message explaining why it was not answered.

        Args:
            user_query: The question as typed by the user.
            use_rephrasing: Rephrase the query before retrieval (needs a rephraser).
            use_reranking: Rerank the retrieved chunks (needs a reranker).

        Returns:
            The generated answer, or a rejection / no-information message when
            the evaluator reports "UNRELATED" or "NO_DATA".
        """
        print(f"\n--- Starting pipeline for: '{user_query}' ---")

        search_query = user_query
        if self.rephraser and use_rephrasing:
            search_query = self.rephraser.rephrase(user_query)

        # The larger pool of 20 exists only to give the reranker something to
        # choose from. The decision must follow the same condition as the
        # rerank step; otherwise disabling reranking on a pipeline that has a
        # reranker would push all 20 chunks downstream.
        will_rerank = bool(self.reranker and use_reranking)
        retrieved_nodes = self.retriever.retrieve(search_query, top_k=20 if will_rerank else 15)

        final_nodes = retrieved_nodes
        if will_rerank:
            final_nodes = self.reranker.rerank(search_query, retrieved_nodes, top_n=10)

        # Evaluation and generation use the user's original wording.
        status, reason = self.evaluator.evaluate(user_query, final_nodes)

        if status == "UNRELATED":
            return f"**Query Rejected:** {reason}\n(I only answer questions about the provided research papers.)"

        elif status == "NO_DATA":
            return f"**No Information Found:** {reason}\n(I searched the database but couldn't find specific details on this.)"

        print("Generating answer...")
        answer = self.generator.generate(user_query, final_nodes)
        return answer

## Preprocessing corpus

In [None]:
# Embedding model identifiers.
BGE_EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
GEMINI_EMBEDDING_MODEL_NAME = "models/gemini-embedding-001"

# Sanitized variants with "/" and "-" mapped to "_".
BGE_EMBEDDING_MODEL_CLEAN_NAME = BGE_EMBEDDING_MODEL_NAME.translate(str.maketrans("/-", "__"))
GEMINI_EMBEDDING_MODEL_CLEAN_NAME = GEMINI_EMBEDDING_MODEL_NAME.translate(str.maketrans("/-", "__"))

In [None]:
# Corpus PDFs inside PDF_DIR, listed by file stem.
files = [
    os.path.join(PDF_DIR, f"{stem}.pdf")
    for stem in (
        "extended_isolation_forest",
        "extended_kmeans_isolation_forest",
        "functional_isolation_forest",
        "generalized_isolation_forest",
        "kernel_isolation_forest",
        "kmeans_isolation_forest",
        "probabilistic_generalization_of_isolation_forest",
        "randomised_choices_in_isolation_forest",
        "scoring_isolation_forest",
    )
]
# Citation metadata, one entry per element of `files` and in the same order
# (the preprocessing pipeline pairs the two lists positionally).
# The "authors" strings appear verbatim in the citation tags of generated answers.
meta = [
    {"title": "Extended Isolation Forest", "authors": "Hariri et al.", "year": 2021},
    {"title": "Extended K-Means Isolation Forest", "authors": "Vlad Birsan", "year": 2025},
    {"title": "Functional Isolation Forest", "authors": "Staerman et al.", "year": 2019},
    {"title": "Generalized isolation forest for anomaly detection", "authors": "Lesouple et al.", "year": 2021},
    {"title": "Hyperspectral anomaly detection with kernel isolation forest", "authors": "Li et al.", "year": 2019},
    {"title": "K-means-based isolation forest", "authors": "Karczmarek et al.", "year": 2020},
    {"title": "A probabilistic generalization of isolation forest", "authors": "Tokovarov and Karczmarek", "year": 2022},
    {"title": "Revisiting randomized choices in isolation forests", "authors": "Cortes et al.", "year": 2021},
    {"title": "Distribution and volume based scoring for Isolation Forests", "authors": "Dhouib et al.", "year": 2023}
]

In [None]:
# Shared components: secrets, the PDF parser, three chunkers, and one database
# manager per embedding model.
secrets = SecretManager()
parser = GeminiParser(secrets)

# Embedding settings shared by the chunker and the database manager of each model.
_bge_embedding = dict(
    embed_model_type="huggingface",
    model_name=BGE_EMBEDDING_MODEL_NAME,
    device=DEVICE,
)
_gemini_embedding = dict(
    embed_model_type="gemini",
    model_name=GEMINI_EMBEDDING_MODEL_NAME,
    device=DEVICE,
)

bge_chunker = SemanticChunker(secrets, breakpoint_percentile=80, **_bge_embedding)
gemini_chunker = SemanticChunker(secrets, breakpoint_percentile=80, **_gemini_embedding)
window_chunker = WindowChunker(window_size=5)

bge_db_manager = LanceDBManager(secrets, **_bge_embedding)
gemini_db_manager = LanceDBManager(secrets, **_gemini_embedding)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

  self.embed_model = GeminiEmbedding(
  self.embed_model = GeminiEmbedding(


### Preprocessing corpus using semantic chunking with bge-m3 model and bge-m3 embeddings  

In [None]:
# Semantic chunks (bge-m3 splitter) indexed with bge-m3 embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=bge_chunker,
    db_manager=bge_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)

Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach

### Preprocessing corpus using semantic chunking with bge-m3 model and gemini embeddings  

In [None]:
# Semantic chunks (bge-m3 splitter) indexed with Gemini embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=bge_chunker,
    db_manager=gemini_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)



Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach

### Preprocessing corpus using semantic chunking with gemini model and bge-m3 embeddings  

In [None]:
# Semantic chunks (Gemini splitter) indexed with bge-m3 embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=gemini_chunker,
    db_manager=bge_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)

Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach



--- Accessing Table: semantic_80_models_gemini_embedding_001_embed_BAAI_bge_m3 ---
Table not found. Creating and Indexing (this takes time)...
Indexing complete.


### Preprocessing corpus using semantic chunking with gemini model and gemini embeddings  

In [None]:
# Semantic chunks (Gemini splitter) indexed with Gemini embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=gemini_chunker,
    db_manager=gemini_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)



Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach

### Preprocessing corpus using window chunking and bge-m3 embeddings  

In [None]:
# Sentence-window chunks indexed with bge-m3 embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=window_chunker,
    db_manager=bge_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)

Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach



--- Accessing Table: window_5_embed_BAAI_bge_m3 ---
Table not found. Creating and Indexing (this takes time)...
Indexing complete.


### Preprocessing corpus using window chunking and gemini embeddings

In [None]:
# Sentence-window chunks indexed with Gemini embeddings.
print("Starting preprocessing pipeline...")
pipeline = PreprocessingPipeline(
    parser=parser,
    chunker=window_chunker,
    db_manager=gemini_db_manager,
)
_ = pipeline.run(file_paths=files, citation_metadata=meta)

Starting preprocessing pipeline...
Processing: ad-papers-pdf/extended_isolation_forest.pdf...
Loading cached markdown for: extended_isolation_forest.md
Processing: ad-papers-pdf/extended_kmeans_isolation_forest.pdf...
Loading cached markdown for: extended_kmeans_isolation_forest.md
Processing: ad-papers-pdf/functional_isolation_forest.pdf...
Loading cached markdown for: functional_isolation_forest.md
Processing: ad-papers-pdf/generalized_isolation_forest.pdf...
Loading cached markdown for: generalized_isolation_forest.md
Processing: ad-papers-pdf/kernel_isolation_forest.pdf...
Loading cached markdown for: kernel_isolation_forest.md
Processing: ad-papers-pdf/kmeans_isolation_forest.pdf...
Loading cached markdown for: kmeans_isolation_forest.md
Processing: ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf...
Loading cached markdown for: probabilistic_generalization_of_isolation_forest.md
Processing: ad-papers-pdf/randomised_choices_in_isolation_forest.pdf...
Loading cach



Loading cached chunks for probabilistic_generalization_of_isolation_forest.json (window_5)...
Loading cached chunks for randomised_choices_in_isolation_forest.json (window_5)...
Loading cached chunks for scoring_isolation_forest.json (window_5)...
--- Accessing Table: window_5_embed_models_gemini_embedding_001 ---
Table not found. Creating and Indexing (this takes time)...
Indexing complete.


## Question answering

In [None]:
# Chunking-strategy names, passed as `chunking_strategy` to the RetrieverModule
# instances built in a later cell. The "SRATEGY" spelling is kept as-is because
# those later cells reference these exact identifiers.
BGE_CHUNKING_SRATEGY_NAME = f"semantic_80_{BGE_EMBEDDING_MODEL_CLEAN_NAME}"
GEMINI_CHUNKING_SRATEGY_NAME = f"semantic_80_{GEMINI_EMBEDDING_MODEL_CLEAN_NAME}"
WINDOW_CHUNKING_SRATEGY_NAME = "window_5"

In [None]:
# Minimal three-question set, one question per label.
# NOTE: the next cell assigns `questions` again, so this list is replaced
# whenever both cells are executed in order.
questions = [
    {"question": text, "label": label}
    for text, label in (
        ("How does Extended Isolation Forest fix the bias issues?", "ANSWERABLE"),
        ("What is the best recipe for pizza?", "UNRELATED"),
        ("How does Isolation Forest perform on Quantum Computers?", "NO_DATA"),
    )
]

In [None]:
# Evaluation question set: 14 ANSWERABLE, 4 NO_DATA and 2 UNRELATED prompts.
# Every entry has the form {"question": <text>, "label": <expected label>};
# the texts are grouped by label below and expanded in that order.
questions = [
    {"question": text, "label": label}
    for label, texts in (
        (
            "ANSWERABLE",
            (
                # 1. 'Extended Isolation Forest' (Hariri et al.)
                "What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?",
                # 2. 'Extended Isolation Forest' (Hariri et al.)
                "How does Extended Isolation Forest fix the bias issues found in the standard algorithm?",
                # 3. 'Functional Isolation Forest' (Staerman et al.)
                "How does Functional Isolation Forest (FIF) project data using a dictionary and scalar products?",
                # 4. 'Hyperspectral Anomaly Detection with Kernel Isolation Forest' (Li et al.)
                "Why are anomalies assumed to be more susceptible to isolation in the kernel space according to the Kernel Isolation Forest paper?",
                # 5. 'Generalized Isolation Forest' (Lesouple et al.)
                "How does Generalized Isolation Forest (GIF) improve upon Extended Isolation Forest regarding empty branches?",
                # 6. 'K-Means-based Isolation Forest' (Karczmarek et al.)
                "How does the K-Means Isolation Forest algorithm combine the partition strategy with the K-Means clustering algorithm?",
                # 7. 'Extended K-Means Isolation Forest' (Birsan, 2025)
                "What are the two hybrid algorithms introduced in the Extended K-Means Isolation Forest paper?",
                # 8. 'Probabilistic Generalization of Isolation Forest' (Tokovarov et al.)
                "How does the Probabilistic Generalization of Isolation Forest (PGIF) use segment-cumulated probability?",
                # 9. 'Distribution and volume based scoring' (Dhouib et al.)
                "How does the Rényi divergence relate to the aggregation functions in distribution-based scoring for Isolation Forests?",
                # 10. 'Revisiting randomized choices' (Cortes et al.)
                "According to the 'Revisiting randomized choices' paper, how does non-uniform random splitting affect the detection of clustered outliers?",
                # 11. 'Kernel Isolation Forest'
                "What is the specific application domain (type of images) that the Kernel Isolation Forest is designed to analyze?",
                # 12. 'Extended K-Means Isolation Forest'
                "Which benchmark metrics were used to evaluate the Extended K-Means Isolation Forest on the 13 datasets?",
                # 13. 'Functional Isolation Forest'
                "What is the 'visual elbow rule' used for in the context of Functional Isolation Forest experiments?",
                # 14. 'Generalized Isolation Forest'
                "What is the main advantage of Generalized Isolation Forest (GIF) over Extended Isolation Forest (EIF) in terms of computation time?",
            ),
        ),
        (
            "NO_DATA",
            (
                "How does the performance of Isolation Forest compare to an LSTM-based Autoencoder on time-series data?",
                "What are the specific latency requirements for deploying Isolation Forest on an Arduino or edge device?",
                "How can I implement the Isolation Forest algorithm using the H2O.ai library in R?",
                "Does the 'Deep Isolation Forest' variant use Convolutional Neural Networks for feature extraction?",
            ),
        ),
        (
            "UNRELATED",
            (
                "What is the best recipe for pizza?",
                "Who won the FIFA World Cup in 2022?",
            ),
        ),
    )
    for text in texts
]

In [None]:
# Shared services: the secret holder and the query rephraser.
secrets = SecretManager()
rephraser = QueryRephraser(secrets)

# One LanceDB manager per embedding backend.
bge_db_manager = LanceDBManager(
    secrets,
    embed_model_type="huggingface",
    model_name=BGE_EMBEDDING_MODEL_NAME,
    device=DEVICE,
)
gemini_db_manager = LanceDBManager(
    secrets,
    embed_model_type="gemini",
    model_name=GEMINI_EMBEDDING_MODEL_NAME,
    device=DEVICE,
)


def _make_retriever(db_manager, chunking_strategy, embed_model_name):
    """Build a RetrieverModule for one (chunking strategy, embedding model) pair."""
    return RetrieverModule(
        db_manager=db_manager,
        chunking_strategy=chunking_strategy,
        embed_model_name=embed_model_name,
    )


# Retrievers: three chunking strategies x two embedding models.
bge_chunker_bge_embed_retriever_mod = _make_retriever(
    bge_db_manager, BGE_CHUNKING_SRATEGY_NAME, BGE_EMBEDDING_MODEL_NAME
)
bge_chunker_gemini_embed_retriever_mod = _make_retriever(
    gemini_db_manager, BGE_CHUNKING_SRATEGY_NAME, GEMINI_EMBEDDING_MODEL_NAME
)
gemini_chunker_bge_embed_retriever_mod = _make_retriever(
    bge_db_manager, GEMINI_CHUNKING_SRATEGY_NAME, BGE_EMBEDDING_MODEL_NAME
)
gemini_chunker_gemini_embed_retriever_mod = _make_retriever(
    gemini_db_manager, GEMINI_CHUNKING_SRATEGY_NAME, GEMINI_EMBEDDING_MODEL_NAME
)
window_chunker_bge_embed_retriever_mod = _make_retriever(
    bge_db_manager, WINDOW_CHUNKING_SRATEGY_NAME, BGE_EMBEDDING_MODEL_NAME
)
window_chunker_gemini_embed_retriever_mod = _make_retriever(
    gemini_db_manager, WINDOW_CHUNKING_SRATEGY_NAME, GEMINI_EMBEDDING_MODEL_NAME
)

# Downstream stages, shared by every query pipeline below.
reranker = GeminiReranker(secrets)
evaluator = RetrievalEvaluator(secrets)
generator = ResponseGenerator(secrets)


def _make_query_pipeline(retriever):
    """Wrap ``retriever`` in a QueryPipeline that reuses the shared stage objects."""
    return QueryPipeline(
        rephraser=rephraser,
        retriever=retriever,
        reranker=reranker,
        evaluator=evaluator,
        generator=generator,
    )


# One query pipeline per retriever; only the retriever differs between them.
bge_chunker_bge_embed_query_pipeline = _make_query_pipeline(
    bge_chunker_bge_embed_retriever_mod
)
bge_chunker_gemini_embed_query_pipeline = _make_query_pipeline(
    bge_chunker_gemini_embed_retriever_mod
)
gemini_chunker_bge_embed_query_pipeline = _make_query_pipeline(
    gemini_chunker_bge_embed_retriever_mod
)
gemini_chunker_gemini_embed_query_pipeline = _make_query_pipeline(
    gemini_chunker_gemini_embed_retriever_mod
)
window_chunker_bge_embed_query_pipeline = _make_query_pipeline(
    window_chunker_bge_embed_retriever_mod
)
window_chunker_gemini_embed_query_pipeline = _make_query_pipeline(
    window_chunker_gemini_embed_retriever_mod
)

  self.embed_model = GeminiEmbedding(


Connecting Retriever to table: semantic_80_BAAI_bge_m3_embed_BAAI_bge_m3
Connecting Retriever to table: semantic_80_BAAI_bge_m3_embed_models_gemini_embedding_001
Connecting Retriever to table: semantic_80_models_gemini_embedding_001_embed_BAAI_bge_m3
Connecting Retriever to table: semantic_80_models_gemini_embedding_001_embed_models_gemini_embedding_001
Connecting Retriever to table: window_5_embed_BAAI_bge_m3
Connecting Retriever to table: window_5_embed_models_gemini_embedding_001


In [None]:
def answer_questions(
    questions: List[Dict[str, str]],
    pipeline: QueryPipeline,
    use_rephrasing: bool,
    use_reranking: bool
) -> None:
    """Run each question through ``pipeline`` and print the outcome.

    For every entry the query text, the pipeline response, the expected label
    and a 60-dash separator are printed, in that order. Nothing is returned.

    Args:
        questions: Entries with a "question" text and a "label" value; both
            keys are read before the pipeline is invoked for that entry.
        pipeline: Object whose ``run(user_query=..., use_rephrasing=...,
            use_reranking=...)`` produces the response to print.
        use_rephrasing: Forwarded unchanged to ``pipeline.run``.
        use_reranking: Forwarded unchanged to ``pipeline.run``.
    """
    separator = "-" * 60

    for entry in questions:
        # Read both fields up front so a malformed entry fails before any work.
        query_text, label = entry["question"], entry["label"]

        print(f"\nProcessing: '{query_text}'")

        answer = pipeline.run(
            user_query=query_text,
            use_rephrasing=use_rephrasing,
            use_reranking=use_reranking,
        )

        print(f"Response:\n{answer}")
        print(f"Expected Label: {label}")
        print(separator)

### Question answering: semantic chunker with bge-m3 model, bge-m3 embeddings

#### Question rephrasing, chunks reranking


In [None]:
# All questions, bge_chunker_bge_embed pipeline: rephrasing ON, reranking ON.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'What is the specific scoring artifact, caused by axis-parallel splits in the standard Isolation Forest algorithm, that creates rectangular patterns in anomaly score heat maps, and how does Extended Isolation Forest's use of random hyperplanes fix this issue?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in anomaly score heat maps because its branching procedure is restri

#### Question rephrasing, no chunks reranking


In [None]:
# All questions, bge_chunker_bge_embed pipeline: rephrasing ON, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'What are the specific rectangular, axis-aligned artifacts produced in anomaly score heatmaps by the standard Isolation Forest algorithm due to its use of axis-parallel splits, and how does the Extended Isolation Forest algorithm mitigate these artifacts by using hyperplanes with random slopes?'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest algorithm produces several specific artifacts in its anomaly score heat maps tha

#### No question rephrasing, chunks reranking

In [None]:
# All questions, bge_chunker_bge_embed pipeline: rephrasing OFF, reranking ON.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest (IF) produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) aims to fix:

*   **Axis-Parallel Bands:** The standard IF creates "line patterns parallel to the coordinate axes" [Extended K-Means Isolation Forest, Vlad Birsan, 2025]. For a single cluster of normally distributed data where a circular score map is expected, the standard IF produces a map with "rectangular regions of lower anomaly score in the x and y directions" or darker bands exte

#### No question rephrasing, no chunks reranking

In [None]:
# All questions, bge_chunker_bge_embed pipeline: rephrasing OFF, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix. These artifacts are artificial zones of higher or lower anomaly scores that are not present in the original data distribution [Extended Isolation Forest, Hariri et al., 2021].

The primary artifacts are:
*   **Axis-Parallel Bands and "Cross" Shapes:** The most common artifact is the appearance of line patterns or bands that are parallel to the coordinate axes. For a single cluster of normally distri

### Question answering: semantic chunker with bge-m3 model, gemini embeddings

#### Question rephrasing, chunks reranking

In [None]:
# All questions, bge_chunker_gemini_embed pipeline: rephrasing ON, reranking ON.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'How do the axis-parallel splits of the standard Isolation Forest algorithm create scoring artifacts and rectangular boundaries in anomaly detection heatmaps, and how does the use of random-slope hyperplanes in Extended Isolation Forest mitigate these limitations?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the specific artifact that the standard Isolation Forest (IF) produces in anomaly score heat maps is the presenc

#### Question rephrasing, no chunks reranking

In [None]:
# All questions, bge_chunker_gemini_embed pipeline: rephrasing ON, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'How do the axis-parallel splits of the standard Isolation Forest algorithm create visual artifacts, such as rectangular patterns and spurious high-score regions, in its anomaly score heatmaps, and how does the Extended Isolation Forest algorithm fix this limitation with random hyperplane splits?'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in anomaly score heat maps, which the Exte

#### No question rephrasing, chunks reranking

In [None]:
# All questions, bge_chunker_gemini_embed pipeline: rephrasing OFF, reranking ON.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest is designed to fix:

*   **Axis-Parallel Line Patterns:** The standard algorithm generates "line patterns parallel to the coordinate axes" [Extended K-Means Isolation Forest, Vlad Birsan, 2025]. These appear as "distinct horizontal and vertical bands" or "rectangular regions" in the anomaly score maps [Extended Isolation Forest, Hariri et al., 2021]. For a single, circular cluster of data, this a

#### No question rephrasing, no chunks reranking

In [None]:
# All questions, bge_chunker_gemini_embed pipeline: rephrasing OFF, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=bge_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix. These artifacts are a result of the algorithm's reliance on branch cuts that are always parallel to the coordinate axes (i.e., horizontal or vertical) [Extended Isolation Forest, Hariri et al., 2021].

The primary artifacts are:

*   **Axis-Parallel Bands and Rectangular Patterns**: For a single cluster of normally distributed data that should have a circular anomaly score map, the standard Isolatio

### Question answering: semantic chunker with gemini model, bge embeddings

#### Question rephrasing, chunks reranking

In [None]:
# All questions, gemini_chunker_bge_embed pipeline: rephrasing ON, reranking ON.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> '**Rephrased Query:** Explain the visual artifacts, such as rectangular patterns and ghost regions, that appear in the anomaly score surfaces of standard Isolation Forests. How do these artifacts result from the algorithm's use of axis-parallel splits, and how does Extended Isolation Forest's use of random hyperplanes solve this issue?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces 

#### Question rephrasing, no chunks reranking

In [None]:
# All questions, gemini_chunker_bge_embed pipeline: rephrasing ON, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'What are the specific visual artifacts, such as rectangular patterns or ghost clusters in anomaly score heatmaps, that are caused by the standard Isolation Forest's use of axis-parallel splits? How does the Extended Isolation Forest algorithm mitigate these artifacts by using random non-axis-parallel hyperplanes for its splits?'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest algorithm produces artifacts in anomaly score

#### No question rephrasing, chunks reranking

In [None]:
# All questions, gemini_chunker_bge_embed pipeline: rephrasing OFF, reranking ON.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest algorithm produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix:

*   **Axis-Aligned Bands:** For a single cluster of data, the standard Isolation Forest creates "rectangular regions" or "bands" of lower anomaly scores aligned with the x and y axes [Extended Isolation Forest, Hariri et al., 2021; Generalized isolation forest for anomaly detection, Lesouple et al., 2021]. This results in a score map that has an artifact resem

#### No question rephrasing, no chunks reranking

In [None]:
# All questions, gemini_chunker_bge_embed pipeline: rephrasing OFF, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix:

*   **Axis-Aligned Bands:** For a simple, single cluster of data that should produce a circular score map, the standard Isolation Forest creates artificial horizontal and vertical bands of inconsistent scores [Extended Isolation Forest, Hariri et al., 2021]. These bands appear as "rectangular regions of lower anomaly score" extending along the x and y directions, creating a cross-like pattern instead o

### Question answering: semantic chunker with gemini model, gemini embeddings

#### Question rephrasing, chunks reranking

In [None]:
# All questions, gemini_chunker_gemini_embed pipeline: rephrasing ON, reranking ON.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> '**Rephrased Query:** What are the specific visual artifacts, such as rectangular patterns or scoring biases, present in the anomaly score heatmaps of standard Isolation Forests, which the Extended Isolation Forest model was designed to address?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in its anomaly score heat maps that appear as rectangular bands aligned with the c

#### Question rephrasing, no chunks reranking

In [None]:
# All questions, gemini_chunker_gemini_embed pipeline: rephrasing ON, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> '**Rephrased Query:**
Describe the scoring artifacts and visual biases, such as rectangular patterns or ghost clusters, that are produced in anomaly score heatmaps by a standard Isolation Forest due to its axis-parallel splits. How does the Extended Isolation Forest's use of random hyperplane splits resolve these specific issues?'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in its anomaly score heat

#### No question rephrasing, chunks reranking

In [None]:
# All questions, gemini_chunker_gemini_embed pipeline: rephrasing OFF, reranking ON.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in anomaly score heat maps due to its branching procedure, which only creates cuts parallel to the coordinate axes [Extended Isolation Forest, Hariri et al., 2021]. This introduces a bias that results in artificial zones of high or low scores not present in the original data [Extended Isolation Forest, Hariri et al., 2021].

These specific artifacts manifest in several ways:

*   **Rectangular Bands:** For a single, circular cluster of data points, the standard Isolation Forest

#### No question rephrasing, no chunks reranking

In [None]:
# All questions, gemini_chunker_gemini_embed pipeline: rephrasing OFF, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=gemini_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest algorithm produces several specific artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix:

*   **Axis-Parallel Bands and Rectangular Patterns**: The standard Isolation Forest's branching procedure, which slices data only along hyperplanes parallel to the coordinate frame, introduces a bias [Extended Isolation Forest, Hariri et al., 2021]. This results in "rectangular regions of lower anomaly score" extending in the x and y directions. For a single cluster of normally distribute

### Question answering: window chunker, bge embeddings

#### Question rephrasing, chunks reranking

In [None]:
# All questions, window_chunker_bge_embed pipeline: rephrasing ON, reranking ON.
answer_questions(
    questions=questions,
    pipeline=window_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'What are the scoring artifacts, such as rectangular iso-lines and axis-parallel decision boundaries, produced by the standard Isolation Forest algorithm in its anomaly score heat maps? How does the Extended Isolation Forest model fix these artifacts by using random sloped hyperplanes for its splits?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several related artifacts in anomaly

#### Question rephrasing, no chunks reranking

In [None]:
# All questions, window_chunker_bge_embed pipeline: rephrasing ON, reranking OFF.
answer_questions(
    questions=questions,
    pipeline=window_chunker_bge_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'How does the Extended Isolation Forest algorithm address the grid-like artifacts and sharp, axis-parallel boundaries seen in the anomaly score heat maps produced by the standard Isolation Forest, which result from its use of axis-aligned splits?'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in anomaly score heat maps that manifest as distinct horizontal and vertical bands [Extended Isolation Forest,

#### No question rephrasing, chunks reranking

In [None]:
# All questions, window_chunker_bge_embed pipeline: rephrasing OFF, reranking ON.
answer_questions(
    questions=questions,
    pipeline=window_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest (IF) produces several related artifacts in its anomaly score heat maps that the Extended Isolation Forest (EIF) aims to fix:

*   **Cross-Like Artifacts:** The most prominent artifact is the appearance of "distinct vertical and horizontal bands" of inconsistent anomaly scores that extend from data clusters, creating a "cross-like artifact" [Hariri et al., 2021; Lesouple et al., 2021]. For a single, circular cluster of data, this causes the anomaly score map to look like a "rounded square" rather 

#### No question rephrasing, no chunks reranking

In [None]:
# Run the question set with both query rephrasing and chunk reranking switched off.
answer_questions(
    questions=questions,
    pipeline=window_chunker_bge_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False,
)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest (IF) produces several specific artifacts in anomaly score heat maps that the Extended Isolation Forest (EIF) is designed to fix:

*   **Axis-Parallel Artifacts:** The most prominent artifact is the appearance of "distinct horizontal and vertical bands" or "line patterns parallel to the coordinate axes" in the anomaly score maps [Extended Isolation Forest, Hariri et al., 2021; Extended K-Means Isolation Forest, Vlad Birsan, 2025]. For a single cluster of data, these bands create an artifact that resembles a cross [Generalized

### Question answering: window chunker, gemini embeddings

#### Question rephrasing, chunks reranking

In [None]:
# Run the question set with both query rephrasing and chunk reranking switched on.
answer_questions(
    questions=questions,
    pipeline=window_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=True,
)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> 'In anomaly detection, what scoring artifacts are produced in the anomaly score heatmaps of a standard Isolation Forest due to its axis-parallel branching, and how does the Extended Isolation Forest algorithm mitigate these artifacts using random slope hyperplanes?'
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the specific artifact that the standard Isolation Forest (IF) produces in anomaly score heat maps are artificia

#### Question rephrasing, no chunks reranking

In [None]:
# Run the question set with query rephrasing on and chunk reranking off.
answer_questions(
    questions=questions,
    pipeline=window_chunker_gemini_embed_query_pipeline,
    use_rephrasing=True,
    use_reranking=False,
)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Rephrased: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' -> '**Rephrased Query:**
Describe the rectangular or grid-like artifacts in anomaly score heat maps produced by the standard Isolation Forest algorithm's axis-parallel splits, and explain how the Extended Isolation Forest algorithm mitigates this issue using hyperplanes with random slopes.'
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces several specific artifacts in anomaly score heat maps that the Extended Isolati

#### No question rephrasing, chunks reranking

In [None]:
# Run the question set with the original queries (no rephrasing) and chunk reranking on.
answer_questions(
    questions=questions,
    pipeline=window_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=True,
)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Reranked: Kept 10/20 chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in its anomaly score heat maps, which Extended Isolation Forest (EIF) is designed to fix. These artifacts are generated by the algorithm's binary tree branching criteria [Extended Isolation Forest, Hariri et al., 2021].

The specific artifacts are:

*   **Horizontal and Vertical Bands:** For a single cluster of data, the standard Isolation Forest creates "distinct horizontal and vertical bands of higher anomaly scores" or "rectangular regions of lower anomaly score in the x and

#### No question rephrasing, no chunks reranking

In [None]:
# Run the question set with both query rephrasing and chunk reranking switched off.
answer_questions(
    questions=questions,
    pipeline=window_chunker_gemini_embed_query_pipeline,
    use_rephrasing=False,
    use_reranking=False,
)


Processing: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?'

--- Starting pipeline for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?' ---
Retrieved 20 raw chunks.
Evaluation Status: ANSWERABLE
Generating answer...
Response:
Based on the provided context, the standard Isolation Forest produces artifacts in anomaly score heat maps that create "artificial zones of higher/lower scores" which are not present in the original data [Extended Isolation Forest, Hariri et al., 2021]. These artifacts are generated by the algorithm's branching procedure, which slices data using hyperplanes parallel to the coordinate axes [Extended Isolation Forest, Hariri et al., 2021].

The specific appearance of the artifact depends on the data's distribution:

*   **For a single cluster of normally distributed data**, the score ma

## Experiment with popular LLM (Mistral-7B-Instruct-v0.3)

In [None]:
class BaselineMistral:
    """Retrieval-free baseline: a 4-bit quantized chat model prompted with fixed domain rules.

    The model answers from its own weights only; no document context is supplied.
    """

    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.3", device: str = "cpu"):
        """Load the tokenizer and the NF4-quantized causal LM.

        Args:
            model_name: Hugging Face Hub id of the model to load.
            device: Stored on ``self.device`` for callers. Weight placement is
                decided by ``device_map="auto"``, so generation inputs are moved
                to the model's actual device rather than to this value.
        """
        print(f"Loading baseline Model: {model_name}...")

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto"
        )
        print("Baseline model loaded.")

    def generate_answer(self, query: str) -> str:
        """Generate the model's answer to ``query`` under the fixed rule prompt.

        Args:
            query: The user's question, inserted verbatim after the rules.

        Returns:
            The newly generated text only (prompt tokens are cut off), with
            special tokens removed and surrounding whitespace stripped.
        """
        system_prompt = """
        You are a strict Research Assistant specializing ONLY in Anomaly Detection and Isolation Forests.

        YOUR RULES:
        1. You answer questions strictly based on technical knowledge of Isolation Forests.
        2. NEGATIVE CONSTRAINT: If the user asks about unrelated topics (like cooking, weather, sports, or general life advice), you must REFUSE.
           - YOU MUST NOT provide the requested information "anyway."
           - YOU MUST NOT say "However, here is..."
           - You simply state: "I cannot answer this question as it is unrelated to Anomaly Detection."
        3. If the user asks a technical question you don't know, say "I do not have sufficient data."
        """

        # The rules travel inside the single user turn rather than as a separate system message.
        full_prompt = f"{system_prompt}\n\nUser Question: {query}"

        messages = [
            {"role": "user", "content": full_prompt}
        ]

        # return_dict=True yields input_ids *and* attention_mask. pad_token_id is set
        # to the EOS id below, so generate() cannot infer the mask on its own and
        # warns about unreliable results when it is missing.
        inputs = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        ).to(self.model.device)  # follow device_map="auto" placement, not the constructor argument

        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.1,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # generate() returns prompt + continuation; keep only the continuation.
        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return response.strip()

In [None]:
# Ask the retrieval-free baseline every evaluation question and print its raw answer
# next to the label the question is expected to receive.
baseline_mistral = BaselineMistral(device=DEVICE)

for q_data in questions:
    query, expected_label = q_data["question"], q_data["label"]

    # Header for this question: question text, expected label, divider, section title.
    print(
        f"Question: {query}",
        f"Expected Category: {expected_label}",
        "-" * 60,
        "[BASELINE MISTRAL RESPONSE]",
        sep="\n",
    )

    baseline_response = baseline_mistral.generate_answer(query)
    print(baseline_response)
    print("\n" + "=" * 60 + "\n")

Loading baseline Model: mistralai/Mistral-7B-Instruct-v0.3...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Baseline model loaded.
Question: What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Isolation Forest aims to fix?
Expected Category: ANSWERABLE
------------------------------------------------------------
[BASELINE MISTRAL RESPONSE]
In standard Isolation Forests, the primary artifact produced is the anomaly score for each data point. This score represents the average number of samples a data point needs to be isolated from the rest of the data.

Extended Isolation Forest aims to improve upon this by providing additional information, such as the distribution of anomaly scores, which can be useful in creating anomaly score heat maps. However, it's important to note that the creation of heat maps is not a built-in feature of Isolation Forests, but rather an external visualization technique that can be applied to the anomaly scores produced by the algorithm.

In the context of your question, neither standard Isolation Forest nor Exten

## Classical information retrieval system (BM25)

In [None]:
import fitz
import nltk
import string

from typing import List, Dict, Any
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK data packages used by the BM25 retriever defined further down.
# Tokenizer models — presumably what word_tokenize loads; 'punkt_tab' looks like the
# name newer NLTK releases expect (TODO confirm against the installed version).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read via stopwords.words('english').
nltk.download('stopwords')
# WordNet data for WordNetLemmatizer; 'omw-1.4' is presumably a companion
# wordnet resource — TODO confirm it is still required.
nltk.download('wordnet')
# NOTE: this stays the last expression on purpose — its return value is what the cell displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
class ClassicalParser:
    """Turns a PDF into paragraph-sized text chunks using PyMuPDF (``fitz``)."""

    def parse(self, file_path: str, metadata: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Extract text chunks from every page of ``file_path``.

        Each page's plain text is split on blank lines; pieces of 50 characters
        or fewer (after stripping) are discarded.

        Args:
            file_path: Path of the PDF to open.
            metadata: Optional document-level fields copied into every chunk's
                metadata. The dict passed in is never mutated.

        Returns:
            A list of ``{"text": str, "metadata": dict}`` items, where metadata
            holds the caller's fields plus ``"source"`` (the file path) and
            ``"page"`` (1-based page number).
        """
        print(f"Parsing {file_path} with Fitz...")
        chunks = []

        base_metadata = metadata if metadata else {}

        # The context manager closes the PDF handle even if extraction of a page raises;
        # previously the document was left open for the life of the kernel.
        with fitz.open(file_path) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text("text")

                # NOTE(review): this relies on blank lines separating paragraphs in the
                # extracted text — verify against real PDFs if chunks look page-sized.
                paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]

                for p in paragraphs:
                    chunks.append({
                        "text": p,
                        # Fresh dict per chunk so chunks never share (or leak into) caller metadata.
                        "metadata": {**base_metadata, "source": file_path, "page": page_num + 1}
                    })

        print(f"Found {len(chunks)} paragraphs.")
        return chunks

In [None]:
class ClassicalRetriever:
    """BM25 keyword retriever over pre-chunked text.

    Chunks and queries go through the same normalization: lowercase,
    punctuation removal, tokenization, stop-word filtering and verb
    lemmatization.
    """

    # Translation table that deletes ASCII punctuation; built once instead of on every call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, chunks: List[Dict[str, Any]]):
        """Tokenize every chunk and build the BM25 index.

        Args:
            chunks: Items with at least a ``"text"`` key; they are returned
                (shallow-copied) by :meth:`retrieve`.
        """
        self.chunks = chunks
        self.corpus = [chunk["text"] for chunk in chunks]

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        print("Building BM25 index (stopwords + lemmatization)...")
        self.tokenized_corpus = [self._tokenize(doc) for doc in self.corpus]

        # rank_bm25 divides by the corpus size and vocabulary size while building its
        # statistics, so a corpus with no surviving tokens raises ZeroDivisionError.
        # Leave the index unset in that case; retrieve() then reports no matches.
        self.bm25 = BM25Okapi(self.tokenized_corpus) if any(self.tokenized_corpus) else None

    def _tokenize(self, text: str) -> List[str]:
        """Normalize ``text`` into the token list used for indexing and querying."""
        text = text.lower().translate(self._PUNCTUATION_TABLE)

        return [
            self.lemmatizer.lemmatize(token, pos='v')
            for token in word_tokenize(text)
            if token not in self.stop_words
        ]

    def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` chunks with a positive BM25 score, best first.

        Args:
            query: Free-text query; normalized like the indexed chunks.
            top_k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            Shallow copies of the matching chunks, each with an added
            ``"score"`` key (plain ``float``). Empty when nothing matches or
            when the index could not be built.
        """
        tokenized_query = self._tokenize(query)

        print(f"Query tokens: {tokenized_query}")

        # A negative top_k would otherwise slice "all but the last |k|" candidates.
        if self.bm25 is None or top_k <= 0:
            return []

        scores = self.bm25.get_scores(tokenized_query)

        top_n_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

        results = []
        for i in top_n_indices:
            # Zero-score chunks share no term with the query; drop them rather than pad the list.
            if scores[i] > 0:
                result_item = self.chunks[i].copy()
                result_item["score"] = float(scores[i])  # plain float instead of a numpy scalar
                results.append(result_item)
        return results

In [None]:
class ClassicalPipeline:
    """Parse a set of PDFs, index their chunks with BM25, and answer keyword searches."""

    def __init__(self, file_paths: List[str], metadatas: List[Dict[str, Any]]):
        """Parse every file and build the retriever over all resulting chunks.

        Args:
            file_paths: PDF paths to parse.
            metadatas: One metadata dict per path, in the same order.

        Raises:
            ValueError: If the two lists differ in length. ``zip`` would otherwise
                silently skip the unmatched files and leave them out of the index.
        """
        file_paths = list(file_paths)
        metadatas = list(metadatas)
        if len(file_paths) != len(metadatas):
            raise ValueError(
                f"file_paths and metadatas must have the same length "
                f"(got {len(file_paths)} and {len(metadatas)})."
            )

        self.parser = ClassicalParser()
        self.all_chunks = []

        for path, meta in zip(file_paths, metadatas):
            self.all_chunks.extend(self.parser.parse(path, metadata=meta))

        self.retriever = ClassicalRetriever(self.all_chunks)

    def run(self, query: str, top_k: int = 3) -> str:
        """Search the index and format the hits as a plain-text report.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to include.

        Returns:
            One section per hit (score, citation, file/page, first 300 characters
            of the chunk), or a fixed "no documents" message when nothing matched.
        """
        print(f"\nClassical search for: '{query}'")
        results = self.retriever.retrieve(query, top_k=top_k)

        if not results:
            return "No relevant documents found (0 keyword matches)."

        sections = []
        for rank, res in enumerate(results, start=1):
            meta = res['metadata']

            # Citation fields are optional in the caller-supplied metadata.
            title = meta.get('title', 'Unknown Title')
            author = meta.get('authors', 'Unknown Author')
            year = meta.get('year', 'n.d.')

            sections.append(
                f"--- Result {rank} (BM25 Score: {res['score']:.2f}) ---\n"
                f"Source: [{title}, {author}, {year}]\n"
                f"File: {meta['source']} (Page {meta['page']})\n"
                f"Content: {res['text'][:300]}...\n\n"
            )

        return "".join(sections)

In [None]:
# Build the BM25 pipeline over the parsed papers, then print the search report for each question.
classical_pipeline = ClassicalPipeline(file_paths=files, metadatas=meta)

for q in questions:
    search_report = classical_pipeline.run(query=q["question"])
    print(search_report)

Parsing ad-papers-pdf/extended_isolation_forest.pdf with Fitz...
Found 11 paragraphs.
Parsing ad-papers-pdf/extended_kmeans_isolation_forest.pdf with Fitz...
Found 15 paragraphs.
Parsing ad-papers-pdf/functional_isolation_forest.pdf with Fitz...
Found 33 paragraphs.
Parsing ad-papers-pdf/generalized_isolation_forest.pdf with Fitz...
Found 9 paragraphs.
Parsing ad-papers-pdf/kernel_isolation_forest.pdf with Fitz...
Found 11 paragraphs.
Parsing ad-papers-pdf/kmeans_isolation_forest.pdf with Fitz...
Found 15 paragraphs.
Parsing ad-papers-pdf/probabilistic_generalization_of_isolation_forest.pdf with Fitz...
Found 17 paragraphs.
Parsing ad-papers-pdf/randomised_choices_in_isolation_forest.pdf with Fitz...
Found 24 paragraphs.
Parsing ad-papers-pdf/scoring_isolation_forest.pdf with Fitz...
Found 7 paragraphs.
Building BM25 index (stopwords + lemmatization)...

Classical search for: 'What specific artifact does the standard Isolation Forest produce in anomaly score heat maps that Extended Iso