In [2]:
import unittest
import tempfile
import shutil
from pathlib import Path
import sys
import os
import uuid
sys.path.append("../../")
from lits.clients.pdf_client import PDFClient
from lits.tools.pdf_tools import PDFQueryTool
from qdrant_client.models import Distance, VectorParams, PointStruct

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
temp_dir = "temp/"

client = PDFClient(storage_path=temp_dir)
url = "https://drapubcdnprd.azureedge.net/publicregister/attachments/gqruz/groundwater_map_GQR001042_epa.pdf"

### Add a new PDF file

In [None]:
pdf_content = client._download_pdf(url)
text = client._parse_pdf(pdf_content)
chunks = client._chunk_text(text)


In [7]:
print(text)

EPA Victoria is committed to providing content that is accessible to the widest possible audience, regardless of technology or ability.
This map may not meet our minimum WCAG AA accessibility standards. Contact us if you need this information in an accessible
format. Contact email: environmental.audit@epa.vic.gov.au
Disclaimer: This map shows the approximation of an area with groundwater quality impacts from human activities. It does not
provide information on the naturally occurring quality of groundwater, which can also restrict its use. Most zones are based on land
parcel boundaries. Future changes to land parcel boundaries do not change the location of the zone.
98
112
260-262
272-280
264-270
6-12
80
37-39
90-96
1
3
1
3 2
57
67
69
65
59
61
71
63
53
55
73
81
77
79
75
6
4
8
69
245-251
59
44-54
32-58
COM, Vicmap, Esri, HERE, Garmin, USGS
¯
Map
generated on:
GQR001042EPA reference:
Environmental audit site
Groundwater zone with restricted uses
Properties
Unit/house/building number
Road

In [None]:

# Generate embeddings
embeddings = client.encoder.encode(chunks, show_progress_bar=False)



points = []
for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{url}#{idx}"))
    points.append(
        PointStruct(
            id=point_id,
            vector=embedding.tolist(),
            payload={
                "url": url,
                "chunk_index": idx,
                "text": chunk,
            },
        )
    )
# Upload to Qdrant
client.qdrant.upsert(
    collection_name=client.collection_name,
    points=points,
)


### Fetch by URL

In [36]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

def test_fetch_by_url(client: PDFClient, url: str, point_id: str | None = None):
    """
    Fetch indexed chunks from Qdrant for a given URL.
    
    Args:
        client: PDFClient instance with configured Qdrant client.
        url (str): The PDF URL to look up.
        point_id (str | None): Optional specific chunk UUID. If provided,
                               return only that point.
    Returns:
        list of dicts: [{ 'id': ..., 'chunk_index': ..., 'text': ... }]
    """

    # Build filter conditions
    conditions = [FieldCondition(key="url", match=MatchValue(value=url))]

    if point_id is not None:
        # Qdrant filter by ID: need to wrap inside a payload OR ID match filter
        conditions.append(FieldCondition(key="id", match=MatchValue(value=point_id)))

    query_filter = Filter(must=conditions)

    # Perform search — we don't need a query vector if just filtering
    results = client.qdrant.scroll(
        collection_name=client.collection_name,
        scroll_filter=query_filter,
        limit=200,  # enough for typical PDF chunk count
    )

    points, _next_page = results

    extracted = []
    for p in points:
        extracted.append({
            "id": p.id,
            "chunk_index": p.payload.get("chunk_index"),
            "text": p.payload.get("text"),
        })

    return extracted


In [37]:
extracted =test_fetch_by_url(client, url)

### Search by Query

In [50]:
client.url_cache

set()

In [None]:
query_embedding = client.encoder.encode("guide me to extend CiT to other search algorithms", show_progress_bar=False)

In [43]:

from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
query_filter = Filter(
    must=[
        FieldCondition(
            key="url",
            match=MatchValue(value=url)
        )
    ]
)
results = client.qdrant.search(
            collection_name=client.collection_name,
            query_vector=query_embedding.tolist(),
            query_filter=query_filter,
            limit=5,
        )


        
# Format results
chunks = []
for result in results:
    chunks.append({
        "text": result.payload["text"],
        "chunk_index": result.payload["chunk_index"],
        "score": result.score,
    })

  results = client.qdrant.search(


In [44]:
chunks

[{'text': 'theoretically guarantees non-increasing pol-\nicy cost and achieves up to 85% runtime reduction\nacross ToT-BS, ReST-MCTS, and RAP without\naccuracy loss. Overall, CiT emphasizes the impor-\ntance of accurate BN evaluation for scaling LLM-\nbased search.\n6.1 Reproducibility\nTo support reproducibility, we release an open-\nsource Python package that modularizes LLM-\nprofiled roles across various search frameworks,\nalong with scripts for running all experiments and\nevaluations.',
  'chunk_index': 72,
  'score': 0.5477410744580397},
 {'text': 'ripts for running all experiments and\nevaluations. The chaining phase in CiT is imple-\nmented as a single Python function that is univer-\nsally applicable across all three frameworks used\nin our experiments. Details are provided in Ap-\npendix J.\nFor transparency, we additionally provide eval-\nuation datasets, detailed logs of LLM generations\nfor each role and instance, JSON files for recon-\nstructing search trees, and per-in

### Tool Testing

In [4]:
tool = PDFQueryTool(client=client)

print(tool.name)
print(tool.description)
print(tool.args_schema)
retrieved_content = tool._run(url=url, query="What is the title of this paper?")




query_pdf
Query a PDF document from a URL and retrieve relevant content. Input: PDF URL and a search query. Output: Relevant text passages from the PDF that match the query. The PDF is automatically downloaded and indexed on first use.
<class 'lits.tools.pdf_tools.PDFQueryInput'>


In [6]:
print(retrieved_content)

PDF: https://arxiv.org/pdf/2509.25835.pdf
Query: What is the title of this paper?
Found 3 relevant passages:

--- Passage 1 (score: 0.369) ---
Linguistics.
Lukas Chrpa and Mauro Vallati. 2022. Planning with
critical section macros: theory and practice.Journal
of Artificial Intelligence Research, 74:691–732.
Ning Dai, Zheng Wu, Renjie Zheng, Ziyun Wei, Wenlei
Shi, Xing Jin, Guanlin Liu, Chen Dun, Liang Huang,
and Lin Yan. 2025. Process supervision-guided pol-
icy optimization for code generation.
Arthur Guez, David Silver, and Peter Dayan. 2012.
Efficient bayes-adaptive reinforcement learning us-
ing sample-based search.

--- Passage 2 (score: 0.277) ---
inyu Zhao, Xi Ye, Kyle Mahowald, and
Greg Durrett. 2025. To cot or not to cot? chain-of-
thought helps mainly on math and symbolic reason-
ing. InThe Thirteenth International Conference on
Learning Representations.
Ilya Sutskever, Oriol Vinyals, and Quoc V Le. 2014.
Sequence to sequence learning with neural networks.
Advances in neural 