# Exploration of an Agent Specialized in Information Retrieval

In [None]:
%pip install smolagents duckduckgo-search wikipedia-api docling rank_bm25 transformers google-api-python-client 'pymilvus[model]' torch sentence_transformers hf_xet
%pip install devtools

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; later
# cells read the Google API credentials from there via os.getenv.
load_dotenv()

## Search Tools Assessment

`DuckDuckGoSearchTool` is one of the default tools in the `smolagents` package.

In [None]:
from smolagents import DuckDuckGoSearchTool

# Run one sample query through the stock DuckDuckGo tool and show its raw output.
search_tools = DuckDuckGoSearchTool()
search_results = search_tools("Mercedes Sosa")
print(search_results)

Note: Due to rate limits, DuckDuckGo was not effective for solving GAIA tasks. However, the agents were still able to find the correct answers despite occasional tool failures.

In [None]:
from googleapiclient.discovery import build
import os
from devtools import pprint

# Read the Custom Search credentials from the environment and fail fast with a
# message that names the variable that is actually missing.
developer_key = os.getenv("GOOGLE_SEARCH_API_KEY")
if not developer_key:
    raise ValueError("Please set the GOOGLE_SEARCH_API_KEY environment variable.")
cx = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
if not cx:  # validate the engine id itself, not the API key a second time
    raise ValueError("Please set the GOOGLE_SEARCH_ENGINE_ID environment variable.")

service = build("customsearch", "v1", developerKey=developer_key)
res = (
    service.cse()
    .list(
        q="Mercedes Sosa",
        cx=cx,
        # fields="items(title,link,snippet)",
        # siteSearch="wikipedia.org",
        # siteSearchFilter="i",
        num=2,
        sort="date:r:20000101:20091231",
    )
    .execute()
)

# The response carries no "items" key when the query has no hits, so fall back
# to an empty list instead of raising KeyError.
out = "\n\n".join(
    f"[{item['title']}]({item['link']})\n{item['snippet']}"
    for item in res.get("items", [])
)
print(out)

In [None]:
from smolagents import Tool


class GoogleSearchTool(Tool):
    """Web search tool backed by the Google Custom Search API client.

    Subclasses add request parameters by overriding ``_collect_params``; any
    extra arguments given to ``forward`` are handed to that hook unchanged.
    """

    name = "web_search"
    description = """Performs a google web search for query then returns top search results in markdown format."""
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform search.",
        },
    }
    output_type = "string"

    # ``forward`` takes *args/**kwargs so subclasses can declare extra inputs;
    # presumably this flag stops the base class from comparing the signature
    # against ``inputs`` -- confirm against the smolagents version in use.
    skip_forward_signature_validation = True

    def __init__(
        self,
        api_key: str | None = None,
        search_engine_id: str | None = None,
        num_results: int = 10,
        **kwargs,
    ):
        """Build the search client.

        Args:
            api_key: Google API key; when ``None`` the ``GOOGLE_SEARCH_API_KEY``
                environment variable is used.
            search_engine_id: Custom search engine id; when ``None`` the
                ``GOOGLE_SEARCH_ENGINE_ID`` environment variable is used.
            num_results: Value sent as the ``num`` request parameter.
            **kwargs: Forwarded to the base ``Tool`` constructor.

        Raises:
            ValueError: If the API key or the engine id resolves to an empty value.
        """
        api_key = api_key if api_key is not None else os.getenv("GOOGLE_SEARCH_API_KEY")
        if not api_key:
            raise ValueError(
                "Please set the GOOGLE_SEARCH_API_KEY environment variable."
            )
        search_engine_id = (
            search_engine_id
            if search_engine_id is not None
            else os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        )
        if not search_engine_id:
            raise ValueError(
                "Please set the GOOGLE_SEARCH_ENGINE_ID environment variable."
            )

        self.cse = build("customsearch", "v1", developerKey=api_key).cse()
        self.cx = search_engine_id
        self.num = num_results
        super().__init__(**kwargs)

    def _collect_params(self) -> dict:
        """Return extra request parameters; the plain web search adds none."""
        return {}

    def forward(self, query: str, *args, **kwargs) -> str:
        """Run the search and format the hits as markdown.

        Args:
            query: Search query text.
            *args, **kwargs: Passed to ``_collect_params`` so subclasses can
                turn their additional inputs into request parameters.

        Returns:
            One ``[title](link)`` line followed by the snippet per result,
            results separated by a blank line, or ``"No results found."``.
        """
        params = {
            "q": query,
            "cx": self.cx,
            "fields": "items(title,link,snippet)",
            "num": self.num,
        } | self._collect_params(*args, **kwargs)

        response = self.cse.list(**params).execute()
        items = response.get("items")
        if not items:  # key missing or empty list: nothing to format
            return "No results found."

        # Not every result is guaranteed to carry a title or snippet, so read
        # them with a default instead of failing the whole search on KeyError.
        return "\n\n".join(
            f"[{item.get('title', '')}]({item['link']})\n{item.get('snippet', '')}"
            for item in items
        )


class GoogleSiteSearchTool(GoogleSearchTool):
    """Google search restricted to one site, supplied through the ``site`` input."""

    name = "site_search"
    description = "Performs a google search within the website for query then returns top search results in markdown format."
    inputs = {
        "query": {"type": "string", "description": "The query to perform search."},
        "site": {
            "type": "string",
            "description": "The domain of the site on which to search.",
        },
    }

    def _collect_params(self, site: str) -> dict:
        """Return the request parameters that limit the search to ``site``."""
        site_restriction = {"siteSearch": site}
        # "i" presumably selects "include only this site" -- see the API reference.
        site_restriction["siteSearchFilter"] = "i"
        return site_restriction


class GoogleTimeRestrictedSearchTool(GoogleSearchTool):
    """Google search limited to a date range given by the ``start`` and ``end`` inputs."""

    name = "web_search_in_date_range"
    description = "Performs a Google search with a date range filter and returns top results formatted in markdown."
    inputs = {
        "query": {"type": "string", "description": "The query to perform search."},
        "start": {
            "type": "string",
            "description": "Specifies the earliest date for search results in `YYYYMMDD` format. Filters out content published before this date.",
        },
        "end": {
            "type": "string",
            "description": "Specifies the latest date for search results in YYYYMMDD format. Filters out content published after this date.",
        },
    }

    def _collect_params(self, start: str, end: str) -> dict:
        """Return the ``sort`` request parameter encoding the ``start``..``end`` range."""
        date_range = "date:r:{}:{}".format(start, end)
        return {"sort": date_range}

In [None]:
# Smoke-test the three Google tools with the same query, two results each.
web_search = GoogleSearchTool(num_results=2)
# Call the tool just built; `search_results` holds the text returned by an
# earlier cell and is not callable.
r = web_search("Mercedes Sosa")
print(r)

print("\n-----\n")

site_search = GoogleSiteSearchTool(num_results=2)
r = site_search("Mercedes Sosa", "wikipedia.org")
print(r)

print("\n-----\n")

time_restricted_search = GoogleTimeRestrictedSearchTool(num_results=2)
r = time_restricted_search("Mercedes Sosa", "20000101", "20091231")
print(r)


In [None]:
from smolagents import WikipediaSearchTool

# Same sample query against the Wikipedia tool, asking for the page summary.
# Note: this rebinds `search_tools` / `search_results` from the DuckDuckGo cell.
search_tools = WikipediaSearchTool(content_type="summary")
search_results = search_tools("Mercedes Sosa")
print(search_results)

## Web Page Visit Tool Assessment

In [None]:
from smolagents import VisitWebpageTool

# Pass a large output cap so the page text is not cut short.
visit_web_page = VisitWebpageTool(max_output_length=1_000_000)
page = visit_web_page("https://en.wikipedia.org/wiki/Mercedes_Sosa")
# Report the size first, then dump the retrieved text.
print(f"Length retrieved: {len(page)}", page, sep="\n")

Note: `max_output_length` was increased to retrieve the full page content.

The default tool uses `markdownify`. Let's check whether `docling` produces a better result.

## Document Retrieval Tool Implementation

### Simple Documents Retrieval

In [None]:
from docling.document_converter import DocumentConverter

source = "https://en.wikipedia.org/wiki/Mercedes_Sosa"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
# Export to markdown once and reuse the string for both the length report and
# the dump, instead of running the export a second time.
page = result.document.export_to_markdown()
print(f"Length retrieved: {len(page)}")
print(page)

`docling` produces cleaner results for Wikipedia pages. To use it with an agent, we can wrap it in a tool. At this point we are not going to worry about context length; we will handle it later if needed.

In [None]:
from smolagents import Tool
from docling.document_converter import DocumentConverter


class ContentRetrieverTool(Tool):
    """Tool that converts a webpage or document to markdown with docling."""

    name = "retrieve_content"
    description = "Retrieve the content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL or local path of the webpage or document to retrieve.",
        }
    }
    output_type = "string"

    def __init__(self, **kwargs):
        # One converter is created here and reused by every forward() call.
        self.document_converter = DocumentConverter()
        super().__init__(**kwargs)

    def forward(self, url: str) -> str:
        """Convert the resource at ``url`` and return it as markdown.

        Args:
            url: URL or local path of the webpage or document.

        Returns:
            The full converted content as a markdown string.
        """
        conversion = self.document_converter.convert(url)
        return conversion.document.export_to_markdown()

Tool verification:

In [None]:
# Instantiate the tool and check how much text it returns for the sample page.
retrieve_content = ContentRetrieverTool()
content = retrieve_content("https://en.wikipedia.org/wiki/Mercedes_Sosa")
print(f"Length retrieved: {len(content)}")


Agent with basic web navigation:

In [None]:
from smolagents import ToolCallingAgent, OpenAIServerModel

# First agent: Google search, Wikipedia search and whole-page retrieval.
model = OpenAIServerModel(model_id="gpt-4.1")
agent = ToolCallingAgent(
    model=model,
    tools=[
        GoogleSearchTool(),
        WikipediaSearchTool(),
        ContentRetrieverTool(),
    ],
    verbosity_level=2,
)

# Sample question used throughout this notebook to compare tool setups.
agent.run("""
How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
""")

The agent works but consumes too many tokens on page content, which hurts instruction following. We need to give it the ability to focus on relevant content.

In [None]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer, logging
from rank_bm25 import BM25Okapi
from devtools import pprint

document_converter = DocumentConverter()

tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
)
chunker = HybridChunker(tokenizer=tokenizer)

# Silence transformers warnings while converting and chunking, and restore the
# previous verbosity even if one of those steps raises.
transformers_logging_verbosity = logging.get_verbosity()
logging.set_verbosity_error()
try:
    document = document_converter.convert(
        "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    ).document
    chunks_iterator = chunker.chunk(dl_doc=document)
finally:
    logging.set_verbosity(transformers_logging_verbosity)

# BM25 works on token lists: keep each contextualized chunk as text for display
# and, in the same order, as tokens for scoring.
tokenize = tokenizer.get_tokenizer().tokenize
chunks = [chunker.contextualize(chunk=chunk) for chunk in chunks_iterator]
tokenized_document = [tokenize(chunk_text) for chunk_text in chunks]

bm25 = BM25Okapi(tokenized_document)
query = "studio album"
tokenized_query = tokenize(query)
relevant_chunks = bm25.get_top_n(tokenized_query, chunks)
print("\n\n".join(relevant_chunks))


In [None]:
# BM25 score of every chunk for the query, in chunk order.
bm25_scores = bm25.get_scores(tokenized_query)
pprint(bm25_scores.tolist())

In [None]:
import numpy as np

# Negating the scores makes argsort walk the chunks from highest to lowest score.
for i in np.argsort(-bm25_scores):
    print(f"Chunk {i} BM25 score: {bm25_scores[i]}")
    print(chunks[i])
    print("\n\n")

In [None]:
import numpy as np

beta = 1.0  # < 1.0 ⇒ smoother; > 1.0 ⇒ sharper
# --- soft-max (with optional temperature) ---
# Subtracting the maximum leaves the soft-max unchanged and keeps np.exp from overflowing.
shifted = beta * bm25_scores - np.max(beta * bm25_scores)
probs = np.exp(shifted)
probs /= probs.sum()  # soft-max probabilities, sum = 1
# --- keep the top-ranked chunks until their cumulative prob reaches the threshold ---
cum_idx = np.argsort(probs)[::-1]  # indices sorted by prob, desc
cum_probs = probs[cum_idx].cumsum()
cut = np.searchsorted(cum_probs, 0.8)  # first pos where cumulative ≥ 80 %
selected = cum_idx[: cut + 1].tolist()
relevant_chunks = [(chunks[i], bm25_scores[i]) for i in selected]
pprint(relevant_chunks)

From [docling](https://docling-project.github.io/docling/examples/hybrid_chunking/#setup) documentation:

> 👉 NOTE: As you see above, using the HybridChunker can sometimes lead to a warning from the transformers library, however this is a "false alarm" — for details check [here](https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model).

BM25 scores are not probabilistic in nature, so the softmax output is not meaningful, and it is hard to balance completeness of information against noise. Let's try a more advanced RAG implementation.

In [None]:
from pymilvus import MilvusClient, model
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HierarchicalChunker
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

client = MilvusClient("./data/milvus.db")
# model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_fn = model.DefaultEmbeddingFunction()

# Start from an empty collection on every run of this cell.
if client.has_collection(collection_name="foo"):
    client.drop_collection(collection_name="foo")
client.create_collection(
    collection_name="foo",
    # Take the vector size from the embedding function so the collection stays
    # in sync if the embedding model is swapped.
    dimension=embedding_fn.dim,  # model.get_sentence_embedding_dimension(),
    # metric_type="COSINE",
)

document_converter = DocumentConverter()
document = document_converter.convert(
    "https://en.wikipedia.org/wiki/Mercedes_Sosa"
).document
chunker = HierarchicalChunker()
chunks_iterator = chunker.chunk(dl_doc=document)

chunks = [chunker.contextualize(chunk) for chunk in chunks_iterator]
# vectors = model.encode(chunks, normalize_embeddings=True)
vectors = embedding_fn.encode_documents(chunks)
# One row per chunk; the position in the chunk list doubles as the primary key.
data = [
    {"id": i, "vector": vector, "text": text}
    for i, (vector, text) in enumerate(zip(vectors, chunks))
]
client.insert(collection_name="foo", data=data)

query_texts = ["List of studio albums by Mercedes Sosa"]
# query_vectors = model.encode(query_texts, normalize_embeddings=True)
query_vectors = embedding_fn.encode_queries(query_texts)
res = client.search(
    collection_name="foo",  # target collection
    data=query_vectors,  # query vectors
    limit=100,  # number of returned entities
    output_fields=["text"],  # specifies fields to be returned
)

# Only one query vector was sent, so its hits are the first entry of `res`.
for r in res[0]:
    print(f"ID: {r.id}, distance: {r.distance}")
    print(r.entity["text"])
    print("\n\n")

Experiments with Chroma and Milvus did not bring the expected results on the example page.

In [None]:
from docling.document_converter import DocumentConverter
from docling.chunking import HierarchicalChunker
from sentence_transformers import SentenceTransformer, util
import torch

document_converter = DocumentConverter()
model = SentenceTransformer("all-MiniLM-L6-v2")
chunker = HierarchicalChunker()

document = document_converter.convert(
    "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    # "https://www.baseball-reference.com/teams/NYY/1977.shtml"
).document

chunks = list(chunker.chunk(dl_doc=document))
chunks_text = [chunk.text for chunk in chunks]
chunks_with_context = [chunker.contextualize(chunk) for chunk in chunks]
# Whatever is left after removing a chunk's own text is the context that
# contextualize() added around it.
chunks_context = [
    contextualized.replace(own_text, "").strip()
    for contextualized, own_text in zip(chunks_with_context, chunks_text)
]

chunk_embeddings = model.encode(chunks_text, convert_to_tensor=True)
context_embeddings = model.encode(chunks_context, convert_to_tensor=True)
query_embedding = model.encode(["studio albums"], convert_to_tensor=True)

threshold = 0.2
# Indices collected over both embedding sets and every query row.
selected_indices = []
for embeddings in (context_embeddings, chunk_embeddings):
    # Each row of the similarity matrix holds one query's cosine scores.
    for cos_scores in util.pytorch_cos_sim(query_embedding, embeddings):
        probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
        # Take candidates from most to least probable until the accumulated
        # probability reaches the threshold.
        cumulative = 0.0
        for i in torch.argsort(probabilities, descending=True):
            cumulative += probabilities[i].item()
            selected_indices.append(i.item())
            if cumulative >= threshold:
                break

# Drop duplicates keeping first-seen order, then flip the order so the most
# relevant items come last for better focus.
selected_indices = list(reversed(dict.fromkeys(selected_indices)))
for idx in selected_indices:
    print(chunks_with_context[idx], "\n\n")

Now we are going to rewrite our tool:

In [None]:
from smolagents import Tool
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HierarchicalChunker
from sentence_transformers import SentenceTransformer, util
import torch


class ContentRetrieverTool(Tool):
    """Retrieve a webpage or document and return only the parts relevant to a query.

    The document is converted and chunked with docling. Each chunk's text and
    its surrounding context are embedded with a sentence-transformers model and
    compared to every comma-separated query term; the best-matching chunks are
    returned in markdown.
    """

    name = "retrieve_content"
    description = """Retrieve the content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL or local path of the webpage or document to retrieve.",
        },
        "query": {
            "type": "string",
            "description": "The subject on the page you are looking for. The shorter the more relevant content is returned.",
        },
    }
    output_type = "string"

    def __init__(
        self,
        model_name: str | None = None,
        threshold: float = 0.2,
        **kwargs,
    ):
        """Create the converter, embedding model and chunker used by ``forward``.

        Args:
            model_name: sentence-transformers model name; ``None`` selects
                ``"all-MiniLM-L6-v2"``.
            threshold: Cumulative softmax probability at which chunk selection
                stops for each query term and embedding set.
            **kwargs: Forwarded to the base ``Tool`` constructor.
        """
        self.threshold = threshold
        self._document_converter = DocumentConverter()
        self._model = SentenceTransformer(
            model_name if model_name is not None else "all-MiniLM-L6-v2"
        )
        self._chunker = HierarchicalChunker()

        super().__init__(**kwargs)

    def _top_indices(self, scores: torch.Tensor) -> list[int]:
        """Return the best-scoring indices up to the cumulative probability threshold.

        ``scores`` is turned into softmax probabilities; indices are taken from
        most to least probable until their sum reaches ``self.threshold``.
        """
        probabilities = torch.nn.functional.softmax(scores, dim=0)
        picked = []
        cumulative = 0.0
        for index in torch.argsort(probabilities, descending=True).tolist():
            picked.append(index)
            cumulative += probabilities[index].item()
            if cumulative >= self.threshold:
                break
        return picked

    def forward(self, url: str, query: str) -> str:
        """Return the chunks of the document at ``url`` that best match ``query``.

        Args:
            url: URL or local path of the webpage or document.
            query: One or more comma-separated search terms.

        Returns:
            The selected chunks (with their context) joined by blank lines and
            ordered so the most relevant come last; the whole document in
            markdown when ``query`` contains no terms; ``"No content found."``
            when the document yields no chunks.
        """
        document = self._document_converter.convert(url).document

        chunks = list(self._chunker.chunk(dl_doc=document))
        if not chunks:
            return "No content found."

        terms = [term.strip() for term in query.split(",") if term.strip()]
        if not terms:
            # With no terms there is nothing to rank against (an empty query
            # embedding cannot be compared with the chunk embeddings), so hand
            # back the unfiltered document instead of failing.
            return document.export_to_markdown()

        chunks_text = [chunk.text for chunk in chunks]
        chunks_with_context = [self._chunker.contextualize(chunk) for chunk in chunks]
        # Whatever is left after removing a chunk's own text is the context
        # that contextualize() added around it.
        chunks_context = [
            contextualized.replace(own_text, "").strip()
            for contextualized, own_text in zip(chunks_with_context, chunks_text)
        ]

        chunk_embeddings = self._model.encode(chunks_text, convert_to_tensor=True)
        context_embeddings = self._model.encode(chunks_context, convert_to_tensor=True)
        query_embedding = self._model.encode(terms, convert_to_tensor=True)

        # Aggregate indices across context and chunk matches and for all terms.
        selected_indices = []
        for embeddings in (context_embeddings, chunk_embeddings):
            # Each row of the similarity matrix holds one term's cosine scores.
            for cos_scores in util.pytorch_cos_sim(query_embedding, embeddings):
                selected_indices.extend(self._top_indices(cos_scores))

        # Drop duplicates keeping first-seen order, then flip the order so the
        # most relevant items come last for better focus.
        selected_indices = list(reversed(dict.fromkeys(selected_indices)))

        if not selected_indices:
            return "No content found."

        return "\n\n".join(chunks_with_context[idx] for idx in selected_indices)


# Rebind the notebook-level tool instance to the query-focused implementation.
retrieve_content = ContentRetrieverTool()

## Agent Solving Task with Wikipedia Search

Recreate agent with improved tool:

In [None]:
from smolagents import (
    ToolCallingAgent,
    OpenAIServerModel,
    # WikipediaSearchTool,
)

# Agent with the three Google search variants and the query-focused retriever;
# the Wikipedia tool is left out here (see the commented entry).
model = OpenAIServerModel(model_id="gpt-4.1")
agent = ToolCallingAgent(
    model=model,
    tools=[
        # WikipediaSearchTool(content_type="summary"),
        GoogleSearchTool(),
        GoogleSiteSearchTool(),
        GoogleTimeRestrictedSearchTool(),
        ContentRetrieverTool(),
    ],
    planning_interval=3,
    max_steps=10,
    verbosity_level=2,
)

In [None]:
# Same Mercedes Sosa question as before, now answered by the recreated agent.
agent.run("""
How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
""")

## Agent Solving Web Search Task

In [None]:
# Web-search task: baseball statistics lookup.
agent.run("""\
How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?
""")

In [None]:
# Web-search task: 1928 Summer Olympics athlete counts.
agent.run("""\
What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.
""")

In [None]:
# Web-search task: neighbouring jersey numbers on a team roster.
agent.run("""\
Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023?
Give them to me in the form Pitcher Before, Pitcher
After, use their last names only, in Roman characters
""")

## Agent Solving Papers Search Task

In [None]:
# Papers-search task: follow an article to its linked paper and read the funding note.
agent.run("""\
On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?
""")