# Curate Web Datasets with Parallel and Store Them in Weaviate

In [None]:
import os
from parallel import Parallel

# Create the Parallel API client. os.environ[...] raises KeyError right away
# if PARALLEL_API_KEY is not set, so a missing key surfaces in this cell.
client = Parallel(api_key=os.environ["PARALLEL_API_KEY"])

# Research question for the task; a later cell stores it alongside the report.
query = "What is the most recent academic research advancements in web search for LLMs?"

# Submit the question as a task run on the "ultra" processor and show the run ID.
task_run = client.task_run.create(
    input=query,
    processor="ultra"
)
print(f"Task Created. Run ID: {task_run.run_id}")

# Retrieve the result of that run and print its output.
# NOTE(review): api_timeout=3600 is presumably a wait limit in seconds (one hour) — confirm against the SDK.
run_result = client.task_run.result(task_run.run_id, api_timeout=3600)
print(run_result.output)

Task Created. Run ID: trun_813763841ce54ca889b9a51e0964712e


In [8]:
# Show which attributes the task output object carries.
print(vars(run_result.output).keys())

dict_keys(['basis', 'content', 'type', 'beta_fields', 'output_schema'])


In [17]:
# List the keys of the output's content mapping.
print(run_result.output.content.keys())

dict_keys(['report_summary', 'clarification_of_scope', 'timeline_of_milestones', 'rag_advancements_analysis', 'specific_rag_techniques', 'agentic_llm_evolution', 'representative_agentic_systems', 'llm_powered_search_engine_integration', 'major_industry_implementations', 'influential_architectural_and_model_trends', 'evaluation_benchmarks_and_metrics', 'security_and_robustness_challenges', 'mitigation_strategies_for_risks', 'market_and_publisher_ecosystem_impact', 'user_experience_and_cognitive_impacts', 'open_problems_and_future_research_directions'])


In [15]:
# Render each entry of the content mapping as "<key>:\n<value>\n" (the value is
# stringified by the f-string) and join the pieces with newlines into one string.
joined_keys_and_content = "\n".join(
    f"{section}:\n{body}\n"
    for section, body in run_result.output.content.items()
)
print(joined_keys_and_content)

report_summary:
Academic research in web search for Large Language Models (LLMs) from late 2023 to late 2025 has been characterized by rapid evolution and the convergence of three core areas: Retrieval-Augmented Generation (RAG), LLM-powered search engines, and autonomous LLM agents. This period saw a paradigm shift from static information retrieval to dynamic, agentic reasoning. RAG architectures evolved from basic retrieval to advanced, self-correcting, and adaptive systems like Self-RAG and CRAG, which learn to critique and refine retrieved information. The introduction of GraphRAG and multimodal RAG (e.g., OmniSearch) further expanded capabilities to handle structured and non-textual data. Concurrently, 2025 was widely recognized as the 'year of agents,' marked by a surge in research using Reinforcement Learning (RL) to train LLMs to autonomously plan, use tools (including web search), and synthesize information (e.g., R1-Searcher, Search-R1). Major industry players like Google (AI

In [16]:
import tiktoken

# Count the tokens in the joined report text (joined_keys_and_content), using the
# encoding that tiktoken associates with the "gpt-3.5-turbo" model name.
encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
num_tokens = len(encoder.encode(joined_keys_and_content))
# The label below says run_result.output, but the count is for joined_keys_and_content.
print(f"Number of tokens in run_result.output: {num_tokens}")


Number of tokens in run_result.output: 11776


In [20]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType

# Connect to the Weaviate Cloud cluster. Both settings are read with
# os.environ[...] (the same way the Parallel key is read), so an unset variable
# raises KeyError in this cell instead of passing None into the client.
# The connection stays open for the following cells; call
# weaviate_client.close() once you are finished with the notebook.
weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_URL"],
    auth_credentials=Auth.api_key(os.environ["WEAVIATE_API_KEY"]),
)

            Please make sure to close the connection using `client.close()`.
  weaviate_client = weaviate.connect_to_weaviate_cloud(


In [21]:
# Create the collection on the first run and reuse it on later runs. Calling
# create() again for a collection name that already exists fails, which would
# break a Restart & Run All of this notebook.
COLLECTION_NAME = "WebResearchReports"

if weaviate_client.collections.exists(COLLECTION_NAME):
    # Collection is already present in the cluster: just obtain a handle to it.
    web_research_reports = weaviate_client.collections.get(COLLECTION_NAME)
else:
    web_research_reports = weaviate_client.collections.create(
        name=COLLECTION_NAME,
        description="Web research reports on LLMs",
        vector_config=Configure.Vectors.text2vec_weaviate(),
        properties=[
            Property(name="content", data_type=DataType.TEXT),
            Property(name="query", data_type=DataType.TEXT),
        ],
    )

In [24]:
# Store the report as one object: the original question plus the joined text.
# The call is the cell's last expression, so its return value is displayed.
web_research_reports.data.insert(
    properties=dict(query=query, content=joined_keys_and_content)
)

UUID('d55bb4f7-990d-45eb-88f5-316cac32ce6a')

# Chat with Query Agent

In [26]:
from weaviate.agents.query import QueryAgent

# Build a Query Agent over two collections.
# NOTE(review): "IRPapersText_Default" is not created in any of the cells shown
# here — presumably it already exists in the cluster; confirm before running.
query_agent = QueryAgent(
    client=weaviate_client,
    collections=["WebResearchReports", "IRPapersText_Default"]
)

# Ask a natural-language question through the agent.
response = query_agent.ask(
    "What is the overlap of recent advances in Information Retrieval applied to Web Search?"
)

# Print the final answer carried on the agent's response.
print(response.final_answer)

Recent advances in Information Retrieval (IR) applied to Web Search, especially in the context of Large Language Models (LLMs) from late 2023 to 2025, reveal a transformative shift from traditional static retrieval toward dynamic, agentic reasoning and integration of generative AI techniques. The overlap of cutting-edge IR and Web Search is characterized by several key converging areas:

1. **Retrieval-Augmented Generation (RAG) Evolution:**  
   RAG systems combine LLMs with external knowledge sources, particularly web content, to ground responses in real-time, factual data. There has been a progression from basic retrieval models to modular, self-correcting, and agentic RAG systems. Key innovations include:  
   - **Self-RAG:** LLMs learn to trigger on-demand retrieval and critique their own generations to improve factuality and reduce hallucinations.  
   - **Corrective RAG (CRAG):** Incorporates retrieval quality evaluators that prompt corrective web searches when necessary.  
   -