In [None]:
from ollama import chat, ChatResponse
import json
import re
import time
import urllib.request
import datetime
from IPython.display import display, Markdown
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

# --- Prompt Baselines ---
# First-pass prompt: asks the model for a broad answer on {research_topic}
# and to point out gaps that later iterations can follow up on.
# Filled in with str.format(research_topic=...).
RESPONSE_PROMPT = (
    "You are an expert on the topic: {research_topic}. "
    "Provide an extensive, detailed, and comprehensive answer to the research question. "
    "In your answer, highlight any areas or gaps that might require further exploration."
)

# Gap-finding prompt: shows the model the current text and asks for ONE
# follow-up query as a JSON object.
# Filled in with str.format(research_topic=..., assay=...).
#
# The braces of the JSON example are doubled ("{{" / "}}") because this
# template goes through str.format(): a single "{" would be read as the start
# of a replacement field and raise KeyError. After formatting they render as
# ordinary single braces.
FIND_GAP_PROMPT = (
    "You are a group of 3 experts on the topic: {research_topic}. "
    "Think step by step about the following assay:\n"
    "<ASSAY>\n{assay}\n</ASSAY>\n\n"
    "Identify one new topic to explore that will help fill a knowledge gap in the assay. "
    "Based on the gaps you identified, generate a JSON object with exactly the following keys:\n"
    '   - "query": The search query string.\n'
    '   - "web-query": The web search query string.\n'
    '   - "aspect": The aspect of the topic being addressed by this query.\n'
    '   - "rationale": A brief explanation of why this query will help fill the gap.\n\n'
    "Please output only the JSON object, with no additional text. For example:\n\n"
    "```json\n"
    "{{\n"
    '  "query": "example search query",\n'
    '  "web-query": "example web search query",\n'
    '  "aspect": "example aspect",\n'
    '  "rationale": "example rationale"\n'
    "}}\n"
    "```"
)

# Merge prompt: asks the model to fold web-search findings into the current
# text and cite only the sources found by the search.
# Filled in with str.format(research_topic=..., assay=..., web_search=...).
COMBINE_PROMPT = (
    # Trailing space keeps this sentence from running into the next one.
    "You are a group of 3 experts on the topic: {research_topic}. "
    "You have to combine together the information from the written assay in <ASSAY></ASSAY> tags, "
    "with the information gathered from the results of a web search on peer-reviewed literature within the <WEB></WEB> tags. "
    "When combining them together you are allowed to use only the sources identified through the web search. "
    "Cite them in the text where appropriate and report them at the bottom. "
    "<ASSAY>\n{assay}\n</ASSAY> "
    "<WEB>\n{web_search}\n</WEB>"
)

# Final-report prompt: asks the model to turn the accumulated notes into a
# structured markdown report.
# Filled in with str.format(research_topic=..., notes=...).
FINALIZE_RESPONSE_PROMPT = (
    "You are a team of experts on the topic: {research_topic}. "
    "Your goal is to analyze the text provided in the <TEXT></TEXT> tags "
    "and create an extensive, detailed, and comprehensive report "
    "using the information provided. "
    "Aim to 500 words per section. "
    "Your thesis is formatted in markdown and includes:\n"
    "1. Title\n2. Introduction\n3. Discussion\n4. Gaps / Further research\n"
    "<TEXT>\n{notes}\n</TEXT>"
)

# --- Helper: Exponential Backoff in DuckDuckGo Search ---
def _fetch_page_text(url: str, timeout: float) -> str:
    """Download *url* and return its visible text with HTML markup stripped."""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if reading fails; the
    # timeout stops one unresponsive host from hanging the whole search.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        html = response.read().decode('utf-8', errors='replace')
    return BeautifulSoup(html, 'html.parser').get_text()


def duckduckgo_search(query: str, max_results: int = 5, fetch_full_page: bool = False, retries: int = 5, backoff: int = 3, timeout: float = 10.0) -> dict:
    """
    Perform a DuckDuckGo search for the given query with exponential backoff in case of rate limiting.

    Args:
        query (str): The search query.
        max_results (int): Number of results to return.
        fetch_full_page (bool): If True, attempt to retrieve the full page content.
        retries (int): Maximum number of search attempts.
        backoff (int): Base of the exponential delay; attempt k waits backoff ** k seconds.
        timeout (float): Per-page timeout in seconds when fetching full pages.

    Returns:
        dict: A dictionary with a "results" key containing a list of result dicts.
        Each result has "title", "url", "content" (the search snippet) and
        "raw_content" (full page text when it could be fetched, otherwise the snippet).

    Raises:
        Exception: If maximum retries are exceeded due to rate limiting.
            Any search error that is not a rate limit is re-raised unchanged.
    """
    last_error = None
    for attempt in range(retries):
        # Only the search call itself is retried; page downloads below have
        # their own best-effort handling.
        try:
            with DDGS() as ddgs:
                search_results = list(ddgs.text(query, max_results=max_results))
        except Exception as exc:
            if "Ratelimit" not in str(exc):
                raise
            last_error = exc
            if attempt + 1 >= retries:
                # No attempt left: do not sleep (or announce a retry) for nothing.
                break
            wait_time = backoff ** (attempt + 1)
            print(f"Rate limit encountered. Retrying in {wait_time} seconds... (Attempt {attempt+1} of {retries})")
            time.sleep(wait_time)
            continue

        results = []
        for r in search_results:
            url = r.get("href")
            title = r.get("title")
            content = r.get("body")
            # Skip incomplete hits: all three fields are needed downstream.
            if not all([url, title, content]):
                continue
            raw_content = content
            if fetch_full_page:
                try:
                    raw_content = _fetch_page_text(url, timeout)
                except Exception as exc:
                    # Best effort: fall back to the snippet, but say so.
                    print(f"Could not fetch full page for {url} ({exc}); using search snippet.")
                    raw_content = content
            results.append({
                "title": title,
                "url": url,
                "content": content,
                "raw_content": raw_content
            })
        return {"results": results}
    raise Exception("Exceeded maximum retries due to rate limiting.") from last_error

def enhance_query_for_scientific_literature(query: str) -> str:
    """
    Bias a search query towards scholarly sources.

    The original query is kept as-is and followed by a fixed set of
    site restrictions and academic keywords, separated by single spaces.

    Args:
        query (str): The original query.

    Returns:
        str: The query followed by the academic filters.
    """
    academic_suffix = (
        "site:pubmed.ncbi.nlm.nih.gov OR site:sciencedirect.com OR site:doi.org",
        "peer-reviewed journal",
    )
    return " ".join((query, *academic_suffix))

def search_scientific_literature(query: str, num_results: int = 5, fetch_full_page: bool = False) -> dict:
    """
    Run a literature-focused DuckDuckGo search.

    The query is first passed through enhance_query_for_scientific_literature,
    the enhanced form is printed, and the search is delegated to
    duckduckgo_search.

    Args:
        query (str): The original query.
        num_results (int): Number of results to return.
        fetch_full_page (bool): If True, fetch full page content.

    Returns:
        dict: Search results as returned by duckduckgo_search.
    """
    scholarly_query = enhance_query_for_scientific_literature(query)
    print("Enhanced Query:", scholarly_query)
    search_options = {"max_results": num_results, "fetch_full_page": fetch_full_page}
    return duckduckgo_search(scholarly_query, **search_options)

# --- Helper: Remove <THINK> Tags ---
def remove_think_tags(text: str) -> str:
    """
    Strip every <THINK>...</THINK> section from *text*.

    Matching ignores case, tolerates whitespace inside the angle brackets and
    spans newlines. Leading and trailing whitespace of the result is trimmed.
    """
    think_section = re.compile(
        r"<\s*THINK\s*>.*?<\s*/\s*THINK\s*>",
        re.DOTALL | re.IGNORECASE,
    )
    cleaned = think_section.sub("", text)
    return cleaned.strip()

# --- Configuration & State Management ---
class Configuration:
    """Plain container for the research pipeline's settings.

    All values are stored unchanged as attributes of the same name.
    """

    def __init__(
        self,
        ollama_base_url: str,
        local_llm: str,
        fetch_full_page: bool,
        max_research_loops: int,
        max_fetch_pages: int,
        max_token_per_search: int,
    ):
        # LLM backend: server URL and model name.
        self.ollama_base_url: str = ollama_base_url
        self.local_llm: str = local_llm
        # Web search: whether to download full pages, and how many.
        self.fetch_full_page: bool = fetch_full_page
        self.max_fetch_pages: int = max_fetch_pages
        # Pipeline limits: number of iterations and token budget per search.
        self.max_research_loops: int = max_research_loops
        self.max_token_per_search: int = max_token_per_search

def initialize_state(research_topic: str) -> dict:
    """
    Build the starting state dict for a research run on *research_topic*.

    The topic doubles as the first search query; the two text slots start empty.
    """
    state = {"research_topic": research_topic}
    state["initial_response"] = ""  # The original extensive answer.
    state["assay"] = ""  # The current working text.
    state["search_query"] = research_topic
    return state

def query_local_llm(state: dict, config: Configuration, prompt: str = "") -> str:
    """
    Send *prompt* to the local LLM as a single user message and return the reply.

    Any <THINK>...</THINK> sections are stripped from the reply.

    Side effect: on every call the cleaned reply is written to BOTH
    state["initial_response"] and state["assay"], replacing whatever was
    stored there. Callers that need to keep the previous values must save
    and restore them.

    Args:
        state (dict): Research state; its "initial_response" and "assay" keys are overwritten.
        config (Configuration): Only config.local_llm (the model name) is read here.
        prompt (str): The full prompt text to send.

    Returns:
        str: The model's reply with think-tags removed ("" if the reply had no content).
    """
    message = {"role": "user", "content": prompt}
    # NOTE(review): config.ollama_base_url is not used; the module-level
    # chat() call goes to the ollama library's default host - confirm intended.
    response: ChatResponse = chat(model=config.local_llm, messages=[message])
    # The message content may be None (e.g. an empty reply); treat it as
    # empty text instead of crashing on None.strip().
    raw_reply = response.message.content or ""
    output = remove_think_tags(raw_reply.strip())
    state["initial_response"] = output
    state["assay"] = output
    return output

def extract_json_from_llm_output(text: str):
    """
    Attempt to extract a JSON object from the provided text.

    Candidates are tried in this order and the first one that parses wins:
      1. JSON enclosed in triple backticks with the tag "json".
      2. JSON enclosed in triple backticks without the tag (whitespace or
         newlines between the backticks and the braces are allowed).
      3. The whole text, if it is a plain JSON object string.
      4. The span from the first "{" to the last "}" - covers replies that
         wrap the object in explanatory prose.

    Returns:
        Parsed JSON object.

    Raises:
        ValueError if no valid JSON structure is found.
    """
    fence_patterns = [
        r"```json\s*(\{.*?\})\s*```",  # with "json" tag
        r"```\s*(\{.*?\})\s*```",      # without tag
    ]
    candidates = []
    for pattern in fence_patterns:
        candidates.extend(re.findall(pattern, text, flags=re.DOTALL | re.IGNORECASE))

    stripped = text.strip()
    if stripped.startswith("{") and stripped.endswith("}"):
        candidates.append(stripped)

    # Last resort: outermost brace pair anywhere in the text.
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    if not candidates:
        print("<ERROR>:\n%s\n</ERROR>" % text)
        raise ValueError("No JSON structure found in the provided text.")

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    print("<ERROR>:\n%s\n</ERROR>" % text)
    raise ValueError("Found JSON-like structure, but could not parse it.")

# --- Main Research Pipeline ---
# Rough characters-per-token ratio used to turn the configured token budget
# into a character budget. This is a heuristic, not an exact tokenizer count.
_APPROX_CHARS_PER_TOKEN = 4


def _query_preserving_state(state: dict, config, prompt: str) -> str:
    """Call query_local_llm, then restore the state's text fields it overwrites."""
    saved = {key: state.get(key, "") for key in ("initial_response", "assay")}
    try:
        return query_local_llm(state, config, prompt)
    finally:
        state.update(saved)


def _format_web_results(web_search: dict, max_chars: int) -> str:
    """Render search results as numbered title/URL/text sections within a size budget."""
    results = web_search.get("results", [])
    if not results:
        return "No web results were found."
    # Split the budget evenly so one long page cannot crowd out the others.
    per_result = max(max_chars // len(results), 1)
    sections = []
    for index, item in enumerate(results, start=1):
        # Collapse whitespace: scraped page text is mostly blank lines.
        text = " ".join((item.get("raw_content") or item.get("content") or "").split())
        if len(text) > per_result:
            text = text[:per_result] + " ..."
        sections.append(f"[{index}] {item.get('title')}\nURL: {item.get('url')}\n{text}")
    return "\n\n".join(sections)


def main():
    """
    Run the interactive research pipeline.

    Steps: read a research question, draft an initial text with the local LLM,
    then for config.max_research_loops iterations find a knowledge gap, search
    the literature for it and merge the findings into the text; finally ask
    the LLM for a structured markdown report and display it.

    An iteration whose follow-up question cannot be parsed, or whose web
    search fails, is skipped instead of aborting the run.
    """
    config = Configuration(
        ollama_base_url="http://localhost:11434",  # Your Ollama URL
        local_llm="llama3.2",                      # Default LLM is "llama3.2"
        fetch_full_page=True,                      # Fetch full page content if needed
        max_research_loops=3,                      # Number of research iterations
        max_fetch_pages=5,                         # Number of pages to fetch per search
        max_token_per_search=4000                  # Token limit per search processing
    )

    # Step 1: Get the research question from the user.
    research_topic = input("Enter your research question: ")
    print("#### Research Question ####\n")
    print(research_topic)
    print("\n")
    state = initialize_state(research_topic)

    # Step 2: Generate an initial explanation (assay) using the local LLM.
    print("  >> Create initial assay...\n")
    prompt_initial = RESPONSE_PROMPT.format(research_topic=state["research_topic"])
    initial_explanation = query_local_llm(state, config, prompt_initial)
    # Set explicitly so the pipeline does not depend on the helper's side effect.
    state["initial_response"] = initial_explanation
    state["assay"] = initial_explanation
    print(initial_explanation)
    print("\n")

    print("#### Thinking Process ####\n")
    for i in range(config.max_research_loops):
        print(f">> Iteration {i+1}\n")
        print("  >> Find gap... ")
        # Step 3: Evaluate the current assay and generate a follow-up question.
        # The state-preserving wrapper keeps the JSON reply from replacing
        # the assay that the combine step below still needs.
        prompt_gap = FIND_GAP_PROMPT.format(research_topic=state["research_topic"], assay=state["assay"])
        followup_question_llm = _query_preserving_state(state, config, prompt_gap)
        try:
            followup_question_json = extract_json_from_llm_output(followup_question_llm)
            followup_query = followup_question_json["query"]
        except (ValueError, KeyError) as exc:
            print(f"     > Could not parse a follow-up question ({exc}); skipping this iteration.")
            continue
        # Fall back to the plain query if the model left out the web variant.
        user_query = followup_question_json.get("web-query") or followup_query
        print(f"     > Follow-up Question: {followup_query}")
        print(f"     > Web-search query: {user_query}")

        # Step 4: Gather literature sources.
        print("  >> Query web-literature... ")
        try:
            web_search = search_scientific_literature(
                user_query,
                num_results=config.max_fetch_pages,
                fetch_full_page=config.fetch_full_page,
            )
        except Exception as exc:
            # Top-level boundary: a failed search costs one iteration, not the run.
            print(f"     > Web search failed ({exc}); skipping this iteration.")
            continue
        web_text = _format_web_results(
            web_search, config.max_token_per_search * _APPROX_CHARS_PER_TOKEN
        )

        # Step 5: Combine literature with the assay.
        print("  >> Combine literature to assay...\n")
        prompt_follow = COMBINE_PROMPT.format(research_topic=followup_query, assay=state["assay"], web_search=web_text)
        follow_explanation = _query_preserving_state(state, config, prompt_follow)
        print(follow_explanation)
        # Update the assay by appending the new follow explanation (once).
        state["assay"] = state["assay"] + "\n\n" + follow_explanation
        print("  ########## ")

    # Step 6: Finalize the assay by integrating all gathered information.
    prompt_finalize = FINALIZE_RESPONSE_PROMPT.format(research_topic=state["research_topic"], notes=state["assay"])
    finalize_text_llm = _query_preserving_state(state, config, prompt_finalize)

    print("#### Final Assay ####\n")
    display(Markdown(finalize_text_llm))

# Start the interactive pipeline only when this module is run directly,
# not when it is imported.
if __name__ == "__main__":
    main()


#### Research Question ####

How to evaluate scientific research


  >> Create initial assay...

Evaluating Scientific Research: A Comprehensive Guide

Scientific research is a critical component of advancing knowledge in various fields, but evaluating its quality and validity can be a daunting task for many individuals. This comprehensive guide aims to provide an in-depth understanding of how to evaluate scientific research, highlighting areas that may require further exploration.

**I. Understanding the Research Question**

Before evaluating scientific research, it's essential to understand the research question or hypothesis being investigated. A clear and concise research question should be well-defined, specific, and measurable. It should also be relevant to the field of study and aligned with existing knowledge.

**II. Literature Review**

A comprehensive literature review is a critical component of any scientific study. It provides an overview of existing knowledge on the topic,

# Optimization of Catalysts for Hydrogen Production in Artificial Intelligence Assisted Nanotechnology Systems

## Introduction

The production of hydrogen using nanotechnology systems has gained significant attention in recent years due to the increasing demand for clean and sustainable energy sources. The optimization of catalysts is crucial for improving the efficiency and effectiveness of hydrogen production in these systems. This report aims to evaluate the current state of research on the use of artificial intelligence (AI) in optimizing catalysts for hydrogen production in nanotechnology systems.

## Discussion

The use of AI in optimizing catalysts for hydrogen production in nanotechnology systems has significant potential for improving efficiency and effectiveness in chemical reactions. According to Fayyazi et al. (2023), AI-driven optimization systems can analyze data from various sensors in the vehicle, such as fuel cell temperature, pressure, and power output, to optimize the performance of hydrogen fuel cell stacks in real-time (65).

The optimization of heat, hydrogen, and raised temperature in the electrolytic systems with feed factors contributes to improved power system safety and efficiency. Unlike batteries, which are mostly made of raw materials, the most expensive element of a fuel cell is making the fuel cell stack itself—not the ingredients required to build it.

AI-driven tools can also be used to accelerate the screening of chemical space and increase the efficiency of catalyst development. Scientists can now combine AI and robotics to rationalize the screening of chemical space and reduce the number of potential combinations that need to be studied (1).

In addition, the use of AI in optimizing renewable hydrogen systems is crucial for achieving ambitious targets set by countries such as the European Union, China, and the United States. The rising level of CO2 has driven a profound transformation of the global energy landscape, and many countries have set targets to become carbon neutral by 2050 or earlier.

Furthermore, solar PV hydrogen systems rely on photovoltaic modeling to assess the energy available for hydrogen production. However, measured data is not always available and needs to be sampled over a sufficiently long period to be representative of the particular location (6).

The use of AI in optimizing catalysts for hydrogen production in nanotechnology systems has significant potential for improving efficiency and effectiveness in chemical reactions. The integration of AI with other technologies such as robotics and photovoltaic modeling can further enhance the efficiency of hydrogen production.

## Gaps / Further Research

Despite the significant potential of AI in optimizing catalysts for hydrogen production, there are still several gaps that need to be addressed through further research. Some of these gaps include:

*   The lack of standardization in the optimization of catalysts for hydrogen production, which can make it difficult to compare results and develop best practices.
*   The limited availability of data on the performance of different catalysts under various conditions, which can limit the ability of AI systems to optimize catalyst design.
*   The need for further research on the use of AI in optimizing renewable hydrogen systems, particularly in terms of integrating AI with other technologies such as photovoltaic modeling and robotics.

To address these gaps, further research is needed to develop more sophisticated AI systems that can optimize catalyst design and improve the efficiency and effectiveness of hydrogen production. This may involve the integration of multiple data sources and the development of more advanced algorithms for optimizing catalyst performance.

## Conclusion

In conclusion, the use of AI in optimizing catalysts for hydrogen production in nanotechnology systems has significant potential for improving efficiency and effectiveness in chemical reactions. However, further research is needed to address the gaps in current knowledge and develop more sophisticated AI systems that can optimize catalyst design and improve the efficiency and effectiveness of hydrogen production.

By addressing these gaps, researchers can develop more effective solutions for optimizing catalysts for hydrogen production, which can help to reduce greenhouse gas emissions and promote the adoption of clean energy sources.