In [None]:
from ollama import chat, ChatResponse
import json
import re
from IPython.display import display, Markdown
import urllib.request
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

# --- Prompt Baselines ---
# Prompt templates, filled in with str.format().
# Adjacent string literals are concatenated verbatim by Python, so every
# fragment must end with an explicit space or newline; otherwise sentences and
# tags run together in the text sent to the model.
RESPONSE_PROMPT = (
    "You are an expert on the topic: {research_topic}. Provide an extensive, detailed, and comprehensive answer "
    "to the research question. In your answer, highlight any areas or gaps that might require further exploration."
)

# Placeholders: {research_topic}, {assay}.
FIND_GAP_PROMPT = (
    "You are a group of 3 experts on the topic: {research_topic}. "
    "Think step by step on the following assay: "
    "<ASSAY>\n"
    "{assay}\n"
    "</ASSAY> "
    "Provide one new topic to explore to fill a knowledge gap in the assay. "
    "Based on the gaps identified in your answer, generate a JSON object with the following keys:\n"
    '   - "query": "The search query string."\n'
    '   - "web-query": "The web search query string."\n'
    '   - "aspect": "The aspect of the topic being addressed by this query."\n'
    '   - "rationale": "Why this query will help fill the gap."\n'
    "Provide only the JSON structure."
)

# Placeholders: {research_topic}, {assay}, {web_search}.
COMBINE_PROMPT = (
    "You are a group of 3 experts on the topic: {research_topic}. "
    "You have to combine together the information from the written assay in <ASSAY></ASSAY> tags, "
    "with the information gathered from the results of a web search on peer-reviewed literature within the <WEB></WEB> tags. "
    "When combining them together you are allowed to use only the sources identified through the web search. "
    "Cite them in the text where appropriate and report them at the bottom. "
    "<ASSAY>\n"
    "{assay}\n"
    "</ASSAY> "
    "<WEB>\n"
    "{web_search}\n"
    "</WEB>"
)

# Placeholders: {research_topic}, {notes}.
FINALIZE_RESPONSE_PROMPT = (
    "You are a team of expert on the topic: {research_topic}. Your goal is to analyze the text provided in the <TEXT></TEXT> tags "
    "and create an extensive, detailed, and comprehensive report using the information provided. Aim to 500 words per section. "
    "Your thesis is formatted in markdown and have:\n"
    "1. Title\n"
    "2. Introduction\n"
    "3. Discussion\n"
    "4. Gaps / Further research\n"
    "<TEXT>\n"
    "{notes}\n"
    "</TEXT>"
)

def enhance_query_for_scientific_literature(query: str) -> str:
    """
    Bias a search query towards scholarly sources.

    A fixed suffix of site filters and academic keywords is appended to the
    query, separated by a single space.

    Args:
        query (str): The original query.

    Returns:
        str: The query followed by the scholarly suffix.
    """
    scholarly_suffix = "site:pubmed.ncbi.nlm.nih.gov OR site:sciencedirect.com OR site:doi.org peer-reviewed journal"
    return "{} {}".format(query, scholarly_suffix)

def duckduckgo_search(query: str, max_results: int = 5, fetch_full_page: bool = False,
                      timeout: float = 10.0) -> dict:
    """
    Perform a DuckDuckGo text search and optionally download each result page.

    Args:
        query (str): The search query.
        max_results (int): Number of results to request from the search.
        fetch_full_page (bool): If True, attempt to retrieve the full page text
            for every result; on any failure the search snippet is used instead.
        timeout (float): Seconds to wait for each page download before giving
            up on that page.

    Returns:
        dict: A dictionary with a "results" key containing a list of dicts with
        the keys "title", "url", "content" (search snippet) and "raw_content"
        (page text, or the snippet when the page was not fetched).
    """
    with DDGS() as ddgs:
        search_results = list(ddgs.text(query, max_results=max_results))

    results = []
    for hit in search_results:
        url = hit.get("href")
        title = hit.get("title")
        content = hit.get("body")
        # Skip incomplete hits: all three fields are needed downstream.
        if not all([url, title, content]):
            continue

        raw_content = content
        if fetch_full_page:
            try:
                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
                # The timeout keeps one unresponsive host from stalling the
                # whole run; the context manager closes the connection.
                with urllib.request.urlopen(req, timeout=timeout) as response:
                    html = response.read().decode('utf-8', errors='replace')
                raw_content = BeautifulSoup(html, 'html.parser').get_text()
            except Exception:
                # Deliberate best-effort: any download or parsing failure
                # falls back to the snippet content.
                raw_content = content

        results.append({
            "title": title,
            "url": url,
            "content": content,
            "raw_content": raw_content
        })
    return {"results": results}

def search_scientific_literature(query: str, num_results: int = 5, fetch_full_page: bool = False) -> dict:
    """
    Search DuckDuckGo with a query rewritten for scientific literature.

    The query is first passed through enhance_query_for_scientific_literature,
    the rewritten query is printed, and the search is delegated to
    duckduckgo_search.

    Args:
        query (str): The original query.
        num_results (int): Number of results to return.
        fetch_full_page (bool): If True, fetch full page content.

    Returns:
        dict: Search results as returned by duckduckgo_search.
    """
    literature_query = enhance_query_for_scientific_literature(query)
    print("Enhanced Query:", literature_query)
    search_options = {"max_results": num_results, "fetch_full_page": fetch_full_page}
    return duckduckgo_search(literature_query, **search_options)

# --- Helper: Remove <THINK> Tags ---
def remove_think_tags(text: str) -> str:
    """
    Strip every <THINK>...</THINK> section from the text.

    Matching ignores case, tolerates whitespace inside the tag brackets and
    spans newlines. The remaining text is returned with leading and trailing
    whitespace removed.
    """
    think_section = re.compile(r"<\s*THINK\s*>.*?<\s*/\s*THINK\s*>", re.DOTALL | re.IGNORECASE)
    without_think = think_section.sub("", text)
    return without_think.strip()

# --- Configuration & State Management ---
class Configuration:
    """Plain container for the pipeline settings passed to its constructor."""

    def __init__(self, ollama_base_url: str, local_llm: str, fetch_full_page: bool,
                 max_research_loops: int, max_fetch_pages: int, max_token_per_search: int):
        # Model settings.
        self.ollama_base_url, self.local_llm = ollama_base_url, local_llm
        # Search / loop settings.
        self.fetch_full_page, self.max_research_loops = fetch_full_page, max_research_loops
        self.max_fetch_pages, self.max_token_per_search = max_fetch_pages, max_token_per_search

def query_local_llm(state: dict, config: Configuration, prompt="") -> str:
    """
    Send a single-turn prompt to the local LLM and return its cleaned reply.

    The reply is stripped of surrounding whitespace and of any
    <THINK>...</THINK> sections.

    Only the first reply recorded for a given state seeds
    state["initial_response"] and state["assay"]. Later calls (gap finding,
    combining, finalizing) leave the state untouched so that they cannot
    overwrite the assay the caller is accumulating.

    Args:
        state (dict): Research state; updated in place on the first reply only.
        config (Configuration): Supplies the model name via config.local_llm.
        prompt (str): The user message sent to the model.

    Returns:
        str: The cleaned model reply.
    """
    message = {"role": "user", "content": prompt}
    response: ChatResponse = chat(model=config.local_llm, messages=[message])
    output = remove_think_tags(response.message.content.strip())
    # Seed the state once; an empty "initial_response" marks "not seeded yet".
    if not state.get("initial_response"):
        state["initial_response"] = output
        state["assay"] = output
    return output

def initialize_state(research_topic: str) -> dict:
    """
    Build the starting research state for a topic.

    The topic is stored both as the research topic and as the first search
    query; the initial response and the assay start out empty.
    """
    state = {"research_topic": research_topic}
    state.update(initial_response="", assay="")
    state["search_query"] = research_topic
    return state

def extract_json_from_llm_output(text: str):
    """
    Attempt to extract a JSON object from the provided text.

    Candidates are tried in this order and the first one that parses wins:
      1. JSON enclosed in triple backticks with the tag "json" (e.g., ```json { ... } ```).
      2. JSON enclosed in triple backticks without the tag (e.g., ``` { ... } ```).
      3. The whole text, when it starts with "{" and ends with "}".
      4. The span from the first "{" to the last "}" (JSON embedded in prose).

    Returns:
        Parsed JSON object.

    Raises:
        ValueError if no valid JSON structure can be found or parsed.
    """
    # Whitespace (including newlines) is allowed between the fence and the
    # braces in both forms; tagged fences keep priority over untagged ones.
    patterns = [
        r"```json\s*(\{.*?\})\s*```",  # with "json" tag (case-insensitive)
        r"```\s*(\{.*?\})\s*```"       # without the tag
    ]

    candidates = []
    for pattern in patterns:
        candidates.extend(re.findall(pattern, text, flags=re.DOTALL | re.IGNORECASE))

    stripped = text.strip()
    if stripped.startswith("{") and stripped.endswith("}"):
        candidates.append(stripped)

    # Last resort: models often wrap the object in explanatory prose.
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(text[first_brace:last_brace + 1])

    if not candidates:
        print("<ERROR>:\n%s\n</ERROR>" % text)
        raise ValueError("No JSON structure found in the provided text.")

    # Try parsing each candidate.
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    print("<ERROR>:\n%s\n</ERROR>" % text)
    raise ValueError("Found JSON-like structure, but could not parse it.")


def main():
    """
    Run the interactive research pipeline.

    Steps: read a research question, ask the local LLM for an initial assay,
    then for config.max_research_loops iterations find a gap, search the web
    literature for it and ask the LLM to combine the findings with the assay.
    Finally the LLM produces a markdown report that is displayed.
    """
    config = Configuration(
        ollama_base_url="http://localhost:11434",  # Your Ollama URL
        local_llm="llama3.2",                      # Default LLM is "llama3.2"
        fetch_full_page=True,                      # Fetch full page content if needed
        max_research_loops=3,                      # Number of research iterations
        max_fetch_pages=5,                         # Number of pages to fetch per search
        max_token_per_search=4000                  # Token limit per search processing
    )

    # Step 1: Get the research question from the user
    research_topic = input("Enter your research question: ")
    print("#### Research Question ####\n")
    print(research_topic)
    print("\n")
    state = initialize_state(research_topic)

    # Step 2: Generate an initial explanation using the local LLM. This answer will be the first instance of the assay
    print("  >> Create initial assay... \n")
    prompt_initial = RESPONSE_PROMPT.format(research_topic=state["research_topic"])
    # The running assay is tracked in a local variable: query_local_llm may
    # write its latest reply into the state, which must not replace the text
    # accumulated so far.
    assay = query_local_llm(state, config, prompt_initial)
    print(assay)
    print("\n")

    print("#### Thinking Process ####\n")
    for i in range(config.max_research_loops):
        print(f">> Iteration {i+1}\n")
        print("  >> Find gap... ")
        # Step 3: Evaluate the current assay and generate a follow-up question
        prompt_gap = FIND_GAP_PROMPT.format(research_topic=state["research_topic"], assay=assay)
        followup_question_llm = query_local_llm(state, config, prompt_gap)
        followup_question_json = extract_json_from_llm_output(followup_question_llm)
        # Pull the values out first: reusing double quotes inside an f-string
        # replacement field is a syntax error before Python 3.12.
        followup_query = followup_question_json["query"]
        web_query = followup_question_json["web-query"]
        print(f"     > Follow-up Question: {followup_query}")
        print(f"     > Web-search query: {web_query}")

        # Step 4: Gather literature sources
        print("  >> Query web-literature... ")
        web_search = search_scientific_literature(
            web_query,
            num_results=config.max_fetch_pages,
            fetch_full_page=config.fetch_full_page,
        )

        # Step 5: Combine the literature with the assay
        print("  >> Combine literature to assay... ")
        prompt_follow = COMBINE_PROMPT.format(research_topic=followup_query, assay=assay, web_search=web_search)
        follow_explanation = query_local_llm(state, config, prompt_follow)
        print(follow_explanation)

        # Step 6: Add the new text to the assay and publish it in the state
        assay = assay + follow_explanation
        state["assay"] = assay

    # Step 7: Ask the LLM to finalize the assay by integrating all gathered information and adding references
    prompt_finalize = FINALIZE_RESPONSE_PROMPT.format(research_topic=state["research_topic"], notes=assay)
    finalize_text_llm = query_local_llm(state, config, prompt_finalize)

    print("#### Final Assay ####\n")
    display(Markdown(finalize_text_llm))


if __name__ == "__main__":
    main()

# What is the state of the art of open source LLMs?

#### Research Question ####

use of LLM in bioinformatics


#### Initial Explanation ####

The integration of Large Language Models (LLMs) in bioinformatics has revolutionized the field by enabling researchers to analyze vast amounts of biological data, identify patterns, and make predictions with unprecedented accuracy. Bioinformatics, which is the application of computational tools and statistical techniques to analyze biological data, relies heavily on LLMs for various tasks such as gene prediction, protein structure prediction, functional annotation, and disease diagnosis.

**Applications of LLMs in bioinformatics**

1. **Gene Prediction**: LLMs can be trained on large datasets of annotated genes and used to predict the presence or absence of genes in a given sequence. This is particularly useful for annotating genomes of organisms with incomplete or low-quality genomic data.
2. **Protein Structure Prediction**: LLMs can be used to predict protein structures from amino acid sequenc

DuckDuckGoSearchException: https://lite.duckduckgo.com/lite/ 202 Ratelimit

In [None]:
from ollama import chat, ChatResponse
import json
import re
import time
import urllib.request
import datetime
from IPython.display import display, Markdown
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

# --- Prompt Baselines ---
# Opening sentence shared by the multi-expert prompts.
_EXPERT_GROUP_INTRO = "You are a group of 3 experts on the topic: {research_topic}. "

# Placeholder: {research_topic}.
RESPONSE_PROMPT = (
    "You are an expert on the topic: {research_topic}. "
    "Provide an extensive, detailed, and comprehensive answer to the research question. "
    "In your answer, highlight any areas or gaps that might require further exploration."
)

# Placeholders: {research_topic}, {assay}.
FIND_GAP_PROMPT = (
    _EXPERT_GROUP_INTRO
    + "Think step by step on the following assay: "
    "<ASSAY>\n"
    "{assay}\n"
    "</ASSAY> "
    "Provide one new topic to explore to fill a knowledge gap in the assay. Based on the gaps identified "
    "in your answer, generate a JSON object with the following keys:\n"
    '   - "query": "The search query string."\n'
    '   - "web-query": "The web search query string."\n'
    '   - "aspect": "The aspect of the topic being addressed by this query."\n'
    '   - "rationale": "Why this query will help fill the gap."\n'
    "Provide only the JSON structure."
)

# Placeholders: {research_topic}, {assay}, {web_search}.
COMBINE_PROMPT = (
    _EXPERT_GROUP_INTRO
    + "You have to combine together the information from the written assay in <ASSAY></ASSAY> tags, with the information "
    "gathered from the results of a web search on peer-reviewed literature within the <WEB></WEB> tags. "
    "When combining them together you are allowed to use only the sources identified through the web search. Cite them "
    "in the text where appropriate and report them at the bottom. "
    "<ASSAY>\n"
    "{assay}\n"
    "</ASSAY> "
    "<WEB>\n"
    "{web_search}\n"
    "</WEB>"
)

# Placeholders: {research_topic}, {notes}.
FINALIZE_RESPONSE_PROMPT = (
    "You are a team of experts on the topic: {research_topic}. "
    "Your goal is to analyze the text provided in the <TEXT></TEXT> tags and create an extensive, detailed, "
    "and comprehensive report using the information provided. "
    "Aim to 500 words per section. "
    "Your thesis is formatted in markdown and includes:\n"
    "1. Title\n2. Introduction\n3. Discussion\n4. Gaps / Further research\n"
    "<TEXT>\n"
    "{notes}\n"
    "</TEXT>"
)

# --- Helper: Exponential Backoff for DuckDuckGo Search ---
def duckduckgo_search(query: str, max_results: int = 5, fetch_full_page: bool = False, retries: int = 5,
                      backoff: int = 3, timeout: float = 10.0) -> dict:
    """
    Perform a DuckDuckGo search for the given query with exponential backoff in case of rate limiting.

    Only the search request itself is retried. Page downloads are best-effort
    and fall back to the search snippet on any failure.

    Args:
        query (str): The search query.
        max_results (int): Number of results to return.
        fetch_full_page (bool): If True, attempt to retrieve full page content.
        retries (int): Maximum number of search attempts.
        backoff (int): Base of the backoff delay; attempt k waits backoff**k seconds.
        timeout (float): Seconds to wait for each page download.

    Returns:
        dict: A dictionary with a "results" key containing a list of result dicts
        with the keys "title", "url", "content" and "raw_content".

    Raises:
        Exception: If every attempt was rate limited; the last rate-limit error
            is attached as the cause. Any other search error is re-raised as is.
    """
    last_error = None
    for attempt in range(retries):
        try:
            with DDGS() as ddgs:
                search_results = list(ddgs.text(query, max_results=max_results))
            break
        except Exception as e:
            # Rate limiting is recognised by its message text.
            if "Ratelimit" not in str(e):
                raise
            last_error = e
            # Do not sleep after the final attempt: nothing is left to retry.
            if attempt + 1 < retries:
                wait_time = backoff ** (attempt + 1)
                print(f"Rate limit encountered. Retrying in {wait_time} seconds... (Attempt {attempt+1} of {retries})")
                time.sleep(wait_time)
    else:
        raise Exception("Exceeded maximum retries due to rate limiting.") from last_error

    results = []
    for hit in search_results:
        url = hit.get("href")
        title = hit.get("title")
        content = hit.get("body")
        # Skip incomplete hits: all three fields are needed downstream.
        if not all([url, title, content]):
            continue

        raw_content = content
        if fetch_full_page:
            try:
                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
                # The timeout keeps one unresponsive host from stalling the
                # whole run; the context manager closes the connection.
                with urllib.request.urlopen(req, timeout=timeout) as response:
                    html = response.read().decode('utf-8', errors='replace')
                raw_content = BeautifulSoup(html, 'html.parser').get_text()
            except Exception:
                # Deliberate best-effort: fall back to snippet content.
                raw_content = content

        results.append({
            "title": title,
            "url": url,
            "content": content,
            "raw_content": raw_content
        })
    return {"results": results}

def enhance_query_for_scientific_literature(query: str) -> str:
    """
    Bias a search query towards scholarly sources.

    A fixed suffix of site filters and academic keywords is appended to the
    query, separated by a single space.

    Args:
        query (str): The original query.

    Returns:
        str: The query followed by the scholarly suffix.
    """
    scholarly_suffix = "site:pubmed.ncbi.nlm.nih.gov OR site:sciencedirect.com OR site:doi.org peer-reviewed journal"
    return "{} {}".format(query, scholarly_suffix)

def search_scientific_literature(query: str, num_results: int = 5, fetch_full_page: bool = False) -> dict:
    """
    Search DuckDuckGo with a query rewritten for scientific literature.

    The query is first passed through enhance_query_for_scientific_literature,
    the rewritten query is printed, and the search is delegated to
    duckduckgo_search.

    Args:
        query (str): The original query.
        num_results (int): Number of results to return.
        fetch_full_page (bool): If True, fetch full page content.

    Returns:
        dict: Search results as returned by duckduckgo_search.
    """
    literature_query = enhance_query_for_scientific_literature(query)
    print("Enhanced Query:", literature_query)
    search_options = {"max_results": num_results, "fetch_full_page": fetch_full_page}
    return duckduckgo_search(literature_query, **search_options)

# --- Helper: Remove <THINK> Tags ---
def remove_think_tags(text: str) -> str:
    """
    Strip every <THINK>...</THINK> section from the text.

    Matching ignores case, tolerates whitespace inside the tag brackets and
    spans newlines. The remaining text is returned with leading and trailing
    whitespace removed.
    """
    think_section = re.compile(r"<\s*THINK\s*>.*?<\s*/\s*THINK\s*>", re.DOTALL | re.IGNORECASE)
    without_think = think_section.sub("", text)
    return without_think.strip()

# --- Configuration & State Management ---
class Configuration:
    """Plain container for the pipeline settings passed to its constructor."""

    def __init__(self, ollama_base_url: str, local_llm: str, fetch_full_page: bool,
                 max_research_loops: int, max_fetch_pages: int, max_token_per_search: int):
        # Model settings.
        self.ollama_base_url, self.local_llm = ollama_base_url, local_llm
        # Search / loop settings.
        self.fetch_full_page, self.max_research_loops = fetch_full_page, max_research_loops
        self.max_fetch_pages, self.max_token_per_search = max_fetch_pages, max_token_per_search

def initialize_state(research_topic: str) -> dict:
    """
    Build the starting research state for a topic.

    The topic is stored both as the research topic and as the first search
    query; the initial response and the assay start out empty.
    """
    state = {"research_topic": research_topic}
    state.update(initial_response="", assay="")
    state["search_query"] = research_topic
    return state

def query_local_llm(state: dict, config: Configuration, prompt="") -> str:
    """
    Send a single-turn prompt to the local LLM and return its cleaned reply.

    The reply is stripped of surrounding whitespace and of any
    <THINK>...</THINK> sections.

    Only the first reply recorded for a given state seeds
    state["initial_response"] and state["assay"]. Later calls (gap finding,
    combining, finalizing) leave the state untouched so that they cannot
    overwrite the assay the caller is accumulating.

    Args:
        state (dict): Research state; updated in place on the first reply only.
        config (Configuration): Supplies the model name via config.local_llm.
        prompt (str): The user message sent to the model.

    Returns:
        str: The cleaned model reply.
    """
    message = {"role": "user", "content": prompt}
    response: ChatResponse = chat(model=config.local_llm, messages=[message])
    output = remove_think_tags(response.message.content.strip())
    # Seed the state once; an empty "initial_response" marks "not seeded yet".
    if not state.get("initial_response"):
        state["initial_response"] = output
        state["assay"] = output
    return output

def extract_json_from_llm_output(text: str):
    """
    Attempt to extract a JSON object from the provided text.

    Candidates are tried in this order and the first one that parses wins:
      1. JSON enclosed in triple backticks with tag "json".
      2. JSON enclosed in triple backticks without tag.
      3. The whole text, when it starts with "{" and ends with "}".
      4. The span from the first "{" to the last "}" (JSON embedded in prose).

    Returns:
        Parsed JSON object.

    Raises:
        ValueError if no valid JSON structure is found or none can be parsed.
    """
    # Whitespace (including newlines) is allowed between the fence and the
    # braces in both forms; tagged fences keep priority over untagged ones.
    patterns = [
        r"```json\s*(\{.*?\})\s*```",  # with "json" tag
        r"```\s*(\{.*?\})\s*```"       # without tag
    ]
    candidates = []
    for pattern in patterns:
        candidates.extend(re.findall(pattern, text, flags=re.DOTALL | re.IGNORECASE))

    stripped = text.strip()
    if stripped.startswith("{") and stripped.endswith("}"):
        candidates.append(stripped)

    # Last resort: models often wrap the object in explanatory prose.
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(text[first_brace:last_brace + 1])

    if not candidates:
        print("<ERROR>:\n%s\n</ERROR>" % text)
        raise ValueError("No JSON structure found in the provided text.")

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    print("<ERROR>:\n%s\n</ERROR>" % text)
    raise ValueError("Found JSON-like structure, but could not parse it.")

# --- Main Research Pipeline ---
def main():
    """
    Run the interactive research pipeline.

    Steps: read a research question, ask the local LLM for an initial assay,
    then for config.max_research_loops iterations find a gap, search the web
    literature for it and ask the LLM to combine the findings with the assay.
    Finally the LLM produces a markdown report that is displayed.
    """
    config = Configuration(
        ollama_base_url="http://localhost:11434",  # Your Ollama URL
        local_llm="llama3.2",                      # Default LLM is "llama3.2"
        fetch_full_page=True,                      # Fetch full page content if needed
        max_research_loops=3,                      # Number of research iterations
        max_fetch_pages=5,                         # Number of pages to fetch per search
        max_token_per_search=4000                  # Token limit per search processing
    )

    # Step 1: Get the research question from the user.
    research_topic = input("Enter your research question: ")
    print("#### Research Question ####\n")
    print(research_topic)
    print("\n")
    state = initialize_state(research_topic)

    # Step 2: Generate an initial explanation (assay) using the local LLM.
    print("  >> Create initial assay...\n")
    prompt_initial = RESPONSE_PROMPT.format(research_topic=state["research_topic"])
    # The running assay is tracked in a local variable: query_local_llm may
    # write its latest reply into the state, which must not replace the text
    # accumulated so far.
    assay = query_local_llm(state, config, prompt_initial)
    print(assay)
    print("\n")

    print("#### Thinking Process ####\n")
    for i in range(config.max_research_loops):
        print(f">> Iteration {i+1}\n")
        print("  >> Find gap...")
        # Step 3: Evaluate the current assay and generate a follow-up question.
        prompt_gap = FIND_GAP_PROMPT.format(research_topic=state["research_topic"], assay=assay)
        followup_question_llm = query_local_llm(state, config, prompt_gap)
        followup_question_json = extract_json_from_llm_output(followup_question_llm)
        followup_query = followup_question_json["query"]
        web_query = followup_question_json["web-query"]
        print(f"     > Follow-up Question: {followup_query}")
        print(f"     > Web-search query: {web_query}")

        # Step 4: Gather literature sources, honouring the configured limits.
        print("  >> Query web-literature...")
        web_search = search_scientific_literature(
            web_query,
            num_results=config.max_fetch_pages,
            fetch_full_page=config.fetch_full_page,
        )

        # Step 5: Combine literature with the assay.
        print("  >> Combine literature to assay...")
        prompt_follow = COMBINE_PROMPT.format(research_topic=followup_query, assay=assay, web_search=web_search)
        follow_explanation = query_local_llm(state, config, prompt_follow)
        print(follow_explanation)

        # Update the assay by appending the new text and publish it in the state.
        assay = assay + follow_explanation
        state["assay"] = assay

    # Step 6: Finalize the assay.
    prompt_finalize = FINALIZE_RESPONSE_PROMPT.format(research_topic=state["research_topic"], notes=assay)
    finalize_text_llm = query_local_llm(state, config, prompt_finalize)

    print("#### Final Assay ####\n")
    display(Markdown(finalize_text_llm))


if __name__ == "__main__":
    main()


#### Research Question ####

How to combine quantum physics and bioinformatics


  >> Create initial assay...

Combining Quantum Physics and Bioinformatics: A Comprehensive Overview

Quantum physics has revolutionized our understanding of the fundamental laws governing the behavior of matter and energy at the smallest scales. Similarly, bioinformatics has transformed the way we analyze and interpret biological data. By integrating these two seemingly disparate fields, researchers can uncover new insights into the intricate workings of living systems. This comprehensive review aims to provide an in-depth exploration of the intersections between quantum physics and bioinformatics.

**Quantum Physics in Bioinformatics**

Bioinformatics is the application of computational tools and statistical methods to analyze and interpret biological data. Quantum physics, on the other hand, provides a framework for understanding the behavior of matter at the atomic and subatomic level. By applying prin

Exception: Exceeded maximum retries due to rate limiting.