In [2]:
from ollama import chat, ChatResponse
import json
import re
from IPython.display import display, Markdown

# --- Prompt Baselines ---
# Template for the first answer (also reused for each follow-up topic).
# Placeholder: {research_topic}.
INITIAL_RESPONSE_PROMPT = (
    "You are an expert on the topic: {research_topic}. Provide an extensive, detailed, and comprehensive answer "
    "to the research question. In your answer, highlight any areas or gaps that might require further exploration."
)

# Template asking the model for ONE follow-up query as a JSON object.
# Placeholders: {research_topic}, {assay}.
# Adjacent literals are concatenated verbatim, so every fragment must carry its
# own trailing space/newline; otherwise sentences and tags get fused together.
# NOTE(review): "assay" is presumably meant as "essay"; wording kept because the
# {assay} placeholder name is part of the interface used by the caller.
FIND_GAP_PROMPT = (
    "You are a group of 3 experts on the topic: {research_topic}. "
    "Think step by step on the following assay:\n"
    "<ASSAY>\n"
    "{assay}\n"
    "</ASSAY>\n"
    "Provide one new topic to explore to fill a knowledge gap in the assay. "
    "Based on the gaps identified in your answer, generate a JSON object with the following keys:\n"
    '   - "query": "The search query string."\n'
    '   - "aspect": "The aspect of the topic being addressed by this query."\n'
    '   - "rationale": "Why this query will help fill the gap."\n'
    "Provide only the JSON structure."
)

# Template for the final markdown report built from the accumulated notes.
# Placeholders: {research_topic}, {notes}.
FINALIZE_RESPONSE_PROMPT = (
    "You are a team of expert on the topic: {research_topic}. Your goal is to analyze the text provided in the <TEXT></TEXT> tags "
    "and create an extensive, detailed, and comprehensive report using the information provided. Aim to 500 words per section. "
    "Your thesis is formatted in markdown and have:\n"
    "1. Title\n"
    "2. Introduction\n"
    "3. Discussion\n"
    "4. Gaps / Further research\n"
    "<TEXT>\n"
    "{notes}\n"
    "</TEXT>"
)

# --- Helper: Remove <THINK> Tags ---
def remove_think_tags(text: str) -> str:
    """
    Strip every <THINK>...</THINK> section from the text.

    Tag names are matched case-insensitively, whitespace inside the angle
    brackets is tolerated, and the enclosed content may span several lines.
    The result has leading and trailing whitespace removed.
    """
    think_section = re.compile(
        r"<\s*THINK\s*>.*?<\s*/\s*THINK\s*>",
        re.DOTALL | re.IGNORECASE,
    )
    without_reasoning = think_section.sub("", text)
    return without_reasoning.strip()

# --- Configuration & State Management ---
class Configuration:
    """
    Plain container for the settings of one research run.

    Each constructor argument is stored unchanged on the instance under the
    same name; no validation is performed.
    """

    def __init__(
        self,
        ollama_base_url: str,
        local_llm: str,
        fetch_full_page: bool,
        max_research_loops: int,
        max_fetch_pages: int,
        max_token_per_search: int,
    ):
        # Which model to talk to and where it is served.
        self.local_llm = local_llm
        self.ollama_base_url = ollama_base_url

        # Upper bound on the number of follow-up iterations.
        self.max_research_loops = max_research_loops

        # Search/fetch related limits (presumably for a web-search step; verify against callers).
        self.fetch_full_page = fetch_full_page
        self.max_fetch_pages = max_fetch_pages
        self.max_token_per_search = max_token_per_search

def query_local_llm(state: dict, config: Configuration, prompt="") -> str:
    """
    Send one single-turn prompt to the local LLM and return the cleaned reply.

    The reply has any <THINK>...</THINK> section removed and is stripped of
    surrounding whitespace.

    Only the first non-empty reply is recorded in the state: it becomes
    state["initial_response"] and seeds state["assay"]. Later calls (gap
    finding, follow-up answers, finalization) leave both keys untouched, so
    notes accumulated by the caller are not overwritten by unrelated replies.

    Args:
        state: Mutable research state; see the seeding rule above.
        config: Run settings; only config.local_llm (the model name) is used.
        prompt: The full user message to send.

    Returns:
        The model's reply without think tags.
    """
    # NOTE(review): config.ollama_base_url is not passed to chat() here, so the
    # library's default host is used - confirm this is intended.
    message = {"role": "user", "content": prompt}
    response: ChatResponse = chat(model=config.local_llm, messages=[message])

    # Guard against a missing content field; remove_think_tags strips whitespace itself.
    reply = remove_think_tags(response.message.content or "")

    if not state.get("initial_response"):
        state["initial_response"] = reply
        state["assay"] = reply
    return reply

def initialize_state(research_topic: str) -> dict:
    """
    Build the starting state dictionary for a research run.

    The topic is stored under "research_topic" and also used as the first
    "search_query"; "initial_response" and "assay" start out empty.
    """
    state: dict = {"research_topic": research_topic}
    state["initial_response"] = ""  # Filled with the original extensive answer.
    state["assay"] = ""
    state["search_query"] = research_topic
    return state

def extract_json_from_llm_output(text):
    """
    Extract the first parseable JSON object from an LLM reply.

    Lookup order:
      1. Objects inside triple-backtick fences, with or without a "json"
         language tag (tag matched case-insensitively).
      2. If no fenced object parses, a bare JSON object anywhere in the text
         (the prompt asks for "only the JSON structure", so models often omit
         the fence).

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no JSON-like structure at all, or if
            JSON-like structures were found but none could be parsed.
    """
    # 1) Fenced blocks; DOTALL lets the object span several lines.
    fenced_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
    fenced = re.findall(fenced_pattern, text, re.DOTALL | re.IGNORECASE)
    for candidate in fenced:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Try the next fenced block.
            continue

    # 2) Bare object: attempt a decode at every opening brace. raw_decode
    #    ignores trailing text, so prose after the object does not matter.
    decoder = json.JSONDecoder()
    brace = text.find("{")
    saw_brace = brace != -1
    while brace != -1:
        try:
            parsed, _ = decoder.raw_decode(text[brace:])
            return parsed
        except json.JSONDecodeError:
            brace = text.find("{", brace + 1)

    # Nothing parsed: dump the offending text and report which case applied.
    print ("<ERROR>:\n%s\n</ERROR>"%text)
    if not fenced and not saw_brace:
        raise ValueError("No JSON structure found in the provided text.")
    raise ValueError("Found JSON-like structure, but could not parse it.")

def main():
    """
    Run the interactive research workflow end to end.

    Steps:
      1. Read the research question from standard input.
      2. Ask the local LLM for an initial extensive answer (the first notes).
      3. For up to config.max_research_loops iterations: ask the LLM for a
         follow-up query (as JSON) that fills a gap in the notes, answer that
         query, and append the answer to the notes. An iteration whose
         follow-up query cannot be extracted is skipped.
      4. Ask the LLM to turn the accumulated notes into a final markdown
         report and display it.
    """
    config = Configuration(
        ollama_base_url="http://localhost:11434",  # Your Ollama URL
        local_llm="deepseek-r1:8b",                # Ollama model tag to query
        fetch_full_page=True,                      # Fetch full page content if needed
        max_research_loops=3,                      # Number of research iterations
        max_fetch_pages=5,                         # Number of pages to fetch per search
        max_token_per_search=4000                  # Token limit per search processing
    )

    # Step 1: Get the research question from the user
    research_topic = input("Enter your research question: ")
    print("#### Research Question ####\n")
    print(research_topic)
    print("\n")
    state = initialize_state(research_topic)

    # Step 2: Generate an initial explanation using the local LLM. This answer will be the first instance of the assay
    prompt_initial = INITIAL_RESPONSE_PROMPT.format(research_topic=state["research_topic"])
    initial_explanation = query_local_llm(state, config, prompt_initial)
    print("#### Initial Explanation ####\n")
    print(initial_explanation)
    print("\n")

    # The running notes live in a local variable and are written back to the
    # state explicitly, so they never depend on what the LLM helper stores in
    # the state as a side effect.
    notes = initial_explanation
    state["assay"] = notes

    print("#### Thinking Process ####\n")
    for i in range(config.max_research_loops):
        print("  >> Find gap... \n")
        # Step 3: Evaluate the current assay and generate a follow-up question
        prompt_gap = FIND_GAP_PROMPT.format(research_topic=state["research_topic"], assay=notes)
        followup_question_llm = query_local_llm(state, config, prompt_gap)
        try:
            followup_question_json = extract_json_from_llm_output(followup_question_llm)
            followup_query = followup_question_json["query"]
        except (ValueError, KeyError) as err:
            # One malformed reply should not discard everything gathered so far.
            print(f"  >> Iteration {i+1} - Could not extract a follow-up question ({err}); skipping.\n")
            state["assay"] = notes
            continue
        # The query is bound to a name first: nesting double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        print(f"  >> Iteration {i+1} - Follow-up Question: {followup_query}\n")

        print("  >> Generate new response based on follow-up question... \n")
        # Step 4: Reiterate on a new topic (send the follow-up prompt, not the initial one)
        prompt_follow = INITIAL_RESPONSE_PROMPT.format(research_topic=followup_query)
        follow_explanation = query_local_llm(state, config, prompt_follow)
        print("  >> Add new data to notes... \n")
        # Step 5: Add the data to the assay
        notes = f"{notes}\n\n{follow_explanation}"
        state["assay"] = notes

    # Step 6: Ask the LLM to finalize the assay by integrating all gathered information and adding references
    prompt_finalize = FINALIZE_RESPONSE_PROMPT.format(research_topic=state["research_topic"], notes=notes)
    finalize_text_llm = query_local_llm(state, config, prompt_finalize)
    state["assay"] = notes

    print("#### Final Assay ####\n")
    display(Markdown(finalize_text_llm))

    # # Step 7: Save the final assay locally
    # try:
    #     with open("final_assay.txt", "w", encoding="utf-8") as f:
    #         f.write(finalize_text_llm)
    #     print("\nFinal assay saved to 'final_assay.txt'.")
    # except OSError as e:
    #     print("Error saving the final assay:", e)

# Start the interactive workflow only when this module is the entry point
# (run as a script or executed as a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

# What is the state of the art of open source LLMs?

#### Research Question ####

how to use LLM in bioinformatics


#### Initial Explanation ####

**The Application and Challenges of Large Language Models (LLMs) in Bioinformatics**

**Introduction:**
Large Language Models (LLMs) represent a cutting-edge technology with potential applications across diverse fields, including bioinformatics. This field leverages computational methods to analyze biological data, making LLMs an intriguing tool for enhancing research efficiency.

**Potential Applications of LLMs in Bioinformatics:**

1. **Literature Review and Summarization:**
   - LLMs can efficiently summarize vast amounts of scientific literature, aiding researchers by extracting key information on specific genes or proteins.
   
2. **Protein Interaction Prediction:**
   - By analyzing text data, LLMs may infer protein interactions, although they require training on biological context for accuracy.

3. **Drug Discovery Assistance:**
   - LLMs can potentially aid in identifying compounds i… *[initial explanation truncated in the captured output]*

**Answer:**

Large Language Models (LLMs) are increasingly being utilized in bioinformatics to enhance research efficiency and accuracy through various applications and tools. Here's a structured overview of their current and potential impact:

### Current Applications:
1. **Text Analysis and Literature Processing:**
   - LLMs can summarize research papers, saving researchers time and providing quick insights.
   - They assist in generating hypotheses by linking biological concepts, potentially leading to new research ideas.

2. **Data Interpretation and Analysis:**
   - Complex datasets such as gene expression profiles are analyzed, aiding in identifying patterns and important genes.
   - Integration of multi-omics data (genomics, proteomics, metabolomics) may uncover novel connections not easily visible otherwise.

3. **Automation of Lab Processes:**
   - LLMs support efficient workflow management by aiding in lab experiment planning and outcome prediction, reducing manual task burden.

4. **Personalized Medicine:**
   - Use of patient-specific data for tailored diagnosis and treatment suggestions, offering more individualized insights compared to traditional methods.

5. **Drug Discovery:**
   - Prediction of potential drug candidates based on biological interactions, potentially accelerating the process of finding drug leads.

6. **Pathway Curation and Experiment Interpretation:**
   - Assistance in generating and organizing biological knowledge, aiding hypothesis generation and experimental planning.

### Challenges:
- **Data Quality and Curation:** Crucial for model reliability; flawed data can lead to misleading outputs.
- **Interpretability:** Understanding model decisions is necessary for trust and effective integration into research workflows.
- **Bias in Training Data:** Risk of biased outcomes due to incomplete or skewed training datasets.
- **Ethical Considerations:** Addressing transparency, accountability, and equitable access to ensure fair use of AI tools.

### Future Directions:
1. **Integration with Other AI Techniques:**
   - Combining LLMs with reinforcement learning or attention mechanisms may enhance their capabilities, creating more sophisticated tools.

2. **Improving Interpretability:**
   - Developing techniques to clarify model decision processes will foster trust and confidence in their use.

3. **Literature Review Assistance:**
   - Facilitating systematic reviews and meta-analyses through data extraction and summarization can aid literature-based research.

4. **Interdisciplinary Collaboration:**
   - Bridging gaps between biologists and computer scientists via unified interfaces will support more effective tool development and utilization.

### Conclusion:
While LLMs hold significant potential in bioinformatics, overcoming challenges related to data quality, interpretability, bias, and ethics is essential for their effective use. Addressing these issues will drive advancements and applications in the field, ultimately benefiting research and innovation.