In [68]:
import os
import uuid
import json
import base64
from google.cloud import aiplatform
from google.oauth2 import service_account

import vertexai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part

from pydantic import BaseModel
from typing import Optional, List

from dotenv import load_dotenv

In [85]:
load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")

SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

credentials_base64 = os.getenv("SERVICE_ACCOUNT_CREDENTIALS_BASE64")
decoded_credentials = base64.b64decode(credentials_base64).decode('utf-8')
credentials_info = json.loads(decoded_credentials)
credentials = service_account.Credentials.from_service_account_info(credentials_info)

aiplatform.init(credentials=credentials, project='rbio-p-datasharing')
vertexai.init(project="rbio-p-datasharing", location="us-west1")
model = GenerativeModel("gemini-1.5-pro")


In [148]:
# Step 1: Craft a detailed outline
def generate_outline(topic: str, model: GenerativeModel) -> str:
    prompt = f"""
    Generate a detailed knowledgebase article outline in a professional and informative tone on the topic: '{topic}'. 
    The outline should follow this structure:

    Overview:
    - Introduce the topic.
    - Provide a concise explanation of its significance and impact on health or society.
    - Include key statistics or facts for context.

    Key Facts:
    - Highlight notable statistics, prevalence, demographics, or other relevant data as a table or bullet points.

    Symptoms:
    - List common signs and symptoms individuals may experience.

    Types:
    - Describe different types or classifications with brief explanations.

    Causes:
    - Explain the primary causes and underlying mechanisms.

    Risk Factors:
    - Identify lifestyle, genetic, or environmental factors that increase likelihood of occurrence.

    Diagnosis:
    - Outline the diagnostic process, including medical history, tests, or tools used.

    Prevention:
    - Offer practical advice on reducing risk factors or preventing onset.

    Specialist to Visit:
    - Specify the types of medical specialists to consult for proper evaluation and treatment.

    Treatment:
    - Discuss conventional and advanced medical treatments, devices, or therapies available.

    Home-Care:
    - Provide tips on managing the condition at home to improve quality of life.

    Living With:
    - Share advice on long-term management and adapting to life with the condition.

    Complications:
    - Highlight potential health issues or challenges that may arise if untreated.

    Alternative Therapies:
    - Briefly discuss non-conventional treatments that may complement mainstream care.

    FAQs:
    - Add frequently asked questions to address common queries and provide clarity on misconceptions.

    References:
    - Include references or credible sources for generating the content.

    Style Requirements:
    - Maintain a professional yet approachable tone.
    - Include hypertext links to relevant sources where appropriate.
    - Use concise paragraphs and bullet points for easy readability.
    - Where possible, include statistics, research findings, or notable insights to make the article credible and informative.

    Provide a structured outline adhering to this format. Only respond with the knowledgebase article output. Do not add any additional commentary outside of the article.
    """
    response = model.generate_content(prompt)
    return response.text


In [149]:
# Step 2: Refine the outline with an UpToDate article
def refine_outline_with_uptodate(
    topic: str, 
    outline: str, 
    uptodate_article: str, 
    model: GenerativeModel
) -> str:
    """
    Refine the given outline by incorporating relevant information and citations
    from an UpToDate article in markdown format.
    """
    prompt = f"""
    Refine the outline for the knowledgebase article on the topic: '{topic}'.
    

    TASK:
    - Review the provided existing outline and the new information from the UpToDate article.
    - Enhance the outline by seamlessly integrating additional insights, data, and citations from the UpToDate article.
    - Retain all existing details, links, and formatting from the outline, ensuring no information is removed or altered unless it is being updated with more accurate information.
    - Do not remove existing hyperlinks or substitute them with vague placeholders (e.g., "link provided in the original outline").
    - Maintain the markdown structure and professional tone throughout the refined outline.


    
    EXISTING OUTLINE:
    {outline}

    
    UPTODATE ARTICLE:
    {uptodate_article}


    REQUIREMENTS:
    - Add new information where it is relevant, ensuring it complements and enhances existing content.
    - Use the same citation style as the existing outline for any new references (e.g., include inline links or properly formatted references in the References section).
    - Ensure the overall outline remains cohesive, concise, and easy to read.
    - Do not add redundant information or alter the focus of any section.
    - Return only the revised outline in markdown format with no additional commentary or notes.

    OUTPUT FORMAT:
    - A refined outline in markdown format.
    - Ensure all existing and new references are properly integrated without introducing vague placeholders.
    """
    response = model.generate_content(prompt)
    return response.text


In [None]:
from pydantic import BaseModel, Field, ValidationError
from typing import List, Dict
import json

class SearchQuery(BaseModel):
    section: str = Field(..., description="The section of the outline the query corresponds to.")
    query: str = Field(..., description="The search query for this section.")


# Step 3: Generate search queries
def generate_search_query_response(outline: str, model: GenerativeModel) -> str:
    """
    Generate raw JSON response for search queries based on the provided outline.

    Args:
        outline (str): The detailed outline to generate queries for.
        model (GenerativeModel): The generative model instance for processing.

    Returns:
        str: The raw JSON response from the model.
    """
    prompt = f"""
    You are tasked with generating search queries to find corroborating evidence for key claims in a knowledgebase article.
    The goal is to identify relevant scientific papers to support and enhance the article, ensuring credibility and depth.

    TASK:
    - Review the provided outline of the knowledgebase article.
    - Identify areas or claims that would benefit from further evidence or scientific backing.
    - For each identified section, create a search query targeting relevant scientific papers or data.

    REQUIREMENTS:
    1. Return the search queries in strict JSON format.
    2. Each query must include:
        - 'section': The section of the outline the query corresponds to.
        - 'query': A specific search term designed to find relevant papers or abstracts.
    3. Use simple, standalone search terms or phrases. Avoid logical operators like `AND`, `OR`, or quotation marks.


    GUIDELINES:
    - Tailor queries to address gaps in evidence or provide additional insights for key claims in the article.
    - Ensure search terms are specific enough to yield meaningful results.
    - Avoid overly generic queries that may return irrelevant data.

    OUTPUT FORMAT:
    - Return a list of JSON objects, with each object containing the fields 'section' and 'query'.
    - Example:
        [
            {{"section": "Overview", "query": "Global impact of mosquito-borne diseases"}},
            {{"section": "Symptoms", "query": "Large local reactions to mosquito bites and immune response"}}
        ]

    ARTICLE OUTLINE:
    {outline}

    IMPORTANT:
    - Focus on generating precise and targeted queries to find corroborating evidence.
    - Do not include additional commentary or responses outside the JSON format.
    """
    response = model.generate_content(prompt)
    return response.text

    

def parse_search_queries(response_text: str) -> List[SearchQuery]:
    """
    Parse the raw JSON response from the model into a list of SearchQuery objects.

    Args:
        response_text (str): The raw JSON response from the model.

    Returns:
        List[SearchQuery]: A list of validated SearchQuery objects.

    Raises:
        ValueError: If the response cannot be parsed or validated.
    """
    try:
        # Strip markdown code block markers if present
        if response_text.startswith("```") and response_text.endswith("```"):
            response_text = response_text.strip("```json").strip("```").strip()

        # Parse the JSON response
        queries_json = json.loads(response_text)

        # Validate each query using Pydantic
        queries = [SearchQuery(**query) for query in queries_json]

        return queries
    except (json.JSONDecodeError, ValidationError, ValueError) as e:
        raise ValueError(f"Error parsing or validating search queries: {e}")




In [151]:
example_json = [
    {"section": "Overview", "query": "Impact of gut microbiota on mental health"},
    {"section": "Key Facts", "query": "Prevalence of gut microbiota imbalances"},
    {"section": "Causes", "query": "Factors influencing gut microbiota dysbiosis"}
]

In [152]:
import time
import requests
from typing import List, Dict
from rich import print as rprint
import json
from xml.etree import ElementTree as ET

class PubMedAPI:
    def __init__(self, api_key: str, sleep_time: float = 1.0):
        """
        Initialize the PubMedAPI class.

        Args:
            api_key (str): Your PubMed API key.
            sleep_time (float): Time to wait between API requests to avoid rate-limiting.
        """
        self.api_key = api_key
        self.sleep_time = sleep_time
        self.search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def query(self, queries: List[Dict[str, str]]) -> Dict[str, List[Dict[str, str]]]:
        """
        Query the PubMed API using a list of queries and fetch detailed information.

        Args:
            queries (List[Dict[str, str]]): List of queries with sections and query text.

        Returns:
            Dict[str, List[Dict[str, str]]]: Results for each section with detailed metadata.
        """
        results = {}

        for query in queries:
            section = query["section"]
            search_query = query["query"]
            search_params = {
                "db": "pubmed",
                "term": search_query,
                "retmax": 10,  # Limit to 10 results
                "api_key": self.api_key,
                "retmode": "json",
            }
            search_response = requests.get(self.search_url, params=search_params)
            if search_response.status_code == 200:
                search_data = search_response.json()
                pmids = search_data.get("esearchresult", {}).get("idlist", [])
                if pmids:
                    # Fetch detailed metadata for the PMIDs
                    fetch_params = {
                        "db": "pubmed",
                        "id": ",".join(pmids),
                        "retmode": "xml",
                        "api_key": self.api_key,
                    }
                    time.sleep(self.sleep_time)
                    fetch_response = requests.get(self.fetch_url, params=fetch_params)
                    if fetch_response.status_code == 200:
                        results[section] = self._parse_response(fetch_response.text, search_query)
                    else:
                        print(f"Error fetching details for '{search_query}': {fetch_response.status_code}")
                        results[section] = []
                else:
                    print(f"No results found for query: '{search_query}'")
                    results[section] = []
            else:
                print(f"Error querying PubMed for '{search_query}': {search_response.status_code}")
                results[section] = []
            time.sleep(self.sleep_time)

        return results

    def _parse_response(self, xml_response: str, search_query: str) -> List[Dict[str, str]]:
        """
        Parse the PubMed XML response to extract relevant details.

        Args:
            xml_response (str): XML response from PubMed efetch.
            search_query (str): The search query associated with the response.

        Returns:
            List[Dict[str, str]]: Parsed results with title, abstract, authors, DOI, URLs, and other metadata.
        """
        root = ET.fromstring(xml_response)
        articles = []

        for article in root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID")
            title = article.findtext(".//ArticleTitle")
            abstract = article.findtext(".//Abstract/AbstractText")

            # Extract authors
            authors = [
                f"{author.findtext('LastName')} {author.findtext('ForeName')}"
                for author in article.findall(".//Author")
                if author.findtext("LastName") and author.findtext("ForeName")
            ]

            # Extract journal info
            journal = article.findtext(".//Journal/Title")
            publication_date = article.findtext(".//PubDate/Year") or article.findtext(".//PubDate/MedlineDate")

            # Extract DOI
            doi = None
            for id_elem in article.findall(".//ArticleId"):
                if id_elem.attrib.get("IdType") == "doi":
                    doi = id_elem.text
                    break

            # Construct the URL from PMID
            pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

            # Add the paper details
            articles.append({
                "query": search_query,
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "authors": authors,
                "journal": journal,
                "publication_date": publication_date,
                "doi": doi,
                "pubmed_url": pubmed_url,
            })

        return articles

    def format_citation(self, paper: Dict) -> str:
        """
        Format the citation for a paper using available metadata from PubMed.

        Args:
            paper (Dict): The paper metadata returned by PubMed.

        Returns:
            str: Formatted citation with URL or DOI.
        """
        authors = ", ".join(paper.get("authors", [])[:3])
        if len(paper.get("authors", [])) > 3:
            authors += " et al."

        title = paper.get("title", "Unknown Title")
        year = paper.get("publication_date", "Unknown Year")
        journal = paper.get("journal", "Unknown Journal")
        doi = paper.get("doi", None)
        pubmed_url = paper.get("pubmed_url", None)

        citation = f"{authors}. \"{title}\" ({year}). Published in {journal}."

        if doi:
            citation += f" DOI: {doi}."
        if pubmed_url:
            citation += f" Available at: {pubmed_url}."

        return citation

    def display_results(self, results: Dict[str, List[Dict[str, str]]]):
        """
        Display the PubMed results in a readable format, including abstracts and citations.

        Args:
            results (Dict[str, List[Dict[str, str]]]): The detailed results from PubMed.
        """
        for section, papers in results.items():
            rprint(f"\n[blue bold]Section: {section}[/blue bold]")
            for i, paper in enumerate(papers):
                abstract = paper.get("abstract")
                if abstract:
                    rprint(f"  [yellow][{i+1}]Abstract:[/yellow] {abstract[:300]}...")
                else:
                    rprint(f"  [red][{i+1}]Abstract:[/red] No abstract available.")

                citation = self.format_citation(paper)
                rprint(f"  [green][{i+1}]Citation:[/green] {citation}")

    def format_results(self, results: Dict[str, List[Dict[str, str]]]) -> List[Dict[str, str]]:
        """
        Format PubMed results into JSON format suitable for LLM input.

        Args:
            results (Dict[str, List[Dict[str, str]]]): The detailed results from PubMed.

        Returns:
            List[Dict[str, str]]: A list of dictionaries containing abstracts and citations for LLM input.
        """
        formatted_results = []

        for section, papers in results.items():
            for paper in papers:
                abstract = paper.get("abstract")
                if not abstract:
                    continue

                citation = self.format_citation(paper)
                formatted_results.append({
                    "section": section,
                    "title": paper.get("title", "Unknown Title"),
                    "abstract": abstract,
                    "citation": citation,
                })

        return formatted_results


# pubmed_client = PubMedAPI(api_key=None)

# # Query PubMed
# results = pubmed_client.query(example_json[:1])

# # # Display results in the terminal
# # pubmed_client.display_results(results)

# # Format results for JSON output
# formatted_results = pubmed_client.format_results(results)
# print(json.dumps(formatted_results, indent=2))
 

In [153]:
example_json[:1]

[{'section': 'Overview', 'query': 'Impact of gut microbiota on mental health'}]

In [160]:
import time
import requests
from typing import List, Dict
from rich import print as rprint
import json


class SemanticScholarAPI:
    def __init__(self, api_key: str, sleep_time: float = 1.0):
        """
        Initialize the SemanticScholarAPI class.

        Args:
            api_key (str): Your Semantic Scholar API key.
            sleep_time (float): Time to wait between API requests to avoid rate-limiting.
        """
        self.api_key = api_key
        self.sleep_time = sleep_time
        self.search_url = "https://api.semanticscholar.org/graph/v1/paper/search"
        self.batch_url = "https://api.semanticscholar.org/graph/v1/paper/batch"
        self.headers = {"x-api-key": self.api_key}

    def query(self, queries: List[Dict[str, str]]) -> Dict[str, List[Dict[str, str]]]:
        """
        Query Semantic Scholar with search queries and fetch detailed information in batches.

        Args:
            queries (List[Dict[str, str]]): List of queries with sections and query text.

        Returns:
            Dict[str, List[Dict[str, str]]]: Results for each section with detailed paper information.
        """
        results = {}

        for query in queries:
            section = query["section"]
            search_query = query["query"]
            search_params = {
                "query": search_query,
                "limit": 10,  # Adjust the limit based on your requirements
            }

            search_response = requests.get(self.search_url, headers=self.headers, params=search_params)
            if search_response.status_code == 200:
                search_data = search_response.json()
                paper_ids = [paper["paperId"] for paper in search_data.get("data", [])]

                if paper_ids:
                    time.sleep(self.sleep_time)
                    # rprint(f"[green]Fetching details for {len(paper_ids)} papers in section '{section}'[/green]")
                    details = self._query_batch(paper_ids)
                    results[section] = details
                else:
                    # print(f"[yellow]No papers found for query: '{search_query}'[/yellow]")
                    results[section] = []
            else:
                rprint(f"[red]Error querying Semantic Scholar for '{search_query}': {search_response.status_code}[/red]")
                results[section] = []

            time.sleep(self.sleep_time)

        return results

    def _query_batch(self, paper_ids: List[str]) -> List[Dict[str, str]]:
        """
        Query the Semantic Scholar batch endpoint for detailed information.

        Args:
            paper_ids (List[str]): List of paper IDs to query.

        Returns:
            List[Dict[str, str]]: A list of paper details with the requested fields.
        """
        fields = "title,abstract,authors,citationCount,referenceCount,url,venue,publicationVenue,year,openAccessPdf"
        payload = {"ids": paper_ids}
        params = {"fields": fields}

        response = requests.post(self.batch_url, headers=self.headers, params=params, json=payload)
        if response.status_code == 200:
            return response.json()
        else:
            rprint(f"[red]Error querying batch endpoint: {response.status_code}[/red]")
            rprint(response.text)
            return []

    def format_citation(self, paper: Dict) -> str:
        """
        Format the citation for a paper using available metadata from Semantic Scholar.

        Args:
            paper (Dict): The paper metadata returned by Semantic Scholar.

        Returns:
            str: Formatted citation with URL or DOI.
        """
        authors = ", ".join(author.get("name", "Unknown") for author in paper.get("authors", [])[:3])
        if len(paper.get("authors", [])) > 3:
            authors += " et al."

        title = paper.get("title", "Unknown Title")
        year = paper.get("year", "Unknown Year")
        venue = paper.get("venue", "Unknown Venue")

        publication_venue = paper.get("publicationVenue") or {}  # Default to empty dict if None
        publication_name = publication_venue.get("name", venue)
        publication_url = publication_venue.get("url", "")

        doi = paper.get("externalIds", {}).get("DOI", None)
        open_access_pdf = paper.get("openAccessPdf") or {}
        pdf_url = open_access_pdf.get("url", None)
        general_url = paper.get("url", None)

        citation = f"{authors}. \"{title}\" ({year}). Published in {publication_name}."

        if doi:
            citation += f" DOI: {doi}."
        elif pdf_url:
            citation += f" Open Access PDF: {pdf_url}."
        elif general_url:
            citation += f" Available at: {general_url}."

        if publication_url:
            citation += f" Publication Info: {publication_url}."

        return citation

    def display_results(self, results: Dict[str, List[Dict[str, str]]]):
        """
        Display the Semantic Scholar results in a readable format, including abstracts and citations.

        Args:
            results (Dict[str, List[Dict[str, str]]]): The detailed results from Semantic Scholar.
        """
        for section, papers in results.items():
            rprint(f"\n[blue bold]Section: {section}[/blue bold]")
            for i, paper in enumerate(papers):
                abstract = paper.get("abstract")
                if abstract:
                    rprint(f"  [yellow][{i+1}]Abstract:[/yellow] {abstract[:300]}...")
                else:
                    rprint(f"  [red][{i+1}]Abstract:[/red] No abstract available.")

                citation = self.format_citation(paper)
                rprint(f"  [green][{i+1}]Citation:[/green] {citation}")

    def format_results(self, results: Dict[str, List[Dict[str, str]]]) -> List[Dict[str, str]]:
        """
        Format Semantic Scholar results into JSON format suitable for LLM input.

        Args:
            results (Dict[str, List[Dict[str, str]]]): The detailed results from Semantic Scholar.

        Returns:
            List[Dict[str, str]]: A list of dictionaries containing abstracts and citations for LLM input.
        """
        formatted_results = []

        for section, papers in results.items():
            for paper in papers:
                abstract = paper.get("abstract")
                if not abstract:
                    continue

                citation = self.format_citation(paper)
                formatted_results.append({
                    "section": section,
                    "title": paper.get("title", "Unknown Title"),
                    "abstract": abstract,
                    "citation": citation,
                })

        return formatted_results


# example_queries = [
#     {"section": "Overview", "query": "Impact of gut microbiota on mental health"},
#     {"section": "Key Facts", "query": "Prevalence of gut microbiota imbalances"},
#     {"section": "Causes", "query": "Factors influencing gut microbiota dysbiosis"}
# ]

# 

# # Initialize the SemanticScholarAPI class
# semantic_scholar_client = SemanticScholarAPI(api_key=SEMANTIC_SCHOLAR_API_KEY)

# # Query Semantic Scholar
# results = semantic_scholar_client.query(example_queries[:1])

# # Display results
# semantic_scholar_client.display_results(results)

# # Format results for JSON output
# formatted_results = semantic_scholar_client.format_results(results)
# print(json.dumps(formatted_results, indent=2))


In [161]:
def integrate_papers(article: str, papers: List[dict], model: GenerativeModel) -> str:
    """
    Integrate relevant scientific papers into the provided article using the model.

    Args:
        article (str): The initial article or outline to be enhanced.
        papers (List[dict]): A list of scientific paper details to integrate.
        model (GenerativeModel): The generative model instance for processing.

    Returns:
        str: The revised and complete article with integrated references.
    """
    
    prompt = f"""
    You are a professional scientific writer tasked with integrating references into an existing article to enhance its credibility, depth, and clarity.

    TASK:
    - Review the provided article and the list of references (papers).
    - Carefully integrate the references into the article by linking them directly to relevant claims, data, or insights.
    - Maintain all existing content, hyperlinks, and citations in the article. Do not remove or replace any existing links or references unless explicitly instructed.
    - Expand on claims where appropriate using data or findings from the provided references to enhance the article's authority and comprehensiveness.

    ARTICLE:
    {article}

    REFERENCES:
    {json.dumps(papers, indent=2)}

    GUIDELINES:
    1. Ensure each reference is used meaningfully and is directly linked to a relevant point in the article.
    2. Do not introduce vague or generic references (e.g., "[1] Previously mentioned links").
    3. Preserve all existing hyperlinks and citations; only add new ones where necessary.
    4. Clearly hyperlink references in-line with markdown formatting, ensuring a seamless integration into the text.
    5. Summarize or quote key findings from the references in context, ensuring the article remains cohesive and natural in tone.
    6. Do not add redundant information or alter the core focus or flow of the article.

    OUTPUT REQUIREMENTS:
    - Return only the revised article with integrated references in professional markdown formatting.
    - Ensure all citations and references are formatted consistently and linked appropriately.
    - Avoid any extraneous commentary or explanations in your response.
    - Retain the article's clarity, professional tone, and readability.
    """
    response = model.generate_content(prompt)
    return response.text


In [None]:
from datetime import datetime  # Import the `datetime` class from the `datetime` module
import os

output_dir = '/Users/vince/Salk/PaperGeneration/data/output'

def save_results(article, topic, output_dir=output_dir):
    """Saves the generated article to a markdown file.

    Args:
        article (str): The generated article text.
        topic (str): The topic of the article. Used in the filename.
        output_dir (str): The directory to save the results.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # Use the `datetime` class correctly
    filename = f"{topic.replace(' ', '_')}_{timestamp}.md"  # Generate a robust filename
    filepath = os.path.join(output_dir, filename)

    try:
        os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

        with open(filepath, 'w') as f:
            f.write(article)

        print(f"Results saved to: {filepath}")

    except Exception as e:
        print(f"Error saving results: {e}")

# # Test the function
# save_results("This is a test article.", "Mosquito-borne Diseases", output_dir)


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Mosquito-borne_Diseases_20241205_141634.md


In [157]:
import json

uptodate_results = "search_results.json"

try:
    with open(uptodate_results, 'r') as f:
        data = json.load(f)
    # Now 'data' holds the contents of the JSON file as a Python dictionary or list.
    print(data) # Or do something else with the loaded data.

except FileNotFoundError:
    print(f"Error: File '{uptodate_results}' not found.")
except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in '{uptodate_results}'.")

[{'query': 'Mosquito Bites', 'results': [{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/insect-allergy/allergic-reactions-to-mosquito-bites.md', 'score': 20.802518981821166}, {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/emergency-medicine-adult-and-pediatric/adult-environmental-emergencies/insect-and-other-arthropod-bites.md', 'score': 20.42948032602174}, {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/emergency-medicine-adult-and-pediatric/pediatric-environmental-emergencies/insect-and-other-arthropod-bites.md', 'score': 20.42948032602174}, {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/insect-and-other-arthropod-bites.md', 'score': 20.42948032602174}, {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/insect-allergy/insect-and-other-arthropod-bites.md', 'score': 20.429

In [None]:
import os 
from rich import print as rprint

i = 0
topic_query = data[i]

topic = topic_query['query']
print(f"[{i+1}/{len(data)}] Topic: {topic}")
print("Generating Outline...")
outline = generate_outline(topic, model)
rprint(outline)



[1/271] Topic: Mosquito Bites
Generating Outline...


In [117]:
import os
print("\nFetching Uptodate Article...")
results = topic_query['results']
if results:  # Check if results is not empty
    uptodate_path = results[0].get('path').replace('.md','.json') # Use .get to handle missing 'path' key
    if uptodate_path and os.path.exists(uptodate_path): #Check if path exists
        try:
            with open(uptodate_path, 'r') as f:
                uptodate_markdown = f.read()
            print("Integrating Uptodate Article...")
            refined_outline = refine_outline_with_uptodate(topic, outline, uptodate_markdown, model)
        except Exception as e:  # Catch specific exceptions and print the error message
            print(f"Error reading Uptodate article: {e}")
    else:
        print(f"Error: Uptodate path is invalid or missing: {uptodate_path}")

else:
    print("Error: No results found in topic_query.")





Fetching Uptodate Article...
Integrating Uptodate Article...


In [118]:
rprint(refined_outline)

In [130]:

print("\nGenerating Search Queries...")
queries = generate_search_query_response(outline, model)





Generating Search Queries...


In [138]:
parsed_queries = [q.model_dump() for q in parse_search_queries(queries)]
parsed_queries

[{'section': 'Overview', 'query': 'Global impact of mosquito-borne diseases'},
 {'section': 'Key Facts', 'query': 'Mosquitoes leading cause of human death'},
 {'section': 'Key Facts', 'query': 'Prevalence of mosquito bites'},
 {'section': 'Symptoms',
  'query': 'Large local reactions to mosquito bites and immune response'},
 {'section': 'Types', 'query': 'Mosquito species and disease transmission'},
 {'section': 'Causes', 'query': 'Mosquito saliva and immune response'},
 {'section': 'Risk Factors', 'query': 'Blood type and mosquito attraction'},
 {'section': 'Risk Factors', 'query': 'Genetics and mosquito attraction'},
 {'section': 'Complications', 'query': 'Mosquito bite complications'},
 {'section': 'Alternative Therapies',
  'query': 'Essential oils for mosquito bite relief'},
 {'section': 'FAQs',
  'query': 'Factors influencing mosquito bite susceptibility'}]

In [139]:
for query in parsed_queries:
    print(query['section'])

Overview
Key Facts
Key Facts
Symptoms
Types
Causes
Risk Factors
Risk Factors
Complications
Alternative Therapies
FAQs


In [142]:

print("\nFetching PubMed Results...")
pubmed_client = PubMedAPI(api_key=None)
pubmed_results = pubmed_client.query(parsed_queries)
pubmed_results = pubmed_client.format_results(pubmed_results)


Fetching PubMed Results...
No results found for query: 'Essential oils for mosquito bite relief'


In [144]:
pubmed_results

[{'section': 'Overview',
  'title': 'The 1,7-malaria reactive community-based testing and response (1,7-mRCTR) approach in Tanzania: a cost-effectiveness analysis.',
  'abstract': 'Reactive case detection (RACD) for malaria control has been found effective in low transmission settings, but its impact and cost-effectiveness in moderate-to-high transmission settings are unknown. We conducted an economic evaluation alongside an empirical trial of a modified RACD strategy (1,7-mRCTR) in three moderate-to-high malaria transmission districts in Tanzania.',
  'citation': 'Tampi Radhika Pradip, Wang Duoquan, Abdulla Salim et al.. "The 1,7-malaria reactive community-based testing and response (1,7-mRCTR) approach in Tanzania: a cost-effectiveness analysis." (2024). Published in Infectious diseases of poverty. DOI: 10.1186/s40249-024-01261-w. Available at: https://pubmed.ncbi.nlm.nih.gov/39633463/.'},
 {'section': 'Overview',
  'title': 'Innovative sterile male release strategies for Aedes mosqu

In [143]:


print("\nFetching Semantic Scholar Results...")
semantic_scholar_client = SemanticScholarAPI(api_key=SEMANTIC_SCHOLAR_API_KEY)
semantic_results = semantic_scholar_client.query(parsed_queries)
semantic_results = semantic_scholar_client.format_results(semantic_results)




Fetching Semantic Scholar Results...


In [145]:
semantic_results

[{'section': 'Overview',
  'title': 'Impact of mosquito-borne diseases on global public health',
  'abstract': 'Mosquito-borne diseases are a concern of global public health and safety. Risk of mosquito- borne illness varies greatly with occupation, age, ethnicity, gender, income status, travel frequency, and climate change. Those at most risk of being infected by a mosquito-borne disease include frequent travelers, healthcare personnel, laboratory workers, and those whose occupation is mostly conducted outside or in wooded areas. Many people become sick after being bitten by an infected mosquito. Some people have mild short-term illness and in cases some people experience long-term illnesses. Severe cases of mosquito-borne diseases have resulted in death. Recommendations include investing in climate change, increasing the availability of vaccines globally, the use of health and mosquito surveillance systems, and forming regional and international action plans for disease control in th

In [146]:
print("\nIntegrating Papers...")
papers = pubmed_results + semantic_results
final_article = integrate_papers(outline, papers, model)



Integrating Papers...


In [147]:
rprint(final_article)

In [163]:
import os
from rich import print as rprint

# Function to process a single topic query
def process_topic_query(i, topic_query, model, pubmed_client, semantic_scholar_client):
    try:
        topic = topic_query['query']
        rprint(f"[{i+1}/{len(data)}] Topic: {topic}")
        rprint("Generating Outline...")
        outline = generate_outline(topic, model)

        topic_dir = os.path.join(output_dir, topic.replace(' ', '_'))
        os.makedirs(topic_dir, exist_ok=True)
        save_results(outline, topic + "_outline", topic_dir)

        # Fetch and integrate Uptodate article
        uptodate_markdown = None
        results = topic_query.get('results', [])
        if results:
            uptodate_path = results[0].get('path', '').replace('.md', '.json')
            if uptodate_path and os.path.exists(uptodate_path):
                with open(uptodate_path, 'r') as f:
                    uptodate_markdown = f.read()
                rprint("Integrating Uptodate Article...")
                outline = refine_outline_with_uptodate(topic, outline, uptodate_markdown, model)
                save_results(outline, topic + "_uptodate", topic_dir)
            else:
                rprint(f"[yellow]Warning: Uptodate path is invalid or missing: {uptodate_path}[/yellow]")
        else:
            rprint("[yellow]Warning: No results found in topic_query.[/yellow]")

        # Generate search queries
        rprint("Generating Search Queries...")
        queries = generate_search_query_response(outline, model)
        parsed_queries = [q.model_dump() for q in parse_search_queries(queries)]

        # Fetch PubMed results
        rprint("Fetching PubMed Results...")
        pubmed_results = pubmed_client.query(parsed_queries)
        pubmed_results = pubmed_client.format_results(pubmed_results)

        # Fetch Semantic Scholar results
        rprint("Fetching Semantic Scholar Results...")
        semantic_results = semantic_scholar_client.query(parsed_queries)
        semantic_results = semantic_scholar_client.format_results(semantic_results)

        # Integrate papers into the article
        rprint("Integrating Papers...")
        papers = pubmed_results + semantic_results
        final_article = integrate_papers(outline, papers, model)
        save_results(final_article, topic + "_final", topic_dir)

        rprint(f"[green]Processing complete for topic: {topic}[/green]")
        return final_article

    except Exception as e:
        rprint(f"[red]Error processing topic {i+1}: {e}[/red]")
        return None

# Initialize clients and iterate over data
pubmed_client = PubMedAPI(api_key=None)
semantic_scholar_client = SemanticScholarAPI(api_key=os.getenv("SEMANTIC_SCHOLAR_API_KEY"))

all_articles = []

for i, topic_query in enumerate(data):
    rprint(f"\n[cyan bold]Processing Topic {i+1}[/cyan bold]")
    final_article = process_topic_query(i, topic_query, model, pubmed_client, semantic_scholar_client)
    if final_article:
        all_articles.append(final_article)
        # save_results(final_article, topic_query['query'], output_dir)
    if i == 5:
        break


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Mosquito_Bites/Mosquito_Bites_outline_20241205_141720.md


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Mosquito_Bites/Mosquito_Bites_uptodate_20241205_141806.md


No results found for query: 'Mosquito bite reactions in newcomers to endemic areas'
No results found for query: 'Oral antihistamines for mosquito bite relief'
No results found for query: 'Use of oral glucocorticoids for severe mosquito bite reactions'
No results found for query: 'Home remedies for mosquito bite itching'


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Mosquito_Bites/Mosquito_Bites_final_20241205_142543.md


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Tick_Bites/Tick_Bites_outline_20241205_142605.md


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Tick_Bites/Tick_Bites_uptodate_20241205_142630.md


No results found for query: 'Correlation tick bites occupational hazards'


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Tick_Bites/Tick_Bites_final_20241205_142909.md


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Spider_Bites/Spider_Bites_outline_20241205_142930.md


Results saved to: /Users/vince/Salk/PaperGeneration/data/output/Spider_Bites/Spider_Bites_uptodate_20241205_142957.md


KeyboardInterrupt: 

In [18]:
import google.generativeai as genai
import os

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel('models/gemini-1.5-flash-002')


response = model.generate_content(contents="Who won Wimbledon this year?",
                                  tools='google_search_retrieval')

print(response)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Carlos Alcaraz won the men's singles title at Wimbledon 2024, defeating Novak Djokovic in the final with a score of 6-2, 6-2, 7-6 (7-4).  This was Alcaraz's second consecutive Wimbledon victory and his fourth Grand Slam title overall.  He is the sixth (and youngest) man in the Open Era to achieve a Channel Slam (winning both Wimbledon and the French Open in the same year).\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "grounding_metadata": {
            "search_entry_point": {
              "rendered_content": "<style>\n.container {\n  align-items: center;\n  border-radius: 8px;\n  display: flex;\n  font-family: Google Sans, Roboto, sans-serif;\n  font-size: 14px;\n  line-height: 20px;

In [None]:

def integrate_papers(topic:str, article: str, papers: List[dict], model: GenerativeModel) -> str:
    """
    Integrate relevant scientific papers into the provided article using the model.

    Args:
        topic (str): The topic of the knowledgebase article.
        article (str): The article to be enhanced.
        papers (List[dict]): A list of scientific paper details to integrate.
        model (GenerativeModel): The generative model instance for processing.

    Returns:
        str: The revised and complete article with integrated references.
    """

    prompt = f"""
You are a professional scientific writer tasked with integrating relevant references for an existing knowledgebase article (ARTICLE) on the topic: '{topic}'.

### TASK:
- For each section in the ARTICLE:
  1. Identify key claims or statements that could be enhanced with additional references.
  2. Review the provided references (PAPERS) to find data or findings that directly support or expand upon the claim.
  3. Add citations or expand the section as necessary using a professional, concise tone.

### GUIDELINES:
1. **Relevance:** Use papers selectively. Prioritize those with abstracts or conclusions explicitly aligned with the article's claims or topics.
2. **Citation Style:** Use APA-style citations with DOI hyperlinks wherever possible.
3. **Preservation:** Do not remove existing references, hyperlinks, or content unless explicitly improving accuracy or clarity.
4. **Clarity and Depth:** Expand claims with data or findings from references where appropriate, but avoid redundancy or irrelevant information.
5. **Formatting:** Preserve the article’s structure, markdown formatting, and cohesive flow.
6. **Error Handling:** If no papers are relevant to a section, skip their inclusion.


### OUTPUT REQUIREMENTS:
- Return only the revised article in markdown format with integrated references.
- Ensure all citations and references are consistently formatted and properly linked.
- Do not add any extraneous commentary or notes outside of the revised article.

### INPUT

<ARTICLE>
{article}
</ARTICLE>

<PAPERS>
{json.dumps(papers, indent=2)}
</PAPERS>
    """

    return prompt

topic = "Cavities (Tooth Decay)"

topic_dir = "/Users/vince/Salk/PaperGeneration/data/output/Cavities_(Tooth_Decay)"

papers_file = os.path.join(topic_dir, "Cavities_(Tooth_Decay)_papers_20241205_173546.md")

with open(papers_file, 'r') as f:
    papers = json.load(f) 

# Load markdown  
uptodate_file = os.path.join(topic_dir, "Cavities_(Tooth_Decay)_uptodate_20241205_173423.md")
with open(uptodate_file, 'r') as f:
    article = f.read()

prompt_result = integrate_papers(topic, article, papers, model)
prompt_result


'\n    You are a professional scientific writer tasked with integrating relevant references for an existing knowledgebase article on the topic: \'Cavities (Tooth Decay)\'.\n\n    GOALS:\n    - Use the provided references (papers) as supplementary information to enhance the article where applicable.\n    - Prioritize maintaining the article\'s existing structure, references, and hyperlinks. Do not remove or replace any existing references or content unless explicitly improving accuracy or clarity.\n    - If none of the provided papers are directly relevant to a section, do not force their inclusion. Instead, treat the references as additional resources.\n\n    TASK:\n    - Review the provided article and the list of references (papers).\n    - Integrate relevant findings from the references to enhance specific claims or sections in the article.\n    - Clearly hyperlink references in-line using markdown formatting to ensure a seamless integration into the text.\n    - Maintain a professi

In [174]:
rprint(prompt_result)



In [175]:
file = "/Users/vince/Salk/PaperGeneration/data/output/Type_2_diabetes/Type_2_diabetes_papers_20241209_125004.md"

with open(file, 'r') as f:
    papers = json.load(f)

rprint(papers)