In [1]:
import os 
import json
import datetime
import pandas as pd 

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs on a machine
# where this exact file exists; consider deriving it from a configurable data directory.
file_path = "/Users/vince/Salk/PaperGeneration/data/output/Food_Allergy/Food_Allergy_queries_20250113_140951.json"

# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the
# decode independent of the platform's locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# The loaded JSON object keeps its list of search queries under the "root" key;
# each entry is a dict with "section" and "query" keys (see the displayed output).
parsed_queries = data['root']
parsed_queries

[{'section': 'overview', 'query': 'food allergy prevalence adults children'},
 {'section': 'key_facts', 'query': 'common food allergens'},
 {'section': 'key_facts', 'query': 'food allergy anaphylaxis symptoms'},
 {'section': 'key_facts', 'query': 'food allergy genetic predisposition'},
 {'section': 'types', 'query': 'IgE mediated food allergy'},
 {'section': 'types', 'query': 'non IgE mediated food allergy'},
 {'section': 'types', 'query': 'delayed onset food allergy'},
 {'section': 'causes', 'query': 'food allergy immune response mechanism'},
 {'section': 'causes',
  'query': 'food allergy genetic factors environmental factors'},
 {'section': 'causes', 'query': 'food allergy gut microbiota'},
 {'section': 'causes', 'query': 'food allergy immune tolerance'},
 {'section': 'causes', 'query': 'cesarean delivery food allergy risk'},
 {'section': 'causes', 'query': 'high fat diet food allergy risk'},
 {'section': 'risk_factors', 'query': 'family history food allergy'},
 {'section': 'risk_fa

In [4]:
import sys
import os

# Resolve the directory one level above the current working directory.
parent_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

# Extend the import search path only when the entry is missing, so re-running
# this cell never appends a duplicate.
if sys.path.count(parent_path) == 0:
    sys.path.append(parent_path)

# Now you can import your modules
from src.semanticscholar import SemanticScholarAPI

In [5]:
from dotenv import load_dotenv
import logging
from rich.logging import RichHandler

LOG_FILE_NAME = "debug.log"


def setup_logger(output_dir: str) -> logging.Logger:
    """Configure the pipeline logger to write to both the console and a file.

    The logger is the named ``"PaperGenerationPipeline"`` logger, so calling this
    function again (for example by re-running the notebook cell) returns the same
    object. Handlers left over from an earlier call are removed and closed first;
    otherwise each call would stack another console/file pair and every message
    would be emitted several times.

    Args:
        output_dir: Directory that receives the log file (``LOG_FILE_NAME``).
            It is created if it does not exist.

    Returns:
        The configured logger, carrying exactly one console handler and one
        file handler, both at INFO level.
    """
    log_file = os.path.join(output_dir, LOG_FILE_NAME)
    os.makedirs(output_dir, exist_ok=True)

    logger = logging.getLogger("PaperGenerationPipeline")
    logger.setLevel(logging.INFO)

    # logging.getLogger hands back the same instance for the same name, so drop
    # whatever a previous call attached (and release its file handle) before
    # adding fresh handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    log_format = "%(asctime)s [%(levelname)s] %(message)s"

    # Console Handler
    console_handler = RichHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(console_handler)

    # File Handler — explicit UTF-8 so non-ASCII text in log messages is written
    # the same way regardless of the platform's locale encoding.
    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(file_handler)

    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger

# Load variables from a .env file into the process environment before the
# API key is read below.
load_dotenv()

# Console + file logging, with the log file placed in the current working directory.
logger = setup_logger(os.getcwd())

JOURNALS_FILE = "/Users/vince/Salk/PaperGeneration/data/journals_df.csv"
# None when the variable is not set in the environment.
SEMANTIC_SCHOLAR_API_KEY = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")

# Build the API client and hand it the journal data file.
semantic_client = SemanticScholarAPI(SEMANTIC_SCHOLAR_API_KEY, logger=logger)
semantic_client.load_journal_sjr_data(JOURNALS_FILE)

In [6]:
# Send only the first ten parsed queries through the Semantic Scholar client, then
# convert the raw results with format_results (displayed further down as Paper objects).
# NOTE(review): the meaning of the leading positional argument `1` is not visible
# in this notebook — confirm against SemanticScholarAPI.query.
# Top-level `await` relies on the notebook's async-aware cell execution.
semantic_results = await semantic_client.query(1, parsed_queries[:10])
papers = semantic_client.format_results(semantic_results)

In [7]:
# Inspect the formatted results. NOTE(review): this renders every paper in full,
# abstracts included; a slice such as papers[:3] would keep the output short.
papers

[Paper(section='overview', query='food allergy prevalence adults children', title='The Prevalence, Severity, and Distribution of Childhood Food Allergy in the United States', abstract='OBJECTIVE: The goal of this study was to better estimate the prevalence and severity of childhood food allergy in the United States. METHODS: A randomized, cross-sectional survey was administered electronically to a representative sample of US households with children from June 2009 to February 2010. Eligible participants included adults (aged 18 years or older) able to complete the survey in Spanish or English who resided in a household with at least 1 child younger than 18 years. Data were adjusted using both base and poststratification weights to account for potential biases from sampling design and nonresponse. Data were analyzed as weighted proportions to estimate prevalence and severity of food allergy. Multiple logistic regression models were constructed to identify characteristics significantly a

In [8]:
# Reduce the formatted papers to a shortlist. The selection criteria live in
# SemanticScholarAPI.select_top_papers and are not visible in this notebook.
top_papers = semantic_client.select_top_papers(papers)

In [13]:
# Size of the shortlist returned by select_top_papers.
len(top_papers)

50

In [9]:
# Print the first ten papers with rich's print function instead of the plain repr.
from rich import print as rprint
rprint(papers[:10])

In [24]:
# Rank a copy of the papers from most-cited to least-cited; `papers` itself
# is left in its original order.
sorted_papers = list(papers)
sorted_papers.sort(key=lambda entry: entry.citationCount, reverse=True)
sorted_papers

[Paper(section='overview', query='food allergy prevalence adults children', title='The Prevalence, Severity, and Distribution of Childhood Food Allergy in the United States', abstract='OBJECTIVE: The goal of this study was to better estimate the prevalence and severity of childhood food allergy in the United States. METHODS: A randomized, cross-sectional survey was administered electronically to a representative sample of US households with children from June 2009 to February 2010. Eligible participants included adults (aged 18 years or older) able to complete the survey in Spanish or English who resided in a household with at least 1 child younger than 18 years. Data were adjusted using both base and poststratification weights to account for potential biases from sampling design and nonresponse. Data were analyzed as weighted proportions to estimate prevalence and severity of food allergy. Multiple logistic regression models were constructed to identify characteristics significantly a

In [11]:
# Total number of parsed queries loaded from the JSON file.
len(parsed_queries)

34

In [27]:
# Serialise the citation-ranked papers: each model is dumped in JSON mode, and
# the resulting list is encoded as a single indented JSON string.
papers_json = json.dumps(
    [entry.model_dump(mode="json") for entry in sorted_papers],
    indent=2,
)
papers_json

'[\n  {\n    "section": "overview",\n    "query": "food allergy prevalence adults children",\n    "title": "The Prevalence, Severity, and Distribution of Childhood Food Allergy in the United States",\n    "abstract": "OBJECTIVE: The goal of this study was to better estimate the prevalence and severity of childhood food allergy in the United States. METHODS: A randomized, cross-sectional survey was administered electronically to a representative sample of US households with children from June 2009 to February 2010. Eligible participants included adults (aged 18 years or older) able to complete the survey in Spanish or English who resided in a household with at least 1 child younger than 18 years. Data were adjusted using both base and poststratification weights to account for potential biases from sampling design and nonresponse. Data were analyzed as weighted proportions to estimate prevalence and severity of food allergy. Multiple logistic regression models were constructed to identif

In [None]:
# NOTE(review): papers_json is the str produced by json.dumps, so the AttributeError
# shown under this cell ('list' object has no attribute 'model_dump_json') cannot come
# from this line. It looks like stale output from an earlier version of the cell —
# re-run the cell or clear its output.
papers_json

AttributeError: 'list' object has no attribute 'model_dump_json'

In [29]:
# src/models.py

from pydantic import BaseModel, Field, HttpUrl, RootModel
from typing import List, Optional, Union

from langchain_core.output_parsers import PydanticOutputParser


# Search queries response format.
# One search query tied to a section of the outline; both fields are required strings.
# NOTE: documented with `#` comments on purpose — a class docstring on a pydantic
# model is emitted as the schema "description", which would change the text
# returned by the parser's get_format_instructions().
class SearchQuery(BaseModel):
    section: str = Field(..., description="The section of the outline the query corresponds to.")
    query: str = Field(..., description="The search query for this section.")


# Object-shaped wrapper: the list of queries sits under a required "root" field.
# NOTE: the docstring below is emitted as the schema "description" in the parser's
# format instructions, so editing it changes the generated prompt text.
class SearchQueryList(BaseModel):
    """
    Wrapper class for a list of SearchQuery objects.
    """
    root: List[SearchQuery] = Field(..., description="List of search queries.")


# Build an output parser for SearchQueryList and display the format instructions
# (JSON-schema text) it generates.
# NOTE(review): the name `parser` is rebound to a ReferenceItem parser in a later
# cell, so any cell that reads `parser` depends on which assignment ran last.
parser = PydanticOutputParser(pydantic_object=SearchQueryList)

parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"SearchQuery": {"properties": {"section": {"description": "The section of the outline the query corresponds to.", "title": "Section", "type": "string"}, "query": {"description": "The search query for this section.", "title": "Query", "type": "string"}}, "required": ["section", "query"], "title": "SearchQuery", "type": "object"}}, "description": "Wrapper class for a list of SearchQuery objects.", "properties": {"root": {"description": "List of search queries.", "items": {"$ref": "#/$defs/SearchQuery"}, "title": "Root", "type"

In [30]:

# Repeats the call from the end of the previous cell; the result depends on which
# `parser` assignment ran most recently in the kernel.
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"SearchQuery": {"properties": {"section": {"description": "The section of the outline the query corresponds to.", "title": "Section", "type": "string"}, "query": {"description": "The search query for this section.", "title": "Query", "type": "string"}}, "required": ["section", "query"], "title": "SearchQuery", "type": "object"}}, "description": "Wrapper class for a list of SearchQuery objects.", "properties": {"root": {"description": "List of search queries.", "items": {"$ref": "#/$defs/SearchQuery"}, "title": "Root", "type"

In [31]:
# One bibliography entry. Only reference_number is required; every other field
# defaults to None, and year accepts either an int or a str.
# NOTE: kept as `#` comments rather than a class docstring — pydantic copies a model
# docstring into the schema "description", which would alter the parser's
# format instructions.
class ReferenceItem(BaseModel):
    reference_number: int = Field(..., description="Sequential reference number.")
    authors: Optional[str] = Field(None, description="Author(s) of the reference.")
    year: Optional[Union[int, str]] = Field(None, description="Publication year.")
    title: Optional[str] = Field(None, description="Title of the reference.")
    journal_source: Optional[str] = Field(None, description="Journal or source of publication.")
    url_doi: Optional[str] = Field(None, description="URL or DOI of the reference.")

# Rebinds `parser`: from this point it describes ReferenceItem rather than
# SearchQueryList, and the displayed format instructions change accordingly.
parser = PydanticOutputParser(pydantic_object=ReferenceItem)


parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"reference_number": {"description": "Sequential reference number.", "title": "Reference Number", "type": "integer"}, "authors": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Author(s) of the reference.", "title": "Authors"}, "year": {"anyOf": [{"type": "integer"}, {"type": "string"}, {"type": "null"}], "default": null, "description": "Publication year.", "title": "Year"}, "title": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Title of the referenc

In [35]:
# Repeats the previous cell's call; shows the instructions for whichever model
# `parser` was last bound to (ReferenceItem in the output shown here).
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"reference_number": {"description": "Sequential reference number.", "title": "Reference Number", "type": "integer"}, "authors": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Author(s) of the reference.", "title": "Authors"}, "year": {"anyOf": [{"type": "integer"}, {"type": "string"}, {"type": "null"}], "default": null, "description": "Publication year.", "title": "Year"}, "title": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Title of the referenc