## Imports

In [2]:
import asyncio
import concurrent.futures
import json
import re
from random import Random

from openai import AsyncOpenAI, OpenAI
from tqdm import tqdm

from virtual_lab.agent import Agent
from virtual_lab.constants import CONSISTENT_TEMPERATURE, DEFAULT_FINETUNING_EPOCHS
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import (
    async_get_messages,
    compute_finetuning_cost,
    compute_token_cost,
    count_tokens,
    get_pubmed_central_article,
)

from nanobody_constants import (
    background_prompt,
    nanobody_prompt,
    discussions_phase_to_dir,
    model as base_model,
    model_mini as base_model_mini,
    generic_agent,
    immunologist,
    machine_learning_specialist,
    computational_biologist,
)

## Setup

In [3]:
# Directory layout for the fine-tuning phase: everything lives under finetuning_dir
finetuning_dir = discussions_phase_to_dir["finetuning"]
papers_dir, summaries_dir, qa_dir = (
    finetuning_dir / subdir_name for subdir_name in ("papers", "summaries", "qa_pairs")
)

# Make sure every output directory exists before any cell writes to it
for directory in (finetuning_dir, papers_dir, summaries_dir, qa_dir):
    directory.mkdir(parents=True, exist_ok=True)

# Synchronous client (file uploads, fine-tuning jobs) and asynchronous client (queries)
client, async_client = OpenAI(), AsyncOpenAI()

# Maximum number of simultaneous API requests
num_concurrent = 10

# Papers whose content exceeds this many tokens are skipped during summarization
max_tokens = 250_000

In [4]:
# Topic to agent mapping: each literature topic is assigned to the agent that
# works on it. The topic strings are interpolated into prompts and, with spaces
# replaced by underscores, into file and directory names throughout the notebook.
topic_to_agent = {
    "nanobodies": immunologist,
    "SARS-CoV-2 spike protein": immunologist,
    "SARS-CoV-2 variants KP.3 and JN.1": immunologist,
    "ESM": machine_learning_specialist,
    "AlphaFold-Multimer": computational_biologist,
    "Rosetta": computational_biologist,
}

In [5]:
# Invert topic_to_agent: each agent maps to the list of topics assigned to it,
# with agents and topics kept in their order of appearance in topic_to_agent
agent_to_topics = {}

for topic, agent in topic_to_agent.items():
    agent_to_topics.setdefault(agent, []).append(topic)

In [6]:
%autoawait asyncio

In [7]:
async def run_query(
    semaphore: asyncio.Semaphore, agent: Agent, query: str, model: str = base_model
) -> str:
    """Run a single query with an agent through the OpenAI Assistants API.

    A fresh assistant and thread are created for the query and deleted again
    afterwards, so repeated calls do not accumulate assistants in the account.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent whose title and prompt configure the assistant.
    :param query: Query to run.
    :param model: Model to use for the query.
    :return: Text of the last message in the thread after the run completed.
    :raises RuntimeError: If the run ends in any status other than "completed".
    """
    async with semaphore:
        assistant = await async_client.beta.assistants.create(
            name=agent.title, instructions=agent.prompt, model=model
        )
        thread = None

        try:
            thread = await async_client.beta.threads.create()
            await async_client.beta.threads.messages.create(
                thread_id=thread.id, role="user", content=query
            )
            run = await async_client.beta.threads.runs.create_and_poll(
                thread_id=thread.id,
                assistant_id=assistant.id,
                model=model,
                temperature=CONSISTENT_TEMPERATURE,
            )

            # Without this check, a run that produced no assistant message would
            # leave the user's own query as the last message and it would be
            # returned as if it were the model's response
            if run.status != "completed":
                raise RuntimeError(
                    f"Run for {agent.title} with {model} ended with status "
                    f"{run.status!r}: {run.last_error}"
                )

            messages = await async_get_messages(
                client=async_client, thread_id=thread.id
            )
            response = messages[-1]["content"][0]["text"]["value"]
        finally:
            # Clean up the per-query resources even when the run fails
            if thread is not None:
                await async_client.beta.threads.delete(thread.id)
            await async_client.beta.assistants.delete(assistant.id)

    return response

## PubMed Central search queries

Use the agents to generate PubMed Central search queries by topic.

In [None]:
# Run one individual meeting per topic in parallel threads; each meeting asks the
# topic's agent for five search queries and is saved as "<topic>_queries"
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_topic = {
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=agent,
            agenda=f'{background_prompt} {nanobody_prompt} You are responsible for understanding the topic "{topic}" in the context of designing nanobody binders for SARS-CoV-2. You need to fine-tune yourself on the relevant literature on {topic} to improve your ability to design SARS-CoV-2 nanobody binders. Please write out a series of five distinct search queries that you want to run to find relevant scientific papers on {topic}. Include both queries about {topic} generally as well as queries about how {topic} relates to designing nanobody binders for SARS-CoV-2. Please provide the queries in Python syntax as a list of double-quoted strings.',
            agenda_questions=(
                f"What are the queries that you want to perform to identify the relevant literature on {topic} (as a list of double-quoted strings in Python syntax)?",
            ),
            save_dir=finetuning_dir,
            save_name=f"{topic.replace(' ', '_')}_queries",
            temperature=CONSISTENT_TEMPERATURE,
        ): topic
        for topic, agent in topic_to_agent.items()
    }
    concurrent.futures.wait(future_to_topic)

# wait() never re-raises exceptions from the worker threads, so report them
# explicitly instead of letting failed meetings go unnoticed
for future, topic in future_to_topic.items():
    error = future.exception()

    if error is not None:
        print(f"Query generation failed for {topic}: {error!r}")

Extract the search queries from the agent responses.

In [7]:
# Set up regex pattern for extracting a list of double-quoted strings
query_list_pattern = re.compile(r'\[\s*(".*?"\s*(,\s*".*?"\s*)*)?,?\s*\]')

# A trailing comma before the closing bracket is valid Python syntax and is
# accepted by the pattern above, but json.loads rejects it
trailing_comma_pattern = re.compile(r",\s*\]\Z")

topic_to_queries = {}

for topic, agent in topic_to_agent.items():
    # Get query path for topic
    query_path = finetuning_dir / f"{topic.replace(' ', '_')}_queries.json"

    # Load query discussion
    with open(query_path) as f:
        query_discussion = json.load(f)

    # Search the final message of the discussion for the list of queries
    query_message = query_discussion[-1]["message"]
    pattern_result = query_list_pattern.search(query_message)

    # Check if pattern is matched
    if pattern_result is None:
        print(f"No queries found for {query_path}")
        continue

    # Drop a trailing comma (if any) so the matched list parses as JSON
    query_list_text = trailing_comma_pattern.sub("]", pattern_result.group())

    # Extract queries
    queries = json.loads(query_list_text)
    topic_to_queries[topic] = queries

## PubMed Central papers

Have the agents find papers on PubMed Central using the search queries (100 papers per query) and evaluate them based on their abstracts.

In [None]:
for topic, agent in topic_to_agent.items():
    # Topics whose query list could not be extracted are missing from
    # topic_to_queries; skip them instead of failing with a KeyError
    queries = topic_to_queries.get(topic)

    if not queries:
        print(f"No queries available for {topic}")
        continue

    topic_name = topic.replace(" ", "_")

    # Run one individual meeting per query in parallel threads
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_save_name = {}

        for query_num, query in enumerate(queries):
            save_name = f"{topic_name}_papers_{query_num + 1}"

            # Skip queries whose results were already saved by an earlier run
            if (finetuning_dir / f"{save_name}.json").exists():
                continue

            future = executor.submit(
                run_meeting,
                meeting_type="individual",
                team_member=agent,
                agenda=f'{background_prompt} {nanobody_prompt} You are responsible for understanding the topic "{topic}" in the context of designing nanobody binders for SARS-CoV-2. You need to fine-tune yourself on the relevant literature on {topic} to improve your ability to design SARS-CoV-2 nanobody binders. Please use PubMed Central and search for relevant papers on {topic} using the query "{query}" and request 100 articles with abstracts only. Read all of the abstracts and based on each abstract individually, decide whether you want to fine-tune yourself on the full text of that paper. Include as many papers as possible, but only include papers that are directly relevant to {topic}. Please provide the PMCIDs and titles of all the papers that you wish to fine-tune yourself on as a Python dictionary mapping PMCID as a double-quoted string to title as a double-quoted string.',
                agenda_questions=(
                    "What are the PMCIDs and titles of the papers you wish to fine-tune yourself on (as a Python dictionary mapping PMCID as a double-quoted string to title as double-quoted string)?",
                ),
                save_dir=finetuning_dir,
                save_name=save_name,
                temperature=CONSISTENT_TEMPERATURE,
                pubmed_search=True,
            )
            future_to_save_name[future] = save_name

        concurrent.futures.wait(future_to_save_name)

    # wait() never re-raises exceptions from the worker threads, so report them
    # explicitly instead of letting failed meetings go unnoticed
    for future, save_name in future_to_save_name.items():
        error = future.exception()

        if error is not None:
            print(f"Paper search failed for {save_name}: {error!r}")

Extract the selected papers from the agent responses.

In [None]:
# Regex for a dictionary literal mapping double-quoted PMCIDs to double-quoted titles
pmcid_to_title_pattern = re.compile(
    r'\{\s*(".*?"\s*:\s*".*?"\s*(,\s*".*?"\s*:\s*".*?"\s*)*)?\}'
)

for topic, agent in topic_to_agent.items():
    # Per-topic accumulators: the deduplicated result plus the lowercase titles
    # and PMC IDs that have already been accepted
    title_to_pmcid = {}
    seen_titles, seen_pmcids = set(), set()
    topic_name = topic.replace(" ", "_")

    # Collect the saved paper-selection discussions for this topic
    paper_paths = sorted(finetuning_dir.glob(f"{topic_name}_papers_*.json"))

    # A topic is only processed once all five result files are present
    if len(paper_paths) != 5:
        print(f"Missing papers for {topic}")
        continue

    for paper_path in paper_paths:
        with open(paper_path) as f:
            paper_discussion = json.load(f)

        # Look for the PMCID-to-title dictionary in the second discussion entry
        match = pmcid_to_title_pattern.search(paper_discussion[1]["message"])

        if match is None:
            print(f"No papers found for {paper_path}")
            continue

        for pmcid, raw_title in json.loads(match.group()).items():
            # Normalize en and em dashes to hyphens; compare titles case-insensitively
            title = raw_title.replace("–", "-").replace("—", "-")
            title_key = title.lower()

            # Keep a paper only if neither its title nor its PMC ID was seen before
            if title_key in seen_titles or pmcid in seen_pmcids:
                continue

            title_to_pmcid[title] = pmcid
            seen_titles.add(title_key)
            seen_pmcids.add(pmcid)

    print(f"Number of papers found for {topic}: {len(title_to_pmcid):,}")

    # Persist the deduplicated title to PMC ID mapping for the topic
    with open(finetuning_dir / f"{topic_name}_title_to_pmcid.json", "w") as f:
        json.dump(title_to_pmcid, f, indent=4, sort_keys=True)

Load the PMCIDs and titles from the agent responses.

In [None]:
# Union of the PMC IDs selected for every topic
pmcids = set()

for topic in topic_to_agent:
    mapping_path = finetuning_dir / f"{topic.replace(' ', '_')}_title_to_pmcid.json"

    # Only the PMC IDs (the values of the saved mapping) are needed here
    pmcids |= set(json.loads(mapping_path.read_text()).values())

print(f"Number of unique PMCIDs: {len(pmcids):,}")

Download the papers from PubMed Central.

In [None]:
# Number of papers that were successfully retrieved and written to disk
paper_count = 0

for pmcid in tqdm(sorted(pmcids)):
    title, content = get_pubmed_central_article(pmcid=pmcid)

    # Only papers that come back with a title are counted and saved
    if title is not None:
        paper_count += 1
        paper_record = {"title": title, "content": content}

        with open(papers_dir / f"{pmcid}.json", "w") as f:
            json.dump(paper_record, f, indent=4, sort_keys=True)

print(f"Number of papers downloaded: {paper_count:,}")

## Summarize papers

Define a function that uses an agent to summarize a paper.

In [10]:
async def summarize_paper(
    semaphore: asyncio.Semaphore,
    agent: Agent,
    topic: str,
    pmcid: str,
    title: str,
    content: list[str],
) -> tuple[str, str, str]:
    """Ask an agent for a detailed, topic-focused summary of one paper.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent that writes the summary.
    :param topic: Topic the summary should focus on.
    :param pmcid: PMC ID of the paper (returned unchanged to identify the result).
    :param title: Title of the paper.
    :param content: Content of the paper as a list of text segments.
    :return: Tuple of PMC ID, title, and summary of the paper.
    """
    # The instruction comes first, followed by the paper content, with blank
    # lines separating all parts
    instruction = f'Please summarize in extreme detail the following paper titled "{title}". Please focus in particular on summarizing key insights about the topic "{topic}" in relation to designing SARS-CoV-2 nanobody binders.'
    query = "\n\n".join([instruction] + content)

    summary = await run_query(semaphore=semaphore, agent=agent, query=query)

    return pmcid, title, summary

Use the agents to summarize each paper in parallel.

In [None]:
for topic, agent in topic_to_agent.items():
    topic_name = topic.replace(" ", "_")

    # Create save directory
    topic_summary_dir = summaries_dir / topic_name
    topic_summary_dir.mkdir(parents=True, exist_ok=True)

    # Load title to PMC ID dictionary
    with open(finetuning_dir / f"{topic_name}_title_to_pmcid.json") as f:
        title_to_pmcid: dict[str, str] = json.load(f)

    # Get unique PMC IDs
    pmcids = sorted(set((title_to_pmcid.values())))

    # Load papers
    pmcid_to_paper = {}
    for pmcid in pmcids:
        paper_path = papers_dir / f"{pmcid}.json"

        if paper_path.exists():
            with open(paper_path) as f:
                paper: dict[str, str | list[str]] = json.load(f)
                pmcid_to_paper[pmcid] = paper

    print(f"Number of papers loaded for {topic}: {len(pmcid_to_paper):,}")

    # Limit papers by length
    pmcid_to_paper = {
        pmcid: paper
        for pmcid, paper in pmcid_to_paper.items()
        if sum(count_tokens(paragraph) for paragraph in paper["content"]) <= max_tokens
    }

    print(
        f"Number of papers after token limit of {max_tokens:,} for {topic}: {len(pmcid_to_paper):,}"
    )

    # Compute input token cost
    input_token_count = sum(
        count_tokens(paragraph)
        for paper in pmcid_to_paper.values()
        for paragraph in [paper["title"]] + paper["content"]
    )

    input_token_cost = compute_token_cost(
        model=base_model,
        input_token_count=input_token_count,
        output_token_count=0,
    )

    print(f"Approximate input token cost for {topic} = ${input_token_cost:.2f}")

    # Set up semaphore with the number of concurrent requests
    semaphore = asyncio.Semaphore(num_concurrent)

    # Create tasks for each paper
    tasks = [
        asyncio.create_task(
            summarize_paper(
                semaphore=semaphore,
                agent=agent,
                topic=topic,
                pmcid=pmcid,
                title=paper["title"],
                content=paper["content"],
            )
        )
        for pmcid, paper in pmcid_to_paper.items()
    ]

    # Run agent summary for each paper
    results = [
        (await task) for task in tqdm(asyncio.as_completed(tasks), total=len(tasks))
    ]

    # Save summaries
    for pmcid, title, summary in results:
        with open(topic_summary_dir / f"{pmcid}.json", "w") as f:
            json.dump(
                {"pmcid": pmcid, "title": title, "summary": summary},
                f,
                indent=4,
                sort_keys=True,
            )

Convert the summaries to the format required for fine-tuning, aggregated by topic.

In [None]:
for topic, agent in topic_to_agent.items():
    topic_name = topic.replace(" ", "_")

    # Load every saved summary for the topic in sorted path order
    summaries = []
    for summary_path in sorted((summaries_dir / topic_name).glob("*.json")):
        with open(summary_path) as f:
            summaries.append(json.load(f))

    # One chat example per paper: agent prompt, question about the paper, summary
    training_data = [
        {
            "messages": [
                {"role": "system", "content": agent.prompt},
                {
                    "role": "user",
                    "content": f'Please tell me about the paper "{summary_data["title"]}" and its insights into "{topic}" in relation to designing SARS-CoV-2 nanobody binders.',
                },
                {"role": "assistant", "content": summary_data["summary"]},
            ]
        }
        for summary_data in summaries
    ]

    # Deterministic shuffle with a fixed seed
    Random(0).shuffle(training_data)

    # Total tokens across all messages of all examples
    token_count = sum(
        count_tokens(message["content"])
        for example in training_data
        for message in example["messages"]
    )

    finetuning_cost = compute_finetuning_cost(
        model=base_model_mini,
        token_count=token_count,
        num_epochs=DEFAULT_FINETUNING_EPOCHS,
    )

    # Report dataset size and estimated cost
    print(f"Number of paper examples for {topic}: {len(training_data):,}")
    print(f"Token count for {topic}: {token_count:,}")
    print(f"Finetuning cost for {topic} using {base_model_mini}: ${finetuning_cost:.2f}")
    print()

    # Write one JSON object per line (jsonl)
    jsonl_text = "\n".join(json.dumps(example) for example in training_data)

    with open(finetuning_dir / f"{topic_name}_training_data.jsonl", "w") as f:
        f.write(jsonl_text)

Convert the summaries to the format required for fine-tuning, aggregated by agent.

In [None]:
for agent, topics in agent_to_topics.items():
    # Collect (topic, summary record) pairs across all topics of the agent,
    # topic by topic and in sorted path order within each topic
    topic_summaries = []
    for topic in topics:
        topic_summary_dir = summaries_dir / topic.replace(" ", "_")

        for summary_path in sorted(topic_summary_dir.glob("*.json")):
            with open(summary_path) as f:
                topic_summaries.append((topic, json.load(f)))

    # One chat example per paper: agent prompt, question about the paper, summary
    training_data = [
        {
            "messages": [
                {"role": "system", "content": agent.prompt},
                {
                    "role": "user",
                    "content": f'Please tell me about the paper "{summary_data["title"]}" and its insights into "{paper_topic}" in relation to designing SARS-CoV-2 nanobody binders.',
                },
                {"role": "assistant", "content": summary_data["summary"]},
            ]
        }
        for paper_topic, summary_data in topic_summaries
    ]

    # Deterministic shuffle with a fixed seed
    Random(0).shuffle(training_data)

    # Total tokens across all messages of all examples
    token_count = sum(
        count_tokens(message["content"])
        for example in training_data
        for message in example["messages"]
    )

    finetuning_cost = compute_finetuning_cost(
        model=base_model_mini,
        token_count=token_count,
        num_epochs=DEFAULT_FINETUNING_EPOCHS,
    )

    # Report dataset size and estimated cost
    print(f"Number of paper examples for {agent.title}: {len(training_data):,}")
    print(f"Token count for {agent.title}: {token_count:,}")
    print(f"Finetuning cost for {agent.title} using {base_model_mini}: ${finetuning_cost:.2f}")
    print()

    # Write one JSON object per line (jsonl)
    jsonl_text = "\n".join(json.dumps(example) for example in training_data)
    output_path = finetuning_dir / f"{agent.title.replace(' ', '_')}_training_data.jsonl"

    with open(output_path, "w") as f:
        f.write(jsonl_text)

Convert the summaries to the format required for fine-tuning, aggregated across all topics.

In [None]:
# Collect (topic, summary record) pairs across every topic, topic by topic and
# in sorted path order within each topic
topic_summaries = []

for topic in topic_to_agent:
    topic_summary_dir = summaries_dir / topic.replace(" ", "_")

    for summary_path in sorted(topic_summary_dir.glob("*.json")):
        with open(summary_path) as f:
            topic_summaries.append((topic, json.load(f)))

# One chat example per paper, all using the generic agent's prompt as system message
training_data = [
    {
        "messages": [
            {"role": "system", "content": generic_agent.prompt},
            {
                "role": "user",
                "content": f'Please tell me about the paper "{summary_data["title"]}" and its insights into "{paper_topic}" in relation to designing SARS-CoV-2 nanobody binders.',
            },
            {"role": "assistant", "content": summary_data["summary"]},
        ]
    }
    for paper_topic, summary_data in topic_summaries
]

# Deterministic shuffle with a fixed seed
Random(0).shuffle(training_data)

# Total tokens across all messages of all examples
token_count = sum(
    count_tokens(message["content"])
    for example in training_data
    for message in example["messages"]
)

finetuning_cost = compute_finetuning_cost(
    model=base_model_mini,
    token_count=token_count,
    num_epochs=DEFAULT_FINETUNING_EPOCHS,
)

# Report dataset size and estimated cost
print(f"Number of paper examples: {len(training_data):,}")
print(f"Token count: {token_count:,}")
print(f"Finetuning cost using {base_model_mini}: ${finetuning_cost:.2f}")
print()

# Write one JSON object per line (jsonl)
jsonl_text = "\n".join(json.dumps(example) for example in training_data)

with open(finetuning_dir / "all_generic_training_data.jsonl", "w") as f:
    f.write(jsonl_text)

## Fine-tuning

Upload topic summaries as fine-tuning data and save a mapping from topic to data ID.

In [17]:
# Maps each topic to the ID of its uploaded training file
topic_to_id = {}

for topic in topic_to_agent:
    path = finetuning_dir / f"{topic.replace(' ', '_')}_training_data.jsonl"

    # Use a context manager so the file handle is closed after the upload
    with open(path, "rb") as training_file:
        file_object = client.files.create(file=training_file, purpose="fine-tune")

    topic_to_id[topic] = file_object.id

# Persist the mapping so the uploads do not need to be repeated
with open(finetuning_dir / "topic_to_id.json", "w") as f:
    json.dump(topic_to_id, f)

Upload agent summaries as fine-tuning data and save a mapping from agent to data ID.

In [None]:
# Maps each agent title to the ID of its uploaded training file
agent_title_to_id = {}

for agent in agent_to_topics:
    path = finetuning_dir / f"{agent.title.replace(' ', '_')}_training_data.jsonl"

    # Use a context manager so the file handle is closed after the upload
    with open(path, "rb") as training_file:
        file_object = client.files.create(file=training_file, purpose="fine-tune")

    agent_title_to_id[agent.title] = file_object.id

# Persist the mapping so the uploads do not need to be repeated
with open(finetuning_dir / "agent_title_to_id.json", "w") as f:
    json.dump(agent_title_to_id, f)

Upload generic summaries as fine-tuning data and save a mapping from generic agent to data ID.

In [129]:
# Single-entry mapping from "all_generic" to the ID of the uploaded training file
all_generic_to_id = {}

path = finetuning_dir / "all_generic_training_data.jsonl"

# Use a context manager so the file handle is closed after the upload
with open(path, "rb") as training_file:
    file_object = client.files.create(file=training_file, purpose="fine-tune")

all_generic_to_id["all_generic"] = file_object.id

# Persist the mapping so the upload does not need to be repeated
with open(finetuning_dir / "all_generic_to_id.json", "w") as f:
    json.dump(all_generic_to_id, f)

Load mapping from topic to data ID.

In [18]:
# Restore the topic to training file ID mapping saved by the upload cell
topic_to_id = json.loads((finetuning_dir / "topic_to_id.json").read_text())

Load mapping from agent to data ID.

In [None]:
# Restore the agent title to training file ID mapping saved by the upload cell
agent_title_to_id = json.loads((finetuning_dir / "agent_title_to_id.json").read_text())

Load mapping from generic to data ID.

In [None]:
# Restore the generic training file ID mapping saved by the upload cell
all_generic_to_id = json.loads((finetuning_dir / "all_generic_to_id.json").read_text())

Launch fine-tuning jobs by topic.

In [113]:
# Every (base model, topic) combination, with the model varying slowest
topic_jobs = [
    (model, topic, file_id)
    for model in [base_model, base_model_mini]
    for topic, file_id in topic_to_id.items()
]

# Launch one fine-tuning job per combination, using the topic as the model suffix
for model, topic, file_id in topic_jobs:
    client.fine_tuning.jobs.create(
        training_file=file_id,
        model=model,
        suffix=topic.replace(" ", "_"),
    )

Launch fine-tuning jobs by agent.

In [None]:
# Every (base model, agent) combination, with the model varying slowest
agent_jobs = [
    (model, agent_title, file_id)
    for model in [base_model, base_model_mini]
    for agent_title, file_id in agent_title_to_id.items()
]

# Launch one fine-tuning job per combination, using the agent title as the model suffix
for model, agent_title, file_id in agent_jobs:
    client.fine_tuning.jobs.create(
        training_file=file_id,
        model=model,
        suffix=agent_title.replace(" ", "_"),
    )

Launch fine-tuning job for generic agent.

In [132]:
# The same uploaded training file is used for both base models
generic_file_id = all_generic_to_id["all_generic"]

for model in (base_model, base_model_mini):
    client.fine_tuning.jobs.create(
        training_file=generic_file_id, model=model, suffix="all_generic"
    )

Check fine-tuning job status.

In [None]:
# Fetch the full job listing and show the fine-tuned model name of the entry at
# index 1 (change the index to inspect a different job)
fine_tuning_jobs = list(client.fine_tuning.jobs.list())
print(fine_tuning_jobs[1].fine_tuned_model)

Set up mappings from topic and from agent to fine-tuned model IDs, plus the generic fine-tuned model IDs.

In [15]:
# Hard-coded IDs of the fine-tuned models used by the evaluation cell
# NOTE(review): presumably copied from the completed fine-tuning jobs launched
# above; they are tied to the account that ran those jobs — confirm before reuse

# Topic -> model fine-tuned from gpt-4o-2024-08-06 on that topic's summaries
topic_to_model = {
    "nanobodies": "ft:gpt-4o-2024-08-06:personal:nanobodies:Azf4Ybts",
    "SARS-CoV-2 spike protein": "ft:gpt-4o-2024-08-06:personal:sars-cov-2-spike-protein:Azf4VrrH",
    "SARS-CoV-2 variants KP.3 and JN.1": "ft:gpt-4o-2024-08-06:personal:sars-cov-2-variants-kp-3-and-jn-1:Azes3SvG",
    "ESM": "ft:gpt-4o-2024-08-06:personal:esm:AzfGkSRg",
    "AlphaFold-Multimer": "ft:gpt-4o-2024-08-06:personal:alphafold-multimer:Azv64GGF",
    "Rosetta": "ft:gpt-4o-2024-08-06:personal:rosetta:AzuvllCG",
}

# Topic -> model fine-tuned from gpt-4o-mini-2024-07-18 on that topic's summaries
topic_to_model_mini = {
    "nanobodies": "ft:gpt-4o-mini-2024-07-18:personal:nanobodies:AzeZ4afv",
    "SARS-CoV-2 spike protein": "ft:gpt-4o-mini-2024-07-18:personal:sars-cov-2-spike-protein:AzeYQ1YM",
    "SARS-CoV-2 variants KP.3 and JN.1": "ft:gpt-4o-mini-2024-07-18:personal:sars-cov-2-variants-kp-3-and-jn-1:AzeSvPAe",
    "ESM": "ft:gpt-4o-mini-2024-07-18:personal:esm:AzeQwtOb",
    "AlphaFold-Multimer": "ft:gpt-4o-mini-2024-07-18:personal:alphafold-multimer:AzeewRqW",
    "Rosetta": "ft:gpt-4o-mini-2024-07-18:personal:rosetta:Azeb8IiN",
}

# Agent -> model fine-tuned from gpt-4o-2024-08-06 on all of that agent's topics
agent_to_model = {
    immunologist: "ft:gpt-4o-2024-08-06:personal:immunologist:AzXfaYoL",
    machine_learning_specialist: "ft:gpt-4o-2024-08-06:personal:machine-learning-specialist:AzXIKoIS",
    computational_biologist: "ft:gpt-4o-2024-08-06:personal:computational-biologist:AzXNlGca",
}

# Agent -> model fine-tuned from gpt-4o-mini-2024-07-18 on all of that agent's topics
agent_to_model_mini = {
    immunologist: "ft:gpt-4o-mini-2024-07-18:personal:immunologist:AzdyC4jP",
    machine_learning_specialist: "ft:gpt-4o-mini-2024-07-18:personal:machine-learning-specialist:AzdfrwIj",
    computational_biologist: "ft:gpt-4o-mini-2024-07-18:personal:computational-biologist:AzdlENYb",
}

# Models fine-tuned on the summaries of all topics with the generic agent prompt
generic_model = "ft:gpt-4o-2024-08-06:personal:all-generic:B0DBcePr"
generic_model_mini = "ft:gpt-4o-mini-2024-07-18:personal:all-generic:B0CaviDa"

## Evaluate

Create a set of question and answer pairs to evaluate the fine-tuned models based on the summaries.

In [9]:
async def generate_qa_pairs(
    semaphore: asyncio.Semaphore, agent: Agent, pmcid: str, summary: str
) -> tuple[str, str]:
    """Ask an agent to write multiple-choice questions about a paper summary.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent that writes the questions and answers.
    :param pmcid: PMC ID of the paper (returned unchanged to identify the result).
    :param summary: Summary of the paper.
    :return: Tuple of PMC ID and the raw response, which is requested to be a JSON
        dictionary mapping each question to four answers with the correct one first.
    """
    # Fixed instructions; the summary is appended after a blank line
    instructions = 'Please generate five unique questions based on the following summary of a scientific paper. The questions should be designed so that they are specific to the summary rather than general knowledge, i.e., they can only be answered with information in the summary. However, the questions must include enough context so that they can be answered without knowing which specific paper is being referred to (i.e., do not say things like "in this study" or "in the paper"). For the answers, please write one correct answer and three incorrect answers for each question. Make the incorrect answers as plausible and as similar to the correct answer as possible while still being incorrect so that selecting the correct answer is challenging without specific knowledge of the paper summary. Your response should *only* be a JSON dictionary mapping from question (string) to answers (list of four strings where the first string is the correct answer and the other three are incorrect). Here is the paper summary:'
    query = f"{instructions}\n\n{summary}"

    qa_response = await run_query(semaphore=semaphore, agent=agent, query=query)

    return pmcid, qa_response

In [None]:
for topic, agent in topic_to_agent.items():
    topic_name = topic.replace(" ", "_")

    # Create save directory
    topic_qa_dir = qa_dir / topic_name
    topic_qa_dir.mkdir(parents=True, exist_ok=True)

    # Get summary paths for the topic
    summary_paths = sorted((summaries_dir / topic_name).glob("*.json"))

    # Load PMCID to summary mapping
    pmcid_to_summary = {}
    for summary_path in summary_paths:
        # Load paper summary data
        with open(summary_path) as f:
            summary_data = json.load(f)

        # Extract PMCID and summary
        pmcid, summary = summary_data["pmcid"], summary_data["summary"]

        # Add PMCID to summary mapping
        pmcid_to_summary[pmcid] = summary

    print(f"Number of summaries loaded for {topic}: {len(pmcid_to_summary):,}")

    # Compute input token cost
    input_token_count = sum(
        count_tokens(summary) for summary in pmcid_to_summary.values()
    )

    input_token_cost = compute_token_cost(
        model=base_model,
        input_token_count=input_token_count,
        output_token_count=0,
    )

    print(f"Approximate input token cost for {topic} = ${input_token_cost:.2f}")

    # Set up semaphore with the number of concurrent requests
    semaphore = asyncio.Semaphore(num_concurrent)

    # Create tasks for each summary
    tasks = [
        asyncio.create_task(
            generate_qa_pairs(
                semaphore=semaphore, agent=agent, pmcid=pmcid, summary=summary
            )
        )
        for pmcid, summary in pmcid_to_summary.items()
    ]

    # Run agent questions for each paper
    results = [
        (await task) for task in tqdm(asyncio.as_completed(tasks), total=len(tasks))
    ]

    # Parse JSON Q&A pairs
    pmcid_to_qa_pairs = {}
    for pmcid, qa_pairs in results:
        try:
            pmcid_to_qa_pairs[pmcid] = json.loads(
                qa_pairs.replace("```json", "").replace("```", "").strip()
            )
        except json.JSONDecodeError:
            print(f"Failed to parse JSON for {pmcid}")
            print(qa_pairs)

    # Save summaries
    for pmcid, qa_pairs in pmcid_to_qa_pairs.items():
        with open(topic_qa_dir / f"{pmcid}.json", "w") as f:
            json.dump(
                {"pmcid": pmcid, "qa_pairs": qa_pairs}, f, indent=4, sort_keys=True
            )

Function to use an agent to answer a question.

In [10]:
async def answer_question(
    semaphore: asyncio.Semaphore,
    agent: Agent,
    question: str,
    answers: list[str],
    model: str,
) -> tuple[str, str]:
    """Answer a multiple-choice question using the model.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent to use for answering the question.
    :param question: Question to ask.
    :param answers: List of answers to choose from in random order.
    :param model: Model to use for answering the question.
    :return: Tuple of question and the model's raw response, which is requested to
        be the exact text of the selected answer.
    """
    # Join the options outside the f-string: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12
    answer_options = "\n\n".join(answers)

    # Set up query with question and answer options
    query = f"Please answer the following question based on the following list of answers. Only one answer is correct. Please select the correct answer and provide as your response the exact text of the answer and nothing else. Here is the question:\n\n{question}\n\nHere are the possible answers (choose the correct one):\n\n{answer_options}"

    # Run query to get answer
    answer = await run_query(
        semaphore=semaphore,
        agent=agent,
        query=query,
        model=model,
    )

    return question, answer

Evaluate the fine-tuned models on question and answer pairs.

In [None]:
max_num_qa_per_topic = 100

for topic, agent in topic_to_agent.items():
    topic_name = topic.replace(" ", "_")

    # Get Q&A paths for the topic
    qa_paths = sorted((qa_dir / topic_name).glob("*.json"))

    # Load Q&A pairs
    qa_pairs = {}
    for qa_path in qa_paths:
        # Load paper summary data
        with open(qa_path) as f:
            qa_data = json.load(f)

        # Add Q&A pairs
        qa_pairs |= qa_data["qa_pairs"]

    print(f"Number of Q&A pairs loaded for {topic}: {len(qa_pairs):,}")

    # Set up randomness
    random = Random(0)

    # Limit number of Q&A pairs with random sampling
    if len(qa_pairs) > max_num_qa_per_topic:
        qa_pairs_list = list(qa_pairs.items())
        random.shuffle(qa_pairs_list)
        qa_pairs = dict(qa_pairs_list[:max_num_qa_per_topic])
        print(
            f"Number of Q&A pairs after random sampling for {topic}: {len(qa_pairs):,}"
        )

    # Map questions to correct answers
    question_to_correct_answer = {
        question: answers[0] for question, answers in qa_pairs.items()
    }

    # Randomize order of answers
    for answers in qa_pairs.values():
        random.shuffle(answers)

    # Loop over agents/models
    for answer_agent, model in [
        (generic_agent, base_model),
        (generic_agent, base_model_mini),
        (generic_agent, generic_model),
        (generic_agent, generic_model_mini),
        (agent, base_model),
        (agent, base_model_mini),
        (agent, topic_to_model[topic]),
        (agent, topic_to_model_mini[topic]),
        (agent, agent_to_model[agent]),
        (agent, agent_to_model_mini[agent]),
    ]:
        print(f"Evaluating {answer_agent.title} with {model} for {topic}.")

        # Set up semaphore with the number of concurrent requests
        semaphore = asyncio.Semaphore(num_concurrent)

        # Create tasks for each question
        tasks = [
            asyncio.create_task(
                answer_question(
                    semaphore=semaphore,
                    agent=answer_agent,
                    question=question,
                    answers=answers,
                    model=model,
                )
            )
            for question, answers in qa_pairs.items()
        ]

        # Run agent answers for each paper
        results = [
            (await task) for task in tqdm(asyncio.as_completed(tasks), total=len(tasks))
        ]

        # Map questions to selected answers
        question_to_selected_answer = {question: answer for question, answer in results}

        # Compute accuracy
        accuracy = sum(
            question_to_correct_answer[question]
            == question_to_selected_answer[question]
            for question in question_to_correct_answer
        ) / len(question_to_correct_answer)

        print(f"Accuracy: {accuracy:.2%}")