In [None]:
import concurrent.futures
from pathlib import Path

from virtual_lab.agent import Agent
from virtual_lab.constants import CONSISTENT_TEMPERATURE, CREATIVE_TEMPERATURE
from virtual_lab.prompts import (
    PRINCIPAL_INVESTIGATOR,
    SCIENTIFIC_CRITIC,
    create_merge_prompt,
)
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import load_summaries

In [None]:
# Set up key parameters
# Number of independent discussions launched in parallel for each stage
# (used as range(num_iterations) in the discussion cells).
num_iterations = 5
# Forwarded as `num_rounds=` to the run_meeting calls that specify it.
num_rounds = 3
# Root output directory; every stage saves into its own subdirectory of this path.
save_dir = Path("drug_discovery/discussions")
# Model identifier forwarded as `model=` to every run_meeting call.
model = "gpt-4o-2024-08-06"
# Shared preamble that is interpolated at the start of every stage agenda.
background_prompt = "You are working on a research project to use machine learning for drug discovery. Your goals are the following: (1) the project must have high clinical value, meaning the research contributes to helping patients, (2) the project must include a scientifically impactful application of machine learning to drug discovery, and (3) the project must use Emerald Cloud Labs (ECL) for all experimental validation with a 3-month limit on experiments."

In [None]:
# Read the two ECL reference listings from disk. Every newline is doubled so
# that consecutive lines of the source files end up separated by a blank line.
ECL_RUNNING_EXPERIMENTS = Path("drug_discovery/emerald/running_experiments.txt").read_text().replace("\n", "\n\n")
ECL_UNIT_OPERATIONS = Path("drug_discovery/emerald/unit_operations.txt").read_text().replace("\n", "\n\n")

# Introductory sentence that precedes the two listings in the context tuple.
ECL_CONTEXT = "You have access to Emerald Cloud Labs (ECL), a cloud lab provider that can run automated biology experiments. The full list of experiments and unit operations available at ECL are below. Please note that ECL currently cannot work with cell cultures and cannot synthesize small molecule drugs."

# Context strings handed to run_meeting via `contexts=` in the ECL-aware stages.
DRUG_DISCOVERY_CONTEXTS = (ECL_CONTEXT, ECL_RUNNING_EXPERIMENTS, ECL_UNIT_OPERATIONS)

## Select team members

In [None]:
# Select team members - prompts
# Output directory for the team-selection discussions and their merged summary.
team_selection_dir = save_dir / "team_selection"

# Agenda for the Principal Investigator's individual meetings. The Agent(...)
# snippet inside the string is example output format shown to the model; it is
# prompt text, not executed code.
team_selection_agenda = f"""{background_prompt} You need to select a team of three scientists to help you with this project. Please select the team members that you would like to invite to work on the project. Please list the team members in the following format, using the team member below as an example. You should not include yourself (Principal Investigator) in the list.

Agent(
    title="Principal Investigator",
    expertise="applying artificial intelligence to biomedical research",
    goal="perform research in your area of expertise that maximizes the scientific impact of the work",
    role="lead a team of experts to solve an important problem in artificial intelligence for biomedicine, make key decisions about the project direction based on team member input, and manage the project timeline and resources",
)
"""

In [None]:
# Select team members - discussion
# Launch `num_iterations` independent individual meetings in parallel threads;
# each one is saved as discussion_<n> inside team_selection_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    team_selection_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=PRINCIPAL_INVESTIGATOR,
            agenda=team_selection_agenda,
            save_dir=team_selection_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(team_selection_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in team_selection_futures:
    future.result()

In [None]:
# Select team members - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
team_selection_summaries = load_summaries(discussion_paths=sorted(team_selection_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(team_selection_summaries)}")

# The merge prompt is derived from the same agenda the discussions used.
team_selection_merge_prompt = create_merge_prompt(agenda=team_selection_agenda)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=PRINCIPAL_INVESTIGATOR,
    summaries=team_selection_summaries,
    agenda=team_selection_merge_prompt,
    save_dir=team_selection_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
)

In [None]:
# Add team members
# Hand-written Agent definitions for the three specialists. Presumably transcribed from
# the merged team-selection output — TODO confirm they still match it after a re-run.
COMPUTATIONAL_BIOLOGIST = Agent(
    title="Computational Biologist",
    expertise="machine learning algorithms for drug target identification and validation",
    goal="develop and implement machine learning models for identifying potential drug targets with high clinical relevance",
    role="design and optimize machine learning models, collaborate with domain experts to ensure clinical applicability, and interpret the results from a biological perspective",
)

MEDICINAL_CHEMIST = Agent(
    title="Medicinal Chemist",
    expertise="drug design and synthesis with experience in virtual screening and molecular docking",
    goal="translate computational predictions into testable compounds and design experiments to validate these compounds using ECL",
    role="provide insights into chemical feasibility of predicted compounds, help prioritize compounds for synthesis, and coordinate experimental validation through ECL",
)

MACHINE_LEARNING_SPECIALIST = Agent(
    title="Machine Learning Specialist",
    expertise="advanced machine learning algorithms and their application in biomedical data",
    goal="design and implement innovative machine learning approaches to enhance drug discovery pipelines",
    role="lead the development of machine learning models, ensure their scientific rigor, and optimize them for predicting clinically relevant drug candidates",
)

# Members passed as `team_members=` to the team meetings; the imported
# SCIENTIFIC_CRITIC joins the three specialists defined above.
team_members = (
    COMPUTATIONAL_BIOLOGIST,
    MEDICINAL_CHEMIST,
    MACHINE_LEARNING_SPECIALIST,
    SCIENTIFIC_CRITIC,
)

## Project selection

In [None]:
# Project selection - prompts
# Output directory for the project-selection discussions and their merged summary.
project_selection_dir = save_dir / "project_selection"

# Agenda shared by the project-selection discussions and reused to build the merge prompt.
project_selection_agenda = f"{background_prompt} In this meeting, you need to select a specific drug discovery project. You must first choose a drug modality that is most appropriate given the goals of the project. Then, you must select five specific disease/target pairs that you could develop a therapy for using your choice of drug modality."

# Questions passed as `agenda_questions=` to the team meetings and to create_merge_prompt.
project_selection_questions = (
    "What is the specific drug modality that you are proposing?",
    "What specific disease/target pairs are you proposing to treat with your choice of drug modality (list five)?",
    "Why are these diseases and targets appropriate for your choice of drug modality?",
)

In [None]:
# Project selection - discussion
# Launch `num_iterations` independent team meetings in parallel threads; each
# one is saved as discussion_<n> inside project_selection_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    project_selection_futures = [
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=PRINCIPAL_INVESTIGATOR,
            team_members=team_members,
            contexts=DRUG_DISCOVERY_CONTEXTS,
            agenda=project_selection_agenda,
            agenda_questions=project_selection_questions,
            save_dir=project_selection_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(project_selection_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in project_selection_futures:
    future.result()

In [None]:
# Project selection - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
project_selection_summaries = load_summaries(discussion_paths=sorted(project_selection_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(project_selection_summaries)}")

# The merge prompt is derived from the same agenda and questions the discussions used.
project_selection_merge_prompt = create_merge_prompt(
    agenda=project_selection_agenda,
    agenda_questions=project_selection_questions,
)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=PRINCIPAL_INVESTIGATOR,
    contexts=DRUG_DISCOVERY_CONTEXTS,
    summaries=project_selection_summaries,
    agenda=project_selection_merge_prompt,
    save_dir=project_selection_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
    num_rounds=num_rounds,
)

In [None]:
# Hard-coded recap of the modality decision, interpolated into every later agenda.
# Presumably copied by hand from the merged project-selection summary — TODO confirm
# it still matches after a re-run.
project_prompt = "Your team previously decided to pursue peptide-based therapeutics."

## Select tools

In [None]:
# Tools selection - prompts
# Output directory for the tools-selection discussions and their merged summary.
tools_selection_dir = save_dir / "tools_selection"

# Agenda shared by the tools-selection discussions and reused to build the merge prompt.
tools_selection_agenda = f"{background_prompt} {project_prompt} Now you need to select machine learning tools to perform this drug discovery project. Please list several of the latest pre-trained generative machine learning models (~5) that could be used for de novo design of peptide-based therapeutics for the diseases/targets you previously selected. Please note that machine learning models for protein design, such as diffusion models for de novo protein design, are also relevant as they can be applied to peptide design."

# Questions passed as `agenda_questions=` to the team meetings and to create_merge_prompt.
tools_selection_questions = (
    "What are the names of the latest pre-trained generative machine learning models that could be used for de novo design of peptide-based therapeutics (list ~5)?",
    "For each model, how could it be used in the design process for creating de novo peptide-based therapeutics for the diseases/targets you previously selected?",
)

# Prior context for this stage: only project_selection_dir / "merged.json"
# (presumably the file written by the project-selection merge meeting — verify).
tools_selection_prior_summaries = load_summaries(discussion_paths=[project_selection_dir / "merged.json"])
print(f"Number of prior summaries: {len(tools_selection_prior_summaries)}")

In [None]:
# Tools selection - discussion
# Launch `num_iterations` independent team meetings in parallel threads; each
# one is saved as discussion_<n> inside tools_selection_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    tools_selection_futures = [
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=PRINCIPAL_INVESTIGATOR,
            team_members=team_members,
            summaries=tools_selection_prior_summaries,
            agenda=tools_selection_agenda,
            agenda_questions=tools_selection_questions,
            save_dir=tools_selection_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(tools_selection_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in tools_selection_futures:
    future.result()

In [None]:
# Tools selection - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
tools_selection_summaries = load_summaries(discussion_paths=sorted(tools_selection_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(tools_selection_summaries)}")

# The merge prompt is derived from the same agenda and questions the discussions used.
tools_selection_merge_prompt = create_merge_prompt(
    agenda=tools_selection_agenda,
    agenda_questions=tools_selection_questions,
)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=PRINCIPAL_INVESTIGATOR,
    summaries=tools_selection_summaries,
    agenda=tools_selection_merge_prompt,
    save_dir=tools_selection_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
    num_rounds=num_rounds,
)

In [None]:
# Hard-coded recap of the tool choice, interpolated into every later agenda.
# Presumably copied by hand from the merged tools-selection summary — TODO confirm
# it still matches after a re-run.
tools_prompt = "Your team previously suggested using RFDiffusion, ProteinMPNN, and AlphaFold2 to design peptide-based therapeutics."

## Literature review

In [None]:
# Literature review - prompts
literature_review_dir = save_dir / "literature_review"

# Tool name -> exact title of the paper the agent is asked to search for.
tool_to_title = dict(
    RFDiffusion="De novo design of protein structure and function with RFdiffusion",
    ProteinMPNN="Robust deep learning based protein sequence design using ProteinMPNN",
    AlphaFold2="Highly accurate protein structure prediction with AlphaFold",
)


def _literature_review_agenda(tool, title):
    """Return the literature-review agenda for one tool and its paper title."""
    return f"{background_prompt} {project_prompt} {tools_prompt} Now you need to read the paper on {tool} and summarize in detail exactly how the model works, what data it requires as input, and what kind of output it produces. You should tailor each part of your summary to focus on how the model could apply to designing peptide binders for a given protein target. To read the paper, perform a PubMed search with the following paper title (include the quotes in your query): \"{title}\"."


# One agenda per tool, keyed by tool name in the same order as tool_to_title.
literature_review_agendas = {
    tool: _literature_review_agenda(tool, title) for tool, title in tool_to_title.items()
}

In [None]:
# Literature review - discussion
# One individual meeting per tool, run in parallel threads with PubMed search
# enabled; each result is saved under the tool's name in literature_review_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    literature_review_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=MACHINE_LEARNING_SPECIALIST,
            agenda=literature_review_agendas[tool],
            save_dir=literature_review_dir,
            save_name=tool,
            temperature=CONSISTENT_TEMPERATURE,
            model=model,
            pubmed_search=True,
        )
        for tool in tool_to_title
    ]
    # Let every review finish before inspecting any outcome.
    concurrent.futures.wait(literature_review_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so later stages cannot silently load an
# incomplete set of literature summaries.
for future in literature_review_futures:
    future.result()

## Computational workflow

In [None]:
# Computational workflow - prompts
# Output directory for the computational-workflow discussions and their merged summary.
computational_workflow_dir = save_dir / "computational_workflow"

# Agenda shared by the workflow discussions and reused to build the merge prompt.
computational_workflow_agenda = f"{background_prompt} {project_prompt} {tools_prompt} Now you need to create a specific computational workflow for designing peptide therapeutics using these tools. Please explain in detail how you will use RFDiffusion, ProteinMPNN, and AlphaFold2 to design peptide binders for the targets you previously selected. Include the specific steps involved in the workflow, the input data required for each tool, and the expected output from each tool. Do not incorporate any tools besides these three."

# Questions passed as `agenda_questions=` to the team meetings and to create_merge_prompt.
computational_workflow_questions = (
    "What is the specific computational workflow for designing peptide therapeutics using RFDiffusion, ProteinMPNN, and AlphaFold2?",
    "What is the role of RFDiffusion in the workflow, and what are the inputs and outputs of the model?",
    "What is the role of ProteinMPNN in the workflow, and what are the inputs and outputs of the model?",
    "What is the role of AlphaFold2 in the workflow, and what are the inputs and outputs of the model?",
)

# Prior context: merged project and tools summaries followed by every literature
# review. Path.glob yields matches in a filesystem-dependent order, so the
# literature files are sorted to keep the path order identical on every run.
computational_workflow_prior_summaries = load_summaries(discussion_paths=[project_selection_dir / "merged.json", tools_selection_dir / "merged.json"] + sorted(literature_review_dir.glob("*.json")))
print(f"Number of prior summaries: {len(computational_workflow_prior_summaries)}")

In [None]:
# Computational workflow - discussion
# Launch `num_iterations` independent team meetings in parallel threads; each
# one is saved as discussion_<n> inside computational_workflow_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    computational_workflow_futures = [
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=PRINCIPAL_INVESTIGATOR,
            team_members=team_members,
            summaries=computational_workflow_prior_summaries,
            agenda=computational_workflow_agenda,
            agenda_questions=computational_workflow_questions,
            save_dir=computational_workflow_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            # Previously omitted here, unlike every other team discussion in this
            # notebook, so this stage fell back to run_meeting's own default.
            # NOTE(review): confirm the omission was not intentional.
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(computational_workflow_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in computational_workflow_futures:
    future.result()

In [None]:
# Computational workflow - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
computational_workflow_summaries = load_summaries(discussion_paths=sorted(computational_workflow_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(computational_workflow_summaries)}")

# The merge prompt is derived from the same agenda and questions the discussions used.
computational_workflow_merge_prompt = create_merge_prompt(
    agenda=computational_workflow_agenda,
    agenda_questions=computational_workflow_questions,
)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=PRINCIPAL_INVESTIGATOR,
    summaries=computational_workflow_summaries,
    agenda=computational_workflow_merge_prompt,
    save_dir=computational_workflow_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
    num_rounds=num_rounds,
)

## Computational details

In [None]:
# Computational details - prompts
# Output directory for the computational-details discussions and their merged summary.
computational_details_dir = save_dir / "computational_details"

# Agenda shared by the details discussions and reused to build the merge prompt.
computational_details_agenda = f"{background_prompt} {project_prompt} {tools_prompt} Now you need to provide more details for the computational workflow you previously defined. Please answer the agenda questions to fill in the missing details for the computational workflow. Note that the peptide synthesizer at ECL can synthesize up to 12 peptides simultaneously."

# Questions passed as `agenda_questions=` to the meetings and to create_merge_prompt.
computational_details_questions = (
    "How will you identify target structures and binding sites to provide as input to RFDiffusion for designing peptide binders?",
    "How many peptide backbone structures will you design for each target using RFDiffusion?",
    "How will you decide the length of the peptides to design for each target using RFDiffusion?",
    "How many peptide sequences will you generate for each peptide backbone structure using ProteinMPNN?",
    "After predicting peptide-target complexes using AlphaFold2, what criteria will you use to select peptides for synthesis and validation and how many will you select?",
)

# Prior context: merged project and tools summaries, every literature review,
# then the merged computational workflow. Path.glob yields matches in a
# filesystem-dependent order, so the literature files are sorted to keep the
# path order identical on every run.
computational_details_prior_summaries = load_summaries(discussion_paths=[project_selection_dir / "merged.json", tools_selection_dir / "merged.json"] + sorted(literature_review_dir.glob("*.json")) + [computational_workflow_dir / "merged.json"])
print(f"Number of prior summaries: {len(computational_details_prior_summaries)}")

In [None]:
# Computational details - discussion
# Launch `num_iterations` independent individual meetings in parallel threads;
# each one is saved as discussion_<n> inside computational_details_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    computational_details_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=COMPUTATIONAL_BIOLOGIST,
            summaries=computational_details_prior_summaries,
            agenda=computational_details_agenda,
            agenda_questions=computational_details_questions,
            save_dir=computational_details_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(computational_details_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in computational_details_futures:
    future.result()

In [None]:
# Computational details - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
computational_details_summaries = load_summaries(discussion_paths=sorted(computational_details_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(computational_details_summaries)}")

# The merge prompt is derived from the same agenda and questions the discussions used.
computational_details_merge_prompt = create_merge_prompt(
    agenda=computational_details_agenda,
    agenda_questions=computational_details_questions,
)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=COMPUTATIONAL_BIOLOGIST,
    summaries=computational_details_summaries,
    agenda=computational_details_merge_prompt,
    save_dir=computational_details_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
)

## Experimental workflow

In [None]:
# Experimental workflow - prompts
# Output directory for the experimental-workflow discussions and their merged summary.
experimental_workflow_dir = save_dir / "experimental_workflow"

# Agenda shared by the workflow discussions and reused to build the merge prompt.
experimental_workflow_agenda = f"{background_prompt} {project_prompt} {tools_prompt} Now you need to create a specific experimental workflow for synthesizing and validating the peptides that your team designs computationally for each drug target. First, in general terms, explain the types of experiments that you will perform to synthesize and validate the peptides. Ensure that you explain what form or forms of validation you will perform in order to verify the utility of the peptides for the given protein target and disease. Then, write a high-level protocol for synthesis and for each form of validation that names specific ECL experiments that should be run in order. Remember that you are constrained to using ECL for all experimental validation, but your validation should still be as comprehensive and scientifically rigorous as possible."

# Questions passed as `agenda_questions=` to the team meetings and to create_merge_prompt.
experimental_workflow_questions = (
    "What general types of experiments will you perform to synthesize the peptides in preparation for validation?",
    "What is the high-level protocol (name ECL experiments in order) you will run to synthesize the peptides in preparation for validation?",
    "What general types of experiments will you perform to validate the synthesized peptides?",
    "What is the high-level protocol (name ECL experiments in order) you will run for each form of validation of the synthesized peptides?",
)

# Prior context: merged project and tools summaries followed by every literature
# review. Path.glob yields matches in a filesystem-dependent order, so the
# literature files are sorted to keep the path order identical on every run.
experimental_workflow_prior_summaries = load_summaries(discussion_paths=[project_selection_dir / "merged.json", tools_selection_dir / "merged.json"] + sorted(literature_review_dir.glob("*.json")))
print(f"Number of prior summaries: {len(experimental_workflow_prior_summaries)}")

In [None]:
# Experimental workflow - discussion
# Launch `num_iterations` independent team meetings in parallel threads; each
# one is saved as discussion_<n> inside experimental_workflow_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    experimental_workflow_futures = [
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=PRINCIPAL_INVESTIGATOR,
            team_members=team_members,
            contexts=DRUG_DISCOVERY_CONTEXTS,
            summaries=experimental_workflow_prior_summaries,
            agenda=experimental_workflow_agenda,
            agenda_questions=experimental_workflow_questions,
            save_dir=experimental_workflow_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(experimental_workflow_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in experimental_workflow_futures:
    future.result()

In [None]:
# Experimental workflow - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
experimental_workflow_summaries = load_summaries(discussion_paths=sorted(experimental_workflow_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(experimental_workflow_summaries)}")

# The merge prompt is derived from the same agenda and questions the discussions used.
experimental_workflow_merge_prompt = create_merge_prompt(
    agenda=experimental_workflow_agenda,
    agenda_questions=experimental_workflow_questions,
)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=PRINCIPAL_INVESTIGATOR,
    contexts=DRUG_DISCOVERY_CONTEXTS,
    summaries=experimental_workflow_summaries,
    agenda=experimental_workflow_merge_prompt,
    save_dir=experimental_workflow_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
    num_rounds=num_rounds,
)

## Experiment details

In [None]:
# Experiment details - prompts
# Output directory for the experiment-details discussions and their merged summary.
experiment_details_dir = save_dir / "experiment_details"

# Agenda shared by the details discussions and reused to build the merge prompt.
experiment_details_agenda = f"{background_prompt} {project_prompt} {tools_prompt} Now you need to write a detailed experimental protocol for each of the high-level protocols your team previously decided upon for peptide synthesis and for each form of peptide validation. Write each protocol in extreme detail, including the name of every ECL experiment or unit operation that should be run in order along with all the required inputs (peptides, protein targets, and reagents), equipment, and conditions for each step and the expected output. For all experimental inputs (peptides, protein targets, and reagents), please specify the precise quantities and concentrations that are required for each step of every protocol. Additionally, please comment on any required controls or quality assurance steps that should be taken for each protocol."

# Prior context: merged project and tools summaries, every literature review,
# then the merged experimental workflow. Path.glob yields matches in a
# filesystem-dependent order, so the literature files are sorted to keep the
# path order identical on every run.
# NOTE(review): this name says "experimental_" while the rest of the stage uses
# "experiment_"; it is kept because the discussion cell references it.
experimental_details_prior_summaries = load_summaries(discussion_paths=[project_selection_dir / "merged.json", tools_selection_dir / "merged.json"] + sorted(literature_review_dir.glob("*.json")) + [experimental_workflow_dir / "merged.json"])
print(f"Number of prior summaries: {len(experimental_details_prior_summaries)}")

In [None]:
# Experiment details - discussion
# Launch `num_iterations` independent individual meetings in parallel threads;
# each one is saved as discussion_<n> inside experiment_details_dir.
with concurrent.futures.ThreadPoolExecutor() as executor:
    experiment_details_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=MEDICINAL_CHEMIST,
            contexts=DRUG_DISCOVERY_CONTEXTS,
            summaries=experimental_details_prior_summaries,
            agenda=experiment_details_agenda,
            save_dir=experiment_details_dir,
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            model=model,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]
    # Let every discussion finish before inspecting any outcome.
    concurrent.futures.wait(experiment_details_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure so the merge step cannot silently run on
# an incomplete set of discussions.
for future in experiment_details_futures:
    future.result()

In [None]:
# Experiment details - merge
# Path.glob yields matches in a filesystem-dependent order; sorting hands the
# discussion files to load_summaries in the same order on every run.
experiment_details_summaries = load_summaries(discussion_paths=sorted(experiment_details_dir.glob("discussion_*.json")))
print(f"Number of summaries: {len(experiment_details_summaries)}")

# The merge prompt is derived from the same agenda the discussions used.
experiment_details_merge_prompt = create_merge_prompt(agenda=experiment_details_agenda)

# Single individual meeting that consolidates the parallel discussions; saved as "merged".
run_meeting(
    meeting_type="individual",
    team_member=MEDICINAL_CHEMIST,
    contexts=DRUG_DISCOVERY_CONTEXTS,
    summaries=experiment_details_summaries,
    agenda=experiment_details_merge_prompt,
    save_dir=experiment_details_dir,
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    model=model,
)

## Protocols

In [None]:
# Protocols - prompts
# Output directory for the per-protocol implementation meetings.
protocols_dir = save_dir / "protocols"

# ECL experiment names to implement. Each name is interpolated into its agenda,
# used as the meeting's save_name, and used to locate drug_discovery/emerald/<name>.txt.
protocols = [
    "ExperimentPeptideSynthesis",
    "ExperimentSolidPhaseExtraction",
    "ExperimentHPLC",
    "ExperimentLCMS",
    "ExperimentBioconjugation",
    "ExperimentNMR",
    "ExperimentCircularDichroism",
    "ExperimentBioLayerInterferometry",
    "ExperimentFluorescencePolarization",
    "ExperimentELISA",
    "ExperimentThermalShift",
]

# One agenda per protocol; unlike the preceding stages this prompt does not include tools_prompt.
protocol_to_agenda = {
    protocol: f"{background_prompt} {project_prompt} You previously designed experimental protocols for synthesis and validation of peptides that are designed to bind to a given protein drug target. Now you must implement one of these protocols using ECL. Please implement the {protocol} protocol using the appropriate commands and syntax from ECL, writing out the experiment in the form {protocol}[inputs] where you fill in \"inputs\" with the appropriate inputs. Please refer to the ECL documentation that is provided as context with example function calls and the full set of options for the experiment. In general, leave most options at their default values and therefore do not include them in your experiment function call. Only specify options that must be changed from their default value for your specific experiment. For every option that you specify, please state the default value, the different value you chose, and why you chose that value instead of the default."
    for protocol in protocols
}

# Prior context for this stage: only experiment_details_dir / "merged.json"
# (presumably the file written by the experiment-details merge meeting — verify).
protocols_prior_summaries = load_summaries(discussion_paths=[experiment_details_dir / "merged.json"])
print(f"Number of prior summaries: {len(protocols_prior_summaries)}")

In [None]:
# Map each ECL experiment name to the text of drug_discovery/emerald/<name>.txt,
# with every newline doubled; each entry is passed as that protocol's meeting context.
protocol_to_context = {
    protocol: Path(f"drug_discovery/emerald/{protocol}.txt").read_text().replace("\n", "\n\n")
    for protocol in protocols
}

In [None]:
# Protocols - discussion
# One individual meeting per ECL experiment, run in parallel threads; each gets
# only its own documentation as context and is saved under the experiment name.
with concurrent.futures.ThreadPoolExecutor() as executor:
    protocol_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=MEDICINAL_CHEMIST,
            contexts=(protocol_to_context[protocol],),
            summaries=protocols_prior_summaries,
            agenda=protocol_to_agenda[protocol],
            save_dir=protocols_dir,
            save_name=protocol,
            temperature=CONSISTENT_TEMPERATURE,
            model=model,
        )
        for protocol in protocols
    ]
    # Let every meeting finish before inspecting any outcome.
    concurrent.futures.wait(protocol_futures)

# wait() only blocks; it never re-raises an exception raised inside a worker.
# result() surfaces the first failure instead of leaving a protocol silently
# unimplemented.
for future in protocol_futures:
    future.result()