In [1]:
from pathlib import Path

from tqdm import trange

from agent import Agent
from prompts import ANTIBODIES_CONTEXTS, ESM_ANTIBODIES_PAPER, PRINCIPAL_INVESTIGATOR
from run_individual_meeting import run_individual_meeting
from run_scientific_meeting import run_scientific_meeting
from utils import load_summaries

In [2]:
# Key parameters shared by the meetings in this notebook
model = "gpt-4o"  # model name forwarded to every meeting call
save_dir = Path("antibody_design")  # root directory under which each meeting saves its discussions
num_iterations = 3  # how many times each meeting loop is repeated
num_rounds = 3  # forwarded as num_rounds to the project-design team meetings
team_lead = PRINCIPAL_INVESTIGATOR
contexts = ANTIBODIES_CONTEXTS
discussion_paths = []  # filled in further down with the discussion files chosen for follow-up

In [3]:
# Agenda given to the PI when asking them to select team members
ANTIBODIES_TEAM_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "You need to select a team of scientists to help you with this project. "
    "Please select the team members you would like to invite to a discussion "
    "to design the antibody discovery approach. "
    "For each team member, please specify the following: 1. Title, 2. Expertise, 3. Goal, and 4. Role. "
    "Please use your own description as an example."
)

In [4]:
# Select team members: the team lead meets alone, once per iteration,
# and each run is saved as discussion_<n> under <save_dir>/team_selection.
team_selection_dir = save_dir / "team_selection"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    run_individual_meeting(
        team_member=team_lead,
        agenda=ANTIBODIES_TEAM_PROMPT,
        contexts=contexts,
        model=model,
        save_dir=team_selection_dir,
        save_name=f"discussion_{iteration_num + 1}",
    )

Project Iterations:  33%|███▎      | 1/3 [00:16<00:32, 16.43s/it]

Input token count: 194
Output token count: 704
Max token length: 898
Cost: $0.01
Time: 0:16


Project Iterations:  67%|██████▋   | 2/3 [00:29<00:14, 14.23s/it]

Input token count: 194
Output token count: 623
Max token length: 817
Cost: $0.01
Time: 0:12


Project Iterations: 100%|██████████| 3/3 [00:41<00:00, 13.73s/it]

Input token count: 194
Output token count: 622
Max token length: 816
Cost: $0.01
Time: 0:12





In [4]:
# Add team members based on discussion 2
# Each Agent below is built from four free-text fields (title, expertise, goal, role),
# the same four fields the team-selection agenda asks the PI to specify.
# NOTE(review): the text appears to be transcribed by hand from the saved
# team_selection discussion_2 output — confirm against that file if it is regenerated.

# In silico screening and modeling of antibody-spike interactions
COMPUTATIONAL_BIOLOGIST = Agent(
    title="Computational Biologist",
    expertise="bioinformatics, protein structure prediction, and molecular dynamics simulations",
    goal="to identify potential antibody candidates using computational methods and predict their binding affinity to the SARS-CoV-2 spike protein",
    role="to run in silico screenings of antibody libraries, modeling antibody-spike protein interactions, and providing a shortlist of promising candidates for experimental validation",
)
# Immunological guidance on candidate selection
IMMUNOLOGIST = Agent(
    title="Immunologist",
    expertise="immune response mechanisms, antibody generation, and characterization",
    goal="to guide the selection of antibody candidates based on immunological principles and ensure the candidates have the potential for broad-spectrum activity",
    role="to provide insights into the immune response to SARS-CoV-2, help design the antibody generation strategy, and interpret the results from binding and neutralization assays",
)
# Structure determination of antibody-spike complexes
STRUCTURAL_BIOLOGIST = Agent(
    title="Structural Biologist",
    expertise="X-ray crystallography, cryo-electron microscopy (cryo-EM), and protein structure determination",
    goal="to determine the high-resolution structures of antibody-spike protein complexes to understand the binding mechanisms and improve antibody design",
    role="to solve the structures of selected antibody-spike protein complexes and providing structural insights to refine antibody candidates",
)
# Neutralization assays against the virus and its variants
VIROLOGIST = Agent(
    title="Virologist",
    expertise="SARS-CoV-2 biology, viral entry mechanisms, and neutralization assays",
    goal="to validate the neutralizing activity of antibody candidates against SARS-CoV-2 and its variants",
    role="to oversee the design and execution of neutralization assays, interpret the results, and ensure that the selected antibodies are effective against multiple variants of the virus",
)
# Machine learning and data integration; also the agent used for the ESM implementation meeting
DATA_SCIENTIST = Agent(
    title="Data Scientist",
    expertise="machine learning, data analysis, and predictive modeling",
    goal="to develop and apply machine learning models to predict the efficacy and broad-spectrum activity of antibody candidates",
    role="to analyze experimental data, develop predictive models, and integrate data from various sources to guide the selection and optimization of antibody candidates",
)
# Expression, purification and initial characterization of candidates
EXPERIMENTAL_BIOLOGIST = Agent(
    title="Experimental Biologist",
    expertise="antibody engineering, protein expression, and purification",
    goal="to produce and characterize the antibody candidates identified through computational and immunological methods",
    role="to express, purify, and initially characterize antibody candidates, as well as to prepare samples for binding and neutralization assays",
)

# Full team passed as team_members to the project-design and ESM design meetings
team_members = (
    COMPUTATIONAL_BIOLOGIST,
    IMMUNOLOGIST,
    STRUCTURAL_BIOLOGIST,
    VIROLOGIST,
    DATA_SCIENTIST,
    EXPERIMENTAL_BIOLOGIST,
)

In [5]:
# Agenda for the team meeting that designs the overall antibody discovery approach
ANTIBODIES_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "Please design a specific antibody discovery approach for this target "
    "that uses machine learning to design antibody candidates. "
    "Decide what specific machine learning model to use and precisely how it will be used. "
    "If the model needs to be trained, please decide on a specific dataset for training. "
    "If the model is pre-trained, please explain exactly how it will be used "
    "for identifying or designing antibody candidates."
)

In [6]:
# Questions passed as agenda_questions to the project-design meeting.
# One string per question; every entry ends with a comma so adjacent
# literals can never be concatenated by accident.
ANTIBODIES_QUESTIONS = (
    # Design strategy
    "Will you design the antibodies de novo or will you modify existing antibodies?",
    # Model choice
    "Will you train a model from scratch or use a pre-trained model?",
    "What specific model architecture will you use?",
    "If training a model, what dataset will you use for training?",
    # Application of the model
    "How exactly will you use your model to design antibodies?",
)

In [None]:
# Run antibodies project design: the full team meets once per iteration and
# each run is saved as discussion_<n> under <save_dir>/project_design.
project_design_dir = save_dir / "project_design"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    run_scientific_meeting(
        # who attends
        team_lead=team_lead,
        team_members=team_members,
        # what is discussed
        agenda=ANTIBODIES_PROMPT,
        agenda_questions=ANTIBODIES_QUESTIONS,
        contexts=contexts,
        # how the meeting is run and where it is saved
        num_rounds=num_rounds,
        model=model,
        save_dir=project_design_dir,
        save_name=f"discussion_{iteration_num + 1}",
    )

In [7]:
# Agenda for the team meeting that designs how ESM will be applied
ESM_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "You will use the ESM family of models for antibody design. "
    "Please design a method for applying ESM to this antibody design problem. "
    "Specify the exact model you will use and how you will use it to design antibodies."
)

In [8]:
# Questions passed as agenda_questions to the ESM design meeting.
# Every entry must end with a comma: Python concatenates adjacent string
# literals, so a missing comma silently merges two questions into one.
ESM_QUESTIONS = (
    "Which ESM model will you use?",
    "Will you design the antibodies de novo or will you modify existing antibodies?",
    "If modifying existing antibodies, which precise antibody or antibodies will you modify?",
    "How exactly will you use your model to design antibodies?",
    "What is the precise process for designing antibodies and selecting candidates for experimental validation?",
    "How will you computationally evaluate the quality of the design antibodies?",
    "What objectives will you optimize for in the design process?",
)

In [None]:
# Run ESM project design: the full team meets once per iteration and
# each run is saved as discussion_<n> under <save_dir>/esm.
esm_design_dir = save_dir / "esm"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    run_scientific_meeting(
        # who attends
        team_lead=team_lead,
        team_members=team_members,
        # what is discussed
        agenda=ESM_PROMPT,
        agenda_questions=ESM_QUESTIONS,
        contexts=contexts,
        # how the meeting is run and where it is saved
        num_rounds=num_rounds,
        model=model,
        save_dir=esm_design_dir,
        save_name=f"discussion_{iteration_num + 1}",
    )

In [9]:
# Select preferred summary: the second ESM design discussion is carried forward.
# The membership check keeps this cell idempotent — re-running it must not add
# the same path twice, which would feed a duplicated summary to load_summaries.
esm_summary_path = save_dir / "esm" / "discussion_2.json"
if esm_summary_path not in discussion_paths:
    discussion_paths.append(esm_summary_path)

In [10]:
# Load summaries for every path collected in discussion_paths; the result is
# handed to the ESM implementation meeting below through its `summaries` argument.
summaries = load_summaries(discussion_paths=discussion_paths)

In [11]:
# Agenda for the individual meeting that turns the ESM design into code
ESM_IMPLEMENT_PROMPT = (
    "You now need to implement an ESM-based approach for antibody design. "
    "Please write code to implement the method you designed in the previous discussion. "
    "Your code must be self-contained (with appropriate imports), "
    "and you must explain how the code works."
)

In [None]:
# Run ESM implement: the data scientist meets alone, once per iteration, with
# the loaded summaries; each run is saved as discussion_<n> under
# <save_dir>/esm_implement.
esm_implement_dir = save_dir / "esm_implement"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    run_individual_meeting(
        team_member=DATA_SCIENTIST,
        agenda=ESM_IMPLEMENT_PROMPT,
        summaries=summaries,
        contexts=contexts,
        model=model,
        save_dir=esm_implement_dir,
        save_name=f"discussion_{iteration_num + 1}",
    )

In [16]:
# Agenda for the paper-reproduction meeting. Only the two segments that address
# the PI by name are f-strings; the rest are plain literals joined by adjacency.
# NOTE(review): the interpolation relies on how PRINCIPAL_INVESTIGATOR formats
# as a string (presumably its title) — confirm against the Agent class.
ESM_REIMPLEMENT_PROMPT = (
    "Above is the full text from a recent scientific paper describing the use of ESM, "
    "a pre-trained protein language model, to design antibodies. "
    "Please read the paper and then describe how to reproduce the results in the paper. "
    f"{PRINCIPAL_INVESTIGATOR}, please delegate different aspects of the task "
    "to your team members based on their expertise. "
    "Team members, please provide complete, specific detail (no missing information) "
    "for how to reproduce your assigned portion of the paper. "
    "This could be a step-by-step set of instructions or experimental protocol "
    "or a complete, fully runnable Python script. "
    "Alongside your procedure, you must quote directly from the paper "
    "to explain each aspect of your procedure. "
    f"{PRINCIPAL_INVESTIGATOR}, at the end of the discussion in your summary, "
    "please describe how to fully reproduce the paper "
    "by combining the procedures outlined by each of your team members."
)

In [17]:
# Questions passed as agenda_questions to the paper-reproduction meeting.
# Keep the trailing comma on every entry so adjacent string literals are
# never concatenated into a single question.
ESM_REIMPLEMENT_QUESTIONS = (
    "How will you obtain the antibody sequences that were evolved in the paper?",
    "What portion of these sequences will you evolve in order to match the paper?",
    "How will you computationally evolve these sequences using the same methods as in the paper?",
    "How will you experimentally validate the designed antibodies using the same methods as in the paper?",
)

In [18]:
# Team for the paper-reproduction meeting: four of the six agents in
# team_members (the computational and structural biologists are left out).
ESM_REIMPLEMENT_TEAM = (
    VIROLOGIST,
    IMMUNOLOGIST,
    DATA_SCIENTIST,
    EXPERIMENTAL_BIOLOGIST,
)

In [19]:
# Run ESM reimplement: the reduced team meets once per iteration with the ESM
# antibodies paper as its only context, for a single discussion round; each run
# is saved as discussion_<n> under <save_dir>/esm_reimplement.
esm_reimplement_dir = save_dir / "esm_reimplement"
paper_contexts = (ESM_ANTIBODIES_PAPER,)
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    run_scientific_meeting(
        # who attends
        team_lead=team_lead,
        team_members=ESM_REIMPLEMENT_TEAM,
        # what is discussed
        agenda=ESM_REIMPLEMENT_PROMPT,
        agenda_questions=ESM_REIMPLEMENT_QUESTIONS,
        contexts=paper_contexts,
        # how the meeting is run and where it is saved
        num_rounds=1,
        model=model,
        save_dir=esm_reimplement_dir,
        save_name=f"discussion_{iteration_num + 1}",
    )

Project Iterations:   0%|          | 0/3 [00:00<?, ?it/s]
Rounds (+ Summary Round):   0%|          | 0/2 [00:00<?, ?it/s][A

Team:   0%|          | 0/5 [00:00<?, ?it/s][A[A

Team:  20%|██        | 1/5 [00:10<00:41, 10.30s/it][A[A

Team:  40%|████      | 2/5 [00:28<00:44, 14.90s/it][A[A

Team:  60%|██████    | 3/5 [00:37<00:24, 12.03s/it][A[A

Team:  80%|████████  | 4/5 [00:55<00:14, 14.51s/it][A[A

Team: 100%|██████████| 5/5 [01:19<00:00, 15.88s/it][A[A

Rounds (+ Summary Round):  50%|█████     | 1/2 [01:19<01:19, 79.43s/it][A

Team:   0%|          | 0/5 [00:25<?, ?it/s][A[A

Rounds (+ Summary Round): 100%|██████████| 2/2 [01:45<00:00, 52.61s/it][A
Project Iterations:  33%|███▎      | 1/3 [01:45<03:30, 105.23s/it]

Input token count: 158,223
Output token count: 3,951
Max token length: 29,706
Cost: $0.85
Time: 1:45



Rounds (+ Summary Round):   0%|          | 0/2 [00:00<?, ?it/s][A

Team:   0%|          | 0/5 [00:00<?, ?it/s][A[A

Team:  20%|██        | 1/5 [00:10<00:42, 10.65s/it][A[A

Team:  40%|████      | 2/5 [00:18<00:27,  9.25s/it][A[A

Team:  60%|██████    | 3/5 [00:30<00:20, 10.27s/it][A[A

Team:  80%|████████  | 4/5 [00:48<00:13, 13.41s/it][A[A

Team: 100%|██████████| 5/5 [01:16<00:00, 15.22s/it][A[A

Rounds (+ Summary Round):  50%|█████     | 1/2 [01:16<01:16, 76.09s/it][A

Team:   0%|          | 0/5 [00:32<?, ?it/s][A[A

Rounds (+ Summary Round): 100%|██████████| 2/2 [01:48<00:00, 54.33s/it][A
Project Iterations:  67%|██████▋   | 2/3 [03:33<01:47, 107.25s/it]

Input token count: 160,175
Output token count: 4,945
Max token length: 30,700
Cost: $0.88
Time: 1:48



Rounds (+ Summary Round):   0%|          | 0/2 [00:00<?, ?it/s][A

Team:   0%|          | 0/5 [00:00<?, ?it/s][A[A

Team:  20%|██        | 1/5 [00:09<00:37,  9.30s/it][A[A

Team:  40%|████      | 2/5 [00:23<00:36, 12.02s/it][A[A

Team:  60%|██████    | 3/5 [00:36<00:24, 12.42s/it][A[A

Team:  80%|████████  | 4/5 [00:52<00:14, 14.03s/it][A[A

Team: 100%|██████████| 5/5 [01:19<00:00, 15.97s/it][A[A

Rounds (+ Summary Round):  50%|█████     | 1/2 [01:19<01:19, 79.87s/it][A

Team:   0%|          | 0/5 [00:23<?, ?it/s][A[A

Rounds (+ Summary Round): 100%|██████████| 2/2 [01:43<00:00, 51.66s/it][A
Project Iterations: 100%|██████████| 3/3 [05:17<00:00, 105.74s/it]

Input token count: 160,730
Output token count: 4,627
Max token length: 30,382
Cost: $0.87
Time: 1:43





In [None]:
# TODO: extend beyond ESM paper after reimplementation is successful