In [1]:
from pathlib import Path

from tqdm import trange

from agent import Agent
from prompts import ANTIBODIES_CONTEXTS, ESM_ANTIBODIES_PAPER, PRINCIPAL_INVESTIGATOR
from run_individual_meeting import run_individual_meeting
from run_scientific_meeting import run_scientific_meeting
from utils import load_summaries

In [2]:
# Set up key parameters
# -- meeting participants / backend: the lead agent, the model name and the
#    shared contexts are forwarded unchanged to every meeting call below.
team_lead = PRINCIPAL_INVESTIGATOR
model = "gpt-4o"
contexts = ANTIBODIES_CONTEXTS
# -- repetition: each meeting type is run num_iterations times; team meetings
#    are additionally given num_rounds.
num_iterations = 3
num_rounds = 3
# -- outputs: root directory for saved discussions, plus the (initially empty)
#    list of discussion files that later cells add to and load summaries from.
save_dir = Path("antibody_design")
discussion_paths = []

In [3]:
# Ask PI to select team members
# Agenda text for the team-selection meeting. Written as adjacent string
# fragments (joined by the parser into one literal) purely for readability.
ANTIBODIES_TEAM_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "You need to select a team of scientists to help you with this project. "
    "Please select the team members you would like to invite to a discussion "
    "to design the antibody discovery approach. "
    "For each team member, please specify the following: "
    "1. Title, 2. Expertise, 3. Goal, and 4. Role. "
    "Please use your own description as an example."
)

In [4]:
# Select team members
# The team lead is given the team-selection agenda in num_iterations separate
# individual meetings; each call gets its own 1-based save name.
team_selection_dir = save_dir / "team_selection"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    discussion_name = f"discussion_{iteration_num + 1}"
    run_individual_meeting(
        team_member=team_lead,
        agenda=ANTIBODIES_TEAM_PROMPT,
        contexts=contexts,
        model=model,
        save_dir=team_selection_dir,
        save_name=discussion_name,
    )

Project Iterations:  33%|███▎      | 1/3 [00:16<00:32, 16.43s/it]

Input token count: 194
Output token count: 704
Max token length: 898
Cost: $0.01
Time: 0:16


Project Iterations:  67%|██████▋   | 2/3 [00:29<00:14, 14.23s/it]

Input token count: 194
Output token count: 623
Max token length: 817
Cost: $0.01
Time: 0:12


Project Iterations: 100%|██████████| 3/3 [00:41<00:00, 13.73s/it]

Input token count: 194
Output token count: 622
Max token length: 816
Cost: $0.01
Time: 0:12





In [4]:
# Add team members based on discussion 2
# Each scientist is an Agent described by four free-text fields
# (title, expertise, goal, role). Long descriptions are written as adjacent
# string fragments; the resulting values are unchanged.
COMPUTATIONAL_BIOLOGIST = Agent(
    title="Computational Biologist",
    expertise="bioinformatics, protein structure prediction, and molecular dynamics simulations",
    goal=(
        "to identify potential antibody candidates using computational methods "
        "and predict their binding affinity to the SARS-CoV-2 spike protein"
    ),
    role=(
        "to run in silico screenings of antibody libraries, "
        "modeling antibody-spike protein interactions, "
        "and providing a shortlist of promising candidates for experimental validation"
    ),
)
IMMUNOLOGIST = Agent(
    title="Immunologist",
    expertise="immune response mechanisms, antibody generation, and characterization",
    goal=(
        "to guide the selection of antibody candidates based on immunological principles "
        "and ensure the candidates have the potential for broad-spectrum activity"
    ),
    role=(
        "to provide insights into the immune response to SARS-CoV-2, "
        "help design the antibody generation strategy, "
        "and interpret the results from binding and neutralization assays"
    ),
)
STRUCTURAL_BIOLOGIST = Agent(
    title="Structural Biologist",
    expertise=(
        "X-ray crystallography, cryo-electron microscopy (cryo-EM), "
        "and protein structure determination"
    ),
    goal=(
        "to determine the high-resolution structures of antibody-spike protein complexes "
        "to understand the binding mechanisms and improve antibody design"
    ),
    role=(
        "to solve the structures of selected antibody-spike protein complexes "
        "and providing structural insights to refine antibody candidates"
    ),
)
VIROLOGIST = Agent(
    title="Virologist",
    expertise="SARS-CoV-2 biology, viral entry mechanisms, and neutralization assays",
    goal=(
        "to validate the neutralizing activity of antibody candidates "
        "against SARS-CoV-2 and its variants"
    ),
    role=(
        "to oversee the design and execution of neutralization assays, "
        "interpret the results, "
        "and ensure that the selected antibodies are effective against multiple variants of the virus"
    ),
)
DATA_SCIENTIST = Agent(
    title="Data Scientist",
    expertise="machine learning, data analysis, and predictive modeling",
    goal=(
        "to develop and apply machine learning models "
        "to predict the efficacy and broad-spectrum activity of antibody candidates"
    ),
    role=(
        "to analyze experimental data, develop predictive models, "
        "and integrate data from various sources "
        "to guide the selection and optimization of antibody candidates"
    ),
)
EXPERIMENTAL_BIOLOGIST = Agent(
    title="Experimental Biologist",
    expertise="antibody engineering, protein expression, and purification",
    goal=(
        "to produce and characterize the antibody candidates "
        "identified through computational and immunological methods"
    ),
    role=(
        "to express, purify, and initially characterize antibody candidates, "
        "as well as to prepare samples for binding and neutralization assays"
    ),
)

# Fixed roster (a tuple, in the order defined above) passed as `team_members`
# to the team meetings.
team_members = (
    COMPUTATIONAL_BIOLOGIST,
    IMMUNOLOGIST,
    STRUCTURAL_BIOLOGIST,
    VIROLOGIST,
    DATA_SCIENTIST,
    EXPERIMENTAL_BIOLOGIST,
)

In [5]:
# Agenda text for the ML-based antibody discovery design meeting. Adjacent
# string fragments are joined by the parser into one literal.
ANTIBODIES_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "Please design a specific antibody discovery approach for this target "
    "that uses machine learning to design antibody candidates. "
    "Decide what specific machine learning model to use and precisely how it will be used. "
    "If the model needs to be trained, please decide on a specific dataset for training. "
    "If the model is pre-trained, please explain exactly how it will be used "
    "for identifying or designing antibody candidates."
)

In [6]:
# Agenda questions passed alongside ANTIBODIES_PROMPT to the project-design
# meeting. One question per tuple element, each followed by its own comma.
ANTIBODIES_QUESTIONS = (
    # Design strategy
    "Will you design the antibodies de novo or will you modify existing antibodies?",
    # Model choice
    "Will you train a model from scratch or use a pre-trained model?",
    "What specific model architecture will you use?",
    # Data and usage
    "If training a model, what dataset will you use for training?",
    "How exactly will you use your model to design antibodies?",
)

In [7]:
# Run antibodies project design
# The full team (lead + team_members) is given the ML design agenda and its
# questions in num_iterations separate meetings of num_rounds rounds each;
# each call gets its own 1-based save name.
project_design_dir = save_dir / "project_design"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    discussion_name = f"discussion_{iteration_num + 1}"
    run_scientific_meeting(
        team_lead=team_lead,
        team_members=team_members,
        agenda=ANTIBODIES_PROMPT,
        agenda_questions=ANTIBODIES_QUESTIONS,
        contexts=contexts,
        num_rounds=num_rounds,
        model=model,
        save_dir=project_design_dir,
        save_name=discussion_name,
    )

Project Iterations:   0%|          | 0/3 [00:00<?, ?it/s]
Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:13<01:20, 13.42s/it][A[A

Team:  29%|██▊       | 2/7 [00:26<01:05, 13.17s/it][A[A

Team:  43%|████▎     | 3/7 [00:35<00:45, 11.30s/it][A[A

Team:  57%|█████▋    | 4/7 [00:54<00:42, 14.16s/it][A[A

Team:  71%|███████▏  | 5/7 [01:08<00:28, 14.39s/it][A[A

Team:  86%|████████▌ | 6/7 [01:25<00:15, 15.33s/it][A[A

Team: 100%|██████████| 7/7 [01:38<00:00, 14.10s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:38<04:56, 98.68s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:17<01:45, 17.56s/it][A[A

Team:  29%|██▊       | 2/7 [00:29<01:10, 14.00s/it][A[A

Team:  43%|████▎     | 3/7 [00:45<01:00, 15.01s/it][A[A

Team:  57%|█████▋    | 4/7 [00:57<00:41, 13.75s/it][A[A

Team:  71%|███████▏  | 5/7 [01:22<00:36, 18.01s

Input token count: 188,279
Output token count: 15,249
Max token length: 18,124
Cost: $1.17
Time: 6:34



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:10<01:05, 11.00s/it][A[A

Team:  29%|██▊       | 2/7 [00:28<01:13, 14.68s/it][A[A

Team:  43%|████▎     | 3/7 [00:39<00:52, 13.01s/it][A[A

Team:  57%|█████▋    | 4/7 [00:56<00:43, 14.51s/it][A[A

Team:  71%|███████▏  | 5/7 [01:23<00:38, 19.31s/it][A[A

Team:  86%|████████▌ | 6/7 [01:38<00:17, 17.79s/it][A[A

Team: 100%|██████████| 7/7 [01:50<00:00, 15.85s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:50<05:32, 110.98s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:18<01:50, 18.41s/it][A[A

Team:  29%|██▊       | 2/7 [00:27<01:04, 12.83s/it][A[A

Team:  43%|████▎     | 3/7 [00:35<00:43, 10.80s/it][A[A

Team:  57%|█████▋    | 4/7 [00:43<00:29,  9.68s/it][A[A

Team:  71%|███████▏  | 5/7 [00:51<00:17,  8.90s/it][A[A

Team:  86%|████████▌ | 6/7 [00:59<00:08,  8.

Input token count: 163,435
Output token count: 11,639
Max token length: 14,514
Cost: $0.99
Time: 4:52



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:14<01:29, 14.84s/it][A[A

Team:  29%|██▊       | 2/7 [00:34<01:28, 17.74s/it][A[A

Team:  43%|████▎     | 3/7 [00:45<00:58, 14.59s/it][A[A

Team:  57%|█████▋    | 4/7 [00:56<00:39, 13.28s/it][A[A

Team:  71%|███████▏  | 5/7 [01:13<00:28, 14.46s/it][A[A

Team:  86%|████████▌ | 6/7 [01:26<00:14, 14.15s/it][A[A

Team: 100%|██████████| 7/7 [01:45<00:00, 15.05s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:45<05:16, 105.37s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:15<01:35, 15.88s/it][A[A

Team:  29%|██▊       | 2/7 [00:27<01:06, 13.22s/it][A[A

Team:  43%|████▎     | 3/7 [00:41<00:54, 13.73s/it][A[A

Team:  57%|█████▋    | 4/7 [00:58<00:44, 14.90s/it][A[A

Team:  71%|███████▏  | 5/7 [01:14<00:30, 15.49s/it][A[A

Team:  86%|████████▌ | 6/7 [01:31<00:15, 15.

Input token count: 192,611
Output token count: 15,753
Max token length: 18,628
Cost: $1.20
Time: 8:30





In [7]:
# Agenda text for the ESM-specific design meeting. Adjacent string fragments
# are joined by the parser into one literal.
ESM_PROMPT = (
    "You are working on a project to develop antibodies for the SARS-CoV-2 spike protein, "
    "ideally for the newest variant of the virus and with broad spectrum activity across variants. "
    "You will use the ESM family of models for antibody design. "
    "Please design a method for applying ESM to this antibody design problem. "
    "Specify the exact model you will use and how you will use it to design antibodies."
)

In [8]:
# Agenda questions for the ESM design meeting (passed as `agenda_questions`
# together with ESM_PROMPT).
# NOTE: every element must end with its own comma. Adjacent string literals
# without a comma are concatenated by the parser, which previously fused the
# first two questions into a single tuple element (6 items instead of 7).
ESM_QUESTIONS = (
    "Which ESM model will you use?",
    "Will you design the antibodies de novo or will you modify existing antibodies?",
    "If modifying existing antibodies, which precise antibody or antibodies will you modify?",
    "How exactly will you use your model to design antibodies?",
    "What is the precise process for designing antibodies and selecting candidates for experimental validation?",
    "How will you computationally evaluate the quality of the design antibodies?",
    "What objectives will you optimize for in the design process?",
)

In [9]:
# Run ESM project design
# Same team-meeting setup as the general project design, but with the
# ESM-specific agenda and questions; outputs are directed to the "esm"
# subdirectory with a 1-based save name per iteration.
esm_dir = save_dir / "esm"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    discussion_name = f"discussion_{iteration_num + 1}"
    run_scientific_meeting(
        team_lead=team_lead,
        team_members=team_members,
        agenda=ESM_PROMPT,
        agenda_questions=ESM_QUESTIONS,
        contexts=contexts,
        num_rounds=num_rounds,
        model=model,
        save_dir=esm_dir,
        save_name=discussion_name,
    )

Project Iterations:   0%|          | 0/3 [00:00<?, ?it/s]
Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:12<01:17, 12.88s/it][A[A

Team:  29%|██▊       | 2/7 [00:30<01:18, 15.66s/it][A[A

Team:  43%|████▎     | 3/7 [00:41<00:54, 13.75s/it][A[A

Team:  57%|█████▋    | 4/7 [00:56<00:42, 14.14s/it][A[A

Team:  71%|███████▏  | 5/7 [01:12<00:29, 14.90s/it][A[A

Team:  86%|████████▌ | 6/7 [01:34<00:17, 17.12s/it][A[A

Team: 100%|██████████| 7/7 [01:47<00:00, 15.39s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:47<05:23, 107.70s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:16<01:41, 16.94s/it][A[A

Team:  29%|██▊       | 2/7 [00:29<01:10, 14.17s/it][A[A

Team:  43%|████▎     | 3/7 [00:41<00:53, 13.39s/it][A[A

Team:  57%|█████▋    | 4/7 [00:56<00:42, 14.04s/it][A[A

Team:  71%|███████▏  | 5/7 [01:07<00:25, 12.79

Input token count: 189,867
Output token count: 15,080
Max token length: 17,950
Cost: $1.18
Time: 6:21



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:22<02:17, 22.90s/it][A[A

Team:  29%|██▊       | 2/7 [00:35<01:23, 16.61s/it][A[A

Team:  43%|████▎     | 3/7 [00:49<01:03, 15.76s/it][A[A

Team:  57%|█████▋    | 4/7 [01:10<00:53, 17.69s/it][A[A

Team:  71%|███████▏  | 5/7 [01:21<00:30, 15.38s/it][A[A

Team:  86%|████████▌ | 6/7 [01:36<00:15, 15.02s/it][A[A

Team: 100%|██████████| 7/7 [01:52<00:00, 16.04s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:52<05:36, 112.30s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:19<01:54, 19.04s/it][A[A

Team:  29%|██▊       | 2/7 [00:27<01:04, 12.96s/it][A[A

Team:  43%|████▎     | 3/7 [00:35<00:42, 10.50s/it][A[A

Team:  57%|█████▋    | 4/7 [00:47<00:33, 11.12s/it][A[A

Team:  71%|███████▏  | 5/7 [00:57<00:21, 10.92s/it][A[A

Team:  86%|████████▌ | 6/7 [01:05<00:09,  9.

Input token count: 182,640
Output token count: 14,052
Max token length: 16,922
Cost: $1.12
Time: 5:17



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:11<01:08, 11.34s/it][A[A

Team:  29%|██▊       | 2/7 [00:26<01:07, 13.54s/it][A[A

Team:  43%|████▎     | 3/7 [00:40<00:55, 13.85s/it][A[A

Team:  57%|█████▋    | 4/7 [00:57<00:45, 15.01s/it][A[A

Team:  71%|███████▏  | 5/7 [01:09<00:28, 14.03s/it][A[A

Team:  86%|████████▌ | 6/7 [01:23<00:13, 13.96s/it][A[A

Team: 100%|██████████| 7/7 [01:39<00:00, 14.19s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [01:39<04:57, 99.31s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:21<02:07, 21.30s/it][A[A

Team:  29%|██▊       | 2/7 [00:28<01:06, 13.30s/it][A[A

Team:  43%|████▎     | 3/7 [00:44<00:56, 14.23s/it][A[A

Team:  57%|█████▋    | 4/7 [00:57<00:41, 13.99s/it][A[A

Team:  71%|███████▏  | 5/7 [01:09<00:25, 12.97s/it][A[A

Team:  86%|████████▌ | 6/7 [01:23<00:13, 13.5

Input token count: 193,402
Output token count: 15,651
Max token length: 18,521
Cost: $1.20
Time: 5:51





In [9]:
# Select preferred summary
# discussion_paths is a module-level list, so an unconditional append would add
# the same file again every time this cell is re-executed, and the duplicates
# would then be handed to load_summaries. Guarding the append keeps the cell
# idempotent while leaving a first run unchanged.
preferred_summary_path = save_dir / "esm" / "discussion_2.json"
if preferred_summary_path not in discussion_paths:
    discussion_paths.append(preferred_summary_path)

In [10]:
# Load summaries
# load_summaries is imported from the project's utils module and receives every
# path collected in discussion_paths. The result is forwarded as `summaries=`
# to the ESM implementation meeting.
# NOTE(review): presumably returns one summary per discussion file — confirm
# against utils.load_summaries.
summaries = load_summaries(discussion_paths=discussion_paths)

In [11]:
# Agenda text for the individual ESM implementation meeting. Adjacent string
# fragments are joined by the parser into one literal.
ESM_IMPLEMENT_PROMPT = (
    "You now need to implement an ESM-based approach for antibody design. "
    "Please write code to implement the method you designed in the previous discussion. "
    "Your code must be self-contained (with appropriate imports), "
    "and you must explain how the code works."
)

In [18]:
# Run ESM implement
# The Data Scientist agent alone is given the implementation agenda together
# with the previously loaded summaries, once per iteration; each call gets
# its own 1-based save name.
esm_implement_dir = save_dir / "esm_implement"
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    discussion_name = f"discussion_{iteration_num + 1}"
    run_individual_meeting(
        team_member=DATA_SCIENTIST,
        agenda=ESM_IMPLEMENT_PROMPT,
        summaries=summaries,
        contexts=contexts,
        model=model,
        save_dir=esm_implement_dir,
        save_name=discussion_name,
    )

Project Iterations:  33%|███▎      | 1/3 [00:33<01:07, 33.87s/it]

Input token count: 2,520
Output token count: 1,487
Max token length: 4,007
Cost: $0.03
Time: 0:33


Project Iterations:  67%|██████▋   | 2/3 [01:01<00:30, 30.32s/it]

Input token count: 2,520
Output token count: 1,265
Max token length: 3,785
Cost: $0.03
Time: 0:27


Project Iterations: 100%|██████████| 3/3 [01:52<00:00, 37.64s/it]

Input token count: 2,520
Output token count: 1,946
Max token length: 4,466
Cost: $0.04
Time: 0:51





In [12]:
# Agenda text for the paper-reproduction meeting. Adjacent string fragments
# are joined by the parser into one literal.
# NOTE(review): "scientifc" is misspelled in the prompt; it is kept verbatim
# here so the text sent to the model is unchanged. Fix deliberately if desired.
ESM_REIMPLEMENT_PROMPT = (
    "Above is the full text from a recent scientifc paper describing the use of ESM, "
    "a pre-trained protein language model, to design antibodies. "
    "Please read the paper and then describe in detail how to reproduce the results in the paper "
    "and what steps you would take next to build upon this work."
)

In [13]:
# Agenda questions for the paper-reproduction meeting. Each element is wrapped
# in its own parentheses so the fragments inside are joined into one string
# while the commas between elements stay explicit.
# NOTE(review): "futher" is misspelled in the last question; it is kept
# verbatim here so the text sent to the model is unchanged.
ESM_REIMPLEMENT_QUESTIONS = (
    (
        "How would you reproduce the results in the paper? "
        "Please provide a detailed, step-by-step set of instructions."
    ),
    (
        "How would you reimplement the computational parts of the paper? "
        "Please provide self-contained Python code that implements the main experiments from the paper."
    ),
    (
        "What lab experiments would you run to validate the computationally designed antibodies? "
        "Please provide a step-by-step lab protocol."
    ),
    (
        "How would you extend beyond this paper to futher advance the science of "
        "protein language models for antibody design?"
    ),
)

In [14]:
# Run ESM reimplement
# The shared contexts are extended with the ESM antibodies paper for this
# meeting only (tuple concatenation; `contexts` itself is not modified).
# Each call gets its own 1-based save name.
esm_reimplement_dir = save_dir / "esm_reimplement"
contexts_with_paper = contexts + (ESM_ANTIBODIES_PAPER,)
for iteration_num in trange(num_iterations, desc="Project Iterations"):
    discussion_name = f"discussion_{iteration_num + 1}"
    run_scientific_meeting(
        team_lead=team_lead,
        team_members=team_members,
        agenda=ESM_REIMPLEMENT_PROMPT,
        agenda_questions=ESM_REIMPLEMENT_QUESTIONS,
        contexts=contexts_with_paper,
        num_rounds=num_rounds,
        model=model,
        save_dir=esm_reimplement_dir,
        save_name=discussion_name,
    )

Project Iterations:   0%|          | 0/3 [00:00<?, ?it/s]
Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:15<01:32, 15.49s/it][A[A

Team:  29%|██▊       | 2/7 [00:39<01:43, 20.75s/it][A[A

Team:  43%|████▎     | 3/7 [00:57<01:16, 19.21s/it][A[A

Team:  57%|█████▋    | 4/7 [01:14<00:55, 18.38s/it][A[A

Team:  71%|███████▏  | 5/7 [02:01<00:57, 28.78s/it][A[A

Team:  86%|████████▌ | 6/7 [02:28<00:28, 28.01s/it][A[A

Team: 100%|██████████| 7/7 [02:45<00:00, 23.60s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [02:45<08:15, 165.21s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [01:20<08:01, 80.26s/it][A[A

Team:  29%|██▊       | 2/7 [01:43<03:54, 46.94s/it][A[A

Team:  43%|████▎     | 3/7 [02:19<02:46, 41.73s/it][A[A

Team:  57%|█████▋    | 4/7 [02:45<01:46, 35.64s/it][A[A

Team:  71%|███████▏  | 5/7 [03:10<01:03, 31.63

Input token count: 802,592
Output token count: 19,064
Max token length: 48,126
Cost: $4.30
Time: 11:15



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:15<01:33, 15.55s/it][A[A

Team:  29%|██▊       | 2/7 [00:49<02:13, 26.67s/it][A[A

Team:  43%|████▎     | 3/7 [01:04<01:25, 21.33s/it][A[A

Team:  57%|█████▋    | 4/7 [01:25<01:03, 21.00s/it][A[A

Team:  71%|███████▏  | 5/7 [01:42<00:39, 19.74s/it][A[A

Team:  86%|████████▌ | 6/7 [02:07<00:21, 21.29s/it][A[A

Team: 100%|██████████| 7/7 [02:31<00:00, 21.65s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [02:31<07:34, 151.56s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:43<04:22, 43.69s/it][A[A

Team:  29%|██▊       | 2/7 [00:58<02:14, 26.87s/it][A[A

Team:  43%|████▎     | 3/7 [01:25<01:48, 27.00s/it][A[A

Team:  57%|█████▋    | 4/7 [01:54<01:23, 27.80s/it][A[A

Team:  71%|███████▏  | 5/7 [02:28<00:59, 29.94s/it][A[A

Team:  86%|████████▌ | 6/7 [02:52<00:27, 27.

Input token count: 782,601
Output token count: 17,211
Max token length: 46,273
Cost: $4.17
Time: 9:52



Rounds (+ Summary Round):   0%|          | 0/4 [00:00<?, ?it/s][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:22<02:12, 22.13s/it][A[A

Team:  29%|██▊       | 2/7 [00:42<01:46, 21.21s/it][A[A

Team:  43%|████▎     | 3/7 [01:45<02:40, 40.17s/it][A[A

Team:  57%|█████▋    | 4/7 [02:05<01:36, 32.26s/it][A[A

Team:  71%|███████▏  | 5/7 [02:22<00:53, 26.66s/it][A[A

Team:  86%|████████▌ | 6/7 [02:41<00:24, 24.19s/it][A[A

Team: 100%|██████████| 7/7 [03:02<00:00, 26.10s/it][A[A

Rounds (+ Summary Round):  25%|██▌       | 1/4 [03:02<09:08, 182.71s/it][A

Team:   0%|          | 0/7 [00:00<?, ?it/s][A[A

Team:  14%|█▍        | 1/7 [00:28<02:48, 28.01s/it][A[A

Team:  29%|██▊       | 2/7 [00:49<01:59, 23.97s/it][A[A

Team:  43%|████▎     | 3/7 [01:10<01:30, 22.67s/it][A[A

Team:  57%|█████▋    | 4/7 [01:29<01:04, 21.48s/it][A[A

Team:  71%|███████▏  | 5/7 [01:50<00:42, 21.28s/it][A[A

Team:  86%|████████▌ | 6/7 [02:12<00:21, 21.

Input token count: 786,830
Output token count: 16,959
Max token length: 46,021
Cost: $4.19
Time: 8:57



