In [None]:
import concurrent.futures

from virtual_lab.constants import CONSISTENT_TEMPERATURE, CREATIVE_TEMPERATURE
from virtual_lab.prompts import (
    REWRITE_PROMPT,
    create_merge_prompt,
)
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import load_summaries

from nanobody_constants import (
    background_prompt,
    nanobody_prompt,
    experimental_results_prompt,
    num_iterations,
    num_rounds,
    discussions_phase_to_dir,
    principal_investigator,
    team_members,
    machine_learning_specialist,
    computational_biologist,
)

## Nanobody Improvement

In [None]:
# Nanobody improvement - prompts
# Agenda: the shared background, nanobody, and experimental-results prompts,
# followed by the task description for the next design round. The text is
# written one sentence per literal; adjacent literals are concatenated by
# Python, so each one carries its own trailing space or newline.
nanobody_improvement_agenda = (
    f"{background_prompt} {nanobody_prompt} {experimental_results_prompt}\n\n"
    "Based on these results, you must decide how to proceed to design further improved nanobodies for recent variants of SARS-CoV-2. "
    "You may either continue to pursue identifying binders to the KP.3 RBD or you may decide to target a different strain or strains of the SARS-CoV-2 spike RBD. "
    "Your team should select another 92 mutant nanobodies to test experimentally. "
    "This time, you do not have to test an equal number of mutants for each wild-type nanobody, and you may even entirely leave out some wild-type nanobodies. "
    "You can either start again from the wild-type nanobodies and introduce mutations or begin with one of the previously designed nanobodies. "
    "You may use ESM, AlphaFold-Multimer, and Rosetta in similar ways as your previous design process, or you can change the design workflow. "
    "As a reminder, ESM is antigen-agnostic and improves general nanobody quality while AlphaFold-Multimer and Rosetta are antigen-specific and focus on binding to the selected antigen. "
    "You should aim to design nanobodies that bind specifically to the RBD of recent variants of SARS-CoV-2 and do not bind non-specifically to BSA. "
    "You should provide a rationale for all of your decisions."
)

# Questions handed to the meeting (and later to the merge prompt) alongside
# the agenda.
nanobody_improvement_questions = (
    "Will you continue to target the KP.3 RBD or will you target a different strain or strains of the SARS-CoV-2 spike RBD?",
    "Which wild-type nanobody or nanobodies will you select for further improvement?",
    "How many mutant nanobodies will you design for each of those wild-type nanobodies (92 total)?",
    "Will you begin with the wild-type nanobody itself and introduce mutations, or will you begin with one of the previously designed mutant nanobodies?",
    "If you are beginning with previously designed mutant nanobodies, which one or ones will you start with?",
    "Will you continue to use the same general ESM, AlphaFold-Multimer, and Rosetta design pipeline you previously used, or will you design a new computational pipeline?",
)

# Prior context for this phase: the merged summary of the workflow_design phase.
nanobody_improvement_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["workflow_design"] / "merged.json",
    ],
)
print(f"Number of prior summaries: {len(nanobody_improvement_prior_summaries)}")

In [None]:
# Nanobody improvement - discussion
# Submit `num_iterations` team meetings to a thread pool so they run in
# parallel. Every meeting receives the same agenda, questions, and prior
# summaries; only save_name differs (discussion_1, discussion_2, ...).
with concurrent.futures.ThreadPoolExecutor() as executor:
    nanobody_improvement_futures = [
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            summaries=nanobody_improvement_prior_summaries,
            agenda=nanobody_improvement_agenda,
            agenda_questions=nanobody_improvement_questions,
            save_dir=discussions_phase_to_dir["nanobody_improvement"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            num_rounds=num_rounds,
        )
        for iteration_num in range(num_iterations)
    ]

    # Block until every meeting has finished, whether it succeeded or failed.
    concurrent.futures.wait(nanobody_improvement_futures)

    # wait() only waits: an exception raised inside run_meeting stays stored on
    # its future and would otherwise go unnoticed. result() re-raises it here,
    # so a failed meeting stops the notebook instead of silently leaving fewer
    # discussions for the merge step.
    for meeting_future in nanobody_improvement_futures:
        meeting_future.result()

In [None]:
# Nanobody improvement - merge
# Load every per-iteration discussion saved for this phase. The glob matches
# are sorted because directory listing order is filesystem-dependent; sorting
# gives the merge meeting its summaries in the same order on every run.
# NOTE(review): assumes discussions_phase_to_dir values are pathlib.Path
# objects (they are used with `/` and `.glob` in this notebook) - confirm.
nanobody_improvement_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["nanobody_improvement"].glob("discussion_*.json")))
print(f"Number of summaries: {len(nanobody_improvement_summaries)}")

# Build the merge agenda from the same agenda and questions that the
# individual discussions used.
nanobody_improvement_merge_prompt = create_merge_prompt(
    agenda=nanobody_improvement_agenda,
    agenda_questions=nanobody_improvement_questions,
)

# Individual meeting with the principal investigator over all discussion
# summaries; the result is saved as "merged" in the same phase directory.
run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=nanobody_improvement_summaries,
    agenda=nanobody_improvement_merge_prompt,
    save_dir=discussions_phase_to_dir["nanobody_improvement"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

## Updated Workflow

In [None]:
# Updated workflow - prompts
# Agenda: the shared background, nanobody, and experimental-results prompts,
# followed by the request to detail the updated design workflow. One sentence
# per literal; adjacent literals are concatenated by Python.
updated_workflow_agenda = (
    f"{background_prompt} {nanobody_prompt} {experimental_results_prompt}\n\n"
    "Your team has decided to design further improved nanobodies for recent variants of SARS-CoV-2 as described in the summary using the same general ESM, AlphaFold-Multimer, and Rosetta computational design pipeline. "
    "ESM will again be used to evaluate the overall quality of mutated nanobodies while AlphaFold-Multimer and Rosetta will be used to determine their specific binding to your selected SARS-CoV-2 variants. "
    "Now, you need to specify more details for this updated design workflow."
)

# Questions come in pairs: a workflow parameter, then the rationale if it
# differs from the previous design process.
updated_workflow_questions = (
    "When using ESM to evaluate all single point mutations to an input nanobody sequence, how many of the top ranked mutations by ESM LLR will you keep for analysis by AlphaFold-Multimer and Rosetta out of the ~2,000 possible mutations?",
    "If this number differs from your previous design process, what is the rationale for the change?",
    "After evaluating those mutated nanobodies with ESM, AlphaFold-Multimer, and Rosetta, what formula will you use to compute a weighted score (WS) for each mutated nanobody, and how will this factor in binding to both the JN.1 and KP.3 RBDs for the AlphaFold-Multimer and Rosetta portions of the score?",
    "If this WS formula differs from your previous design process, what is the rationale for the change?",
    "After computing the WS for each mutated nanobody, how many of the top ranked mutated nanobodies will you select for the next round of mutation?",
    "If this number differs from your previous design process, what is the rationale for the change?",
    "How many rounds of mutation will you run in total starting with the Nb21 and Ty1 mutants?",
    "If this number differs from your previous design process, what is the rationale for the change?",
)

# Prior context: the merged summaries of the workflow_design and
# nanobody_improvement phases, in that order.
updated_workflow_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["workflow_design"] / "merged.json",
        discussions_phase_to_dir["nanobody_improvement"] / "merged.json",
    ],
)
print(f"Number of prior summaries: {len(updated_workflow_prior_summaries)}")

In [None]:
# Updated workflow - discussion
# Submit `num_iterations` individual meetings with the principal investigator
# to a thread pool so they run in parallel. Every meeting receives the same
# agenda, questions, and prior summaries; only save_name differs
# (discussion_1, discussion_2, ...).
with concurrent.futures.ThreadPoolExecutor() as executor:
    updated_workflow_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            summaries=updated_workflow_prior_summaries,
            agenda=updated_workflow_agenda,
            agenda_questions=updated_workflow_questions,
            save_dir=discussions_phase_to_dir["updated_workflow"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        )
        for iteration_num in range(num_iterations)
    ]

    # Block until every meeting has finished, whether it succeeded or failed.
    concurrent.futures.wait(updated_workflow_futures)

    # wait() only waits: an exception raised inside run_meeting stays stored on
    # its future and would otherwise go unnoticed. result() re-raises it here,
    # so a failed meeting stops the notebook instead of silently leaving fewer
    # discussions for the merge step.
    for meeting_future in updated_workflow_futures:
        meeting_future.result()

In [None]:
# Updated workflow - merge
# Load every per-iteration discussion saved for this phase. The glob matches
# are sorted because directory listing order is filesystem-dependent; sorting
# gives the merge meeting its summaries in the same order on every run.
# NOTE(review): assumes discussions_phase_to_dir values are pathlib.Path
# objects (they are used with `/` and `.glob` in this notebook) - confirm.
updated_workflow_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["updated_workflow"].glob("discussion_*.json")))
print(f"Number of summaries: {len(updated_workflow_summaries)}")

# Build the merge agenda from the same agenda and questions that the
# individual discussions used.
updated_workflow_merge_prompt = create_merge_prompt(
    agenda=updated_workflow_agenda,
    agenda_questions=updated_workflow_questions,
)

# Individual meeting with the principal investigator over all discussion
# summaries; the result is saved as "merged" in the same phase directory.
run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=updated_workflow_summaries,
    agenda=updated_workflow_merge_prompt,
    save_dir=discussions_phase_to_dir["updated_workflow"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

## Workflow Questions

In [None]:
# Workflow questions - prompts
# Agenda: the shared background and nanobody prompts, then three paragraphs:
# a recap of the current workflow, the observed non-determinism, and the two
# tasks. One sentence per literal; adjacent literals are concatenated by
# Python, and paragraph breaks are written as explicit "\n\n".
workflow_questions_agenda = (
    f"{background_prompt} {nanobody_prompt} "
    "You created a nanobody design workflow that employs ESM, AlphaFold-Multimer, and Rosetta to evaluate mutated nanobody candidates for binding to recent variants of the SARS-CoV-2 spike protein receptor binding domain (RBD). "
    "In particular, AlphaFold-Multimer is run with the mutated nanobody sequence and RBD sequence to predict the structure of the nanobody-RBD complex. "
    "By default, AlphaFold-Multimer generates five structural models and ranks them by confidence. "
    "Your workflow takes the highest confidence model as the predicted structure and computes the interface pLDDT (ipLDDT) of that structure. "
    "Then, Rosetta is applied to relax that best predicted complex structure and compute a binding interface energy (dG_separated) using the REF2015 scoring function.\n\n"
    "It turns out that AlphaFold-Multimer and Rosetta are not deterministic. "
    "When given the same nanobody and RBD sequences twice, AlphaFold-Multimer (run using LocalColabFold) computes the same MSA but generates a different set of five structures each time. "
    "This can result in dramatically different ipLDDT computations from the top ranked complex structures of two different runs (e.g., ipLDDT = 86.1 vs ipLDDT = 72.6). "
    "Similarly, when Rosetta is given the same nanobody-RBD complex structure twice, it computes two different binding energies (e.g., dG_separated = -32.2 vs dG_separated = -31.8). "
    "This variability can lead to different rankings of mutated nanobody candidates and different decisions on which candidates to select for experimental validation.\n\n"
    "First, please explain why AlphaFold-Multimer and Rosetta are not deterministic and how this non-determinism affects the ipLDDT and dG_separated scores. "
    "Second, propose a strategy to address this non-determinism in your nanobody design workflow. "
    "Your strategy should aim to reduce the variability in the ipLDDT and dG_separated scores and improve the consistency of the rankings of mutated nanobody candidates without increasing the computational cost."
)

# Questions handed to the meeting (and later to the merge prompt) alongside
# the agenda.
workflow_questions_questions = (
    "Why is AlphaFold-Multimer non-deterministic in this workflow, and how does this non-determinism affect the ipLDDT score?",
    "Why is Rosetta non-deterministic in this workflow, and how does this non-determinism affect the dG_separated score?",
    "How will you change the AlphaFold-Multimer component of the workflow (if at all) to improve ipLDDT score consistency while still only using one AlphaFold-Multimer run (five models)?",
    "In your new workflow, what AlphaFold-Multimer complex structure will you use as input to Rosetta?",
    "How will you change the Rosetta component of the workflow (if at all) to improve dG_separated score consistency while still only using one Rosetta run on one complex structure?",
)

# Prior context: the merged summaries of the workflow_design,
# nanobody_improvement, and updated_workflow phases, in that order.
workflow_questions_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["workflow_design"] / "merged.json",
        discussions_phase_to_dir["nanobody_improvement"] / "merged.json",
        discussions_phase_to_dir["updated_workflow"] / "merged.json",
    ],
)
print(f"Number of prior summaries: {len(workflow_questions_prior_summaries)}")

In [None]:
# Workflow questions - discussion
# Submit `num_iterations` individual meetings with the machine learning
# specialist to a thread pool so they run in parallel. Every meeting receives
# the same agenda, questions, and prior summaries; only save_name differs
# (discussion_1, discussion_2, ...).
with concurrent.futures.ThreadPoolExecutor() as executor:
    workflow_questions_futures = [
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=machine_learning_specialist,
            summaries=workflow_questions_prior_summaries,
            agenda=workflow_questions_agenda,
            agenda_questions=workflow_questions_questions,
            save_dir=discussions_phase_to_dir["workflow_questions"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        )
        for iteration_num in range(num_iterations)
    ]

    # Block until every meeting has finished, whether it succeeded or failed.
    concurrent.futures.wait(workflow_questions_futures)

    # wait() only waits: an exception raised inside run_meeting stays stored on
    # its future and would otherwise go unnoticed. result() re-raises it here,
    # so a failed meeting stops the notebook instead of silently leaving fewer
    # discussions for the merge step.
    for meeting_future in workflow_questions_futures:
        meeting_future.result()

In [None]:
# Workflow questions - merge
# Load every per-iteration discussion saved for this phase. The glob matches
# are sorted because directory listing order is filesystem-dependent; sorting
# gives the merge meeting its summaries in the same order on every run.
# NOTE(review): assumes discussions_phase_to_dir values are pathlib.Path
# objects (they are used with `/` and `.glob` in this notebook) - confirm.
workflow_questions_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["workflow_questions"].glob("discussion_*.json")))
print(f"Number of summaries: {len(workflow_questions_summaries)}")

# Build the merge agenda from the same agenda and questions that the
# individual discussions used.
workflow_questions_merge_prompt = create_merge_prompt(
    agenda=workflow_questions_agenda,
    agenda_questions=workflow_questions_questions,
)

# Individual meeting with the machine learning specialist over all discussion
# summaries; the result is saved as "merged" in the same phase directory.
run_meeting(
    meeting_type="individual",
    team_member=machine_learning_specialist,
    summaries=workflow_questions_summaries,
    agenda=workflow_questions_merge_prompt,
    save_dir=discussions_phase_to_dir["workflow_questions"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

## Modify AlphaFold-Multimer

In [None]:
# Modify AlphaFold-Multimer - prompts
# Agenda: a reminder of the earlier script plus REWRITE_PROMPT, followed by
# the two numbered changes to make. Adjacent literals are concatenated by
# Python; single-quoted literals are used where the text itself contains
# double quotes.
modify_alphafold_agenda = (
    f"You previously wrote a Python script that processes the outputs of AlphaFold-Multimer to calculate the confidence of nanobody-antigen complexes (see summary). {REWRITE_PROMPT}\n\n"
    '1. For each subdirectory of the "directory" variable, instead of just loading the one file that matches the pattern *unrelaxed_rank_001*.pdb, load all PDB files in that directory (there should be five). '
    "Then, compute the average ipLDDT, residue count, and atom count across those PDB files and save those average values to a CSV.\n"
    "2. For each of those subdirectories, identify the PDB file with the median ipLDDT within that directory. "
    'Create an absolute symlink with the name "median_iplddt.pdb" within that subdirectory linking to that median ipLDDT PDB file.'
)

In [None]:
# Modify AlphaFold-Multimer - discussion
# Context for the rewrite: the "improved" discussion saved in the alphafold
# phase directory (the agenda refers to it as "see summary").
modify_alphafold_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["alphafold"] / "improved.json",
    ],
)
print(f"Number of summaries: {len(modify_alphafold_summaries)}")

# Single individual meeting with the computational biologist; the result is
# saved as "modified" in the alphafold_modified phase directory.
run_meeting(
    meeting_type="individual",
    team_member=computational_biologist,
    agenda=modify_alphafold_agenda,
    summaries=modify_alphafold_summaries,
    temperature=CONSISTENT_TEMPERATURE,
    save_dir=discussions_phase_to_dir["alphafold_modified"],
    save_name="modified",
)

## Modify Rosetta

In [None]:
# Modify Rosetta - prompts
# Agenda: a reminder of the earlier XML file and Python script plus
# REWRITE_PROMPT, followed by the two numbered changes to make. Adjacent
# literals are concatenated by Python; single-quoted literals are used where
# the text itself contains double quotes.
modify_rosetta_agenda = (
    f"You previously wrote a RosettaScripts XML file to calculate the binding affinity of a nanobody-antigen complex and a Python script to aggregate multiple Rosetta binding energy score files into one CSV file (see summary). {REWRITE_PROMPT}\n\n"
    '1. Correct the RosettaScripts XML file so that the nanobody is labeled as chain "A" and the antigen is labeled as chain "B".\n'
    '2. Modify the Python script so that for each subdirectory of the "input_dir" variable, it loads all score files under that subdirectory by globbing for **/*.sc (there should be five). '
    "Then, compute the average dG_separated across those files and save those average values to a CSV."
)

In [None]:
# Modify Rosetta - discussion
# Context for the rewrite: the two "improved" discussions saved in the rosetta
# phase directory, XML first and Python second (the agenda refers to them as
# "see summary").
modify_rosetta_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["rosetta"] / "improved_xml.json",
        discussions_phase_to_dir["rosetta"] / "improved_python.json",
    ],
)
print(f"Number of summaries: {len(modify_rosetta_summaries)}")

# Single individual meeting with the computational biologist; the result is
# saved as "modified" in the rosetta_modified phase directory.
run_meeting(
    meeting_type="individual",
    team_member=computational_biologist,
    agenda=modify_rosetta_agenda,
    summaries=modify_rosetta_summaries,
    temperature=CONSISTENT_TEMPERATURE,
    save_dir=discussions_phase_to_dir["rosetta_modified"],
    save_name="modified",
)