In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
import openai
from openai import OpenAI

import concurrent.futures
import json
from pathlib import Path
import os

from virtual_lab.constants import CONSISTENT_TEMPERATURE, CREATIVE_TEMPERATURE
from virtual_lab.prompts import (
    CODING_RULES,
    REWRITE_PROMPT,
    create_merge_prompt
)
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import load_summaries

from transcriptomics_constants import (
    background_prompt,
    experimental_results_prompt,
    num_iterations,
    num_rounds,
    discussions_phase_to_dir,
    principal_investigator,
    team_members,
    scientific_critic,
    statistician,
    parasitologist,
    computational_biologist,
    software_developer
)

In [2]:

# testovací buňka moje

print("API klíč načten:", os.getenv("OPENAI_API_KEY") is not None)

api_key = os.getenv("OPENAI_API_KEY")
print("Začátek klíče:", api_key[:8], "... konec:", api_key[-4:])

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
print(client.models.list())

print(openai.__version__)

API klíč načten: True
Začátek klíče: sk-proj- ... konec: dowA
SyncPage[Model](data=[Model(id='gpt-4-0613', created=1686588896, object='model', owned_by='openai'), Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'), Model(id='gpt-3.5-turbo', created=1677610602, object='model', owned_by='openai'), Model(id='gpt-5-search-api-2025-10-14', created=1760043960, object='model', owned_by='system'), Model(id='gpt-realtime-mini', created=1759517133, object='model', owned_by='system'), Model(id='gpt-realtime-mini-2025-10-06', created=1759517175, object='model', owned_by='system'), Model(id='sora-2', created=1759708615, object='model', owned_by='system'), Model(id='sora-2-pro', created=1759708663, object='model', owned_by='system'), Model(id='davinci-002', created=1692634301, object='model', owned_by='system'), Model(id='babbage-002', created=1692634615, object='model', owned_by='system'), Model(id='gpt-3.5-turbo-instruct', created=1692901427, object='model', owned_by='system'

## Team selection

In [None]:
# Team selection - prompts, pozor, když si jednou vyberu tým, už to znova nespouštím a nechám si ty konverzace v diskuzi
team_selection_agenda = f"""
{background_prompt}

{experimental_results_prompt}

You need to select a team of four scientists to help you with this transcriptomics project. The team should deal with these analytical challenges:

- Multi-factorial statistical modeling to separate resistance from confounding factors
- RNA-seq analysis and differential expression (DESeq2, edgeR)
- Biological interpretation in the context of Giardia intestinalis biology and protozoan drug resistance
- Implementation (R/Bioconductor, Python)
- Functional annotation and characterization of putative/hypothetical proteins

NOTE: Giardia intestinalis is a unique protozoan parasite with unusual biology. Understanding gene expression changes requires expertise in parasite physiology and drug resistance mechanisms.

IMPORTANT: Many Giardia genes are annotated as "putative protein" or "hypothetical protein". The team needs expertise in:
- Protein function prediction (sequence homology, domain analysis, structural prediction)
- Comparative genomics to infer function from related organisms
- Literature mining and database searches to assign putative functions

Please select the team members in the following format. You should NOT include yourself (Principal Investigator) in the list. Write the team as a Python list of Agent objects with "model=model" as the last parameter.

Agent(
    title="Principal Investigator",
    expertise="transcriptomics, RNA-seq analysis, microbial drug resistance, experimental design",
    goal="identify molecular mechanisms of metronidazole resistance in Giardia intestinalis",
    role="lead a team of experts to properly re-analyze the RNA-seq data and identify validated candidate resistance genes",
    model=model,
)

Principal Investigator, please provide your response.
"""


In [None]:
# Team selection - discussion
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            agenda=team_selection_agenda,
            save_dir=discussions_phase_to_dir["team_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        ) for iteration_num in range(num_iterations)
    ])

In [None]:
# Team selection - merge
team_selection_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["team_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(team_selection_summaries)}")

team_selection_merge_prompt = create_merge_prompt(agenda=team_selection_agenda)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=team_selection_summaries,
    agenda=team_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["team_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

## Projects specification

In [5]:
project_specification_agenda = f"""

{background_prompt}

{experimental_results_prompt} 
Design a transcriptomic analysis plan to identify genes specifically linked to metronidazole resistance in the BER line of Giardia intestinalis. Clearly distinguish resistance-specific effects from general drug responses and baseline strain differences. Propose a statistical model (e.g. with interaction terms) to detect these effects. Prioritize candidate resistance genes for validation and link them to biological functions using functional annotation tools. Include an approach for analyzing uncharacterized (putative) proteins."""

project_specification_questions = (
    "What is the most effective approach to identify genes linked to metronidazole resistance in *Giardia intestinalis*?",
    "How can resistance-specific expression be separated from general drug response and baseline differences between strains?",
    "Is a simple comparison sufficient, or is a complex statistical model needed? Why?",
    "How should candidate genes be functionally annotated and connected to biological pathways?",
    "What strategy can identify and characterize putative (unannotated) proteins among the candidate genes?",
    "Are any additional files, metadata, or annotations needed to perform the analysis effectively?",
)


In [6]:

# Project specification - discussion
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            agenda=project_specification_agenda,
            agenda_questions=project_specification_questions,
            save_dir=discussions_phase_to_dir["project_specification"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            num_rounds=num_rounds,
        ) for iteration_num in range(num_iterations)
    ])

Rounds (+ Final Round):   0%|          | 0/4 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A



[A[A[A[A

[A[A







[A[A[A[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A

[A[A







[A[A[A[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A

[A[A







[A[A[A[A[A[A[A[A



[A[A[A[A







[A[A[A[A[A[A[A[A





[A[A[A[A[A[A

Team:  67%|██████▋   | 4/6 [01:17<00:38, 19.35s/it]
Rounds (+ Final Round):   0%|          | 0/4 [01:17<?, ?it/s]
Team:  83%|████████▎ | 5/6 [01:17<00:15, 15.45s/it]
Rounds (+ Final Round):   0%|          | 0/4 [01:17<?, ?it/s]






[A[A[A[A[A[A



[A[A[A[A

[A[A



Team: 100%|██████████| 6/6 [01:41<00:00, 16.88s/it]



Team:   0%|          | 0/6 [00:00<?, ?it/s]





Team:  83%|████████▎ | 5/6 [01:46<00:21, 21.33s/it]
Rounds (+ Final Round):   0%|        

In [None]:
# Project specification - merge
project_specification_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["project_specification"].glob("discussion_*.json")))
print(f"Number of summaries: {len(project_specification_summaries)}")

project_specification_merge_prompt = create_merge_prompt(
    agenda=project_specification_agenda,
    agenda_questions=project_specification_questions,
)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=project_specification_summaries,
    agenda=project_specification_merge_prompt,
    save_dir=discussions_phase_to_dir["project_specification"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

## Tools Selection

In [None]:
# Tools selection - prompts
tools_selection_agenda = f"""
{background_prompt}

{experimental_results_prompt}

Based on the project specification discussion, select the specific computational and bioinformatics tools needed to implement the transcriptomics analysis plan.

IMPORTANT: Consider both established tools AND recent innovations in the field. The analysis should leverage:
- Modern statistical approaches for multi-factorial RNA-seq analysis
- State-of-the-art methods for protein function prediction
- Recent advances in AI/ML for biological sequence analysis
- Novel approaches for analyzing non-model organisms like Giardia

Please list and justify your tool choices for:
- Statistical analysis and differential expression
- Functional annotation and pathway enrichment
- Protein function prediction for putative/hypothetical proteins (consider recent advances in structure and function prediction)
- Data processing and visualization
- Giardia-specific or protozoan-specific resources

For each tool, explain:
1. How it will be used in the workflow
2. Why it's appropriate for this project
3. Whether there are newer alternatives worth considering

Feel free to suggest cutting-edge tools or innovative approaches that could enhance the analysis beyond traditional methods.
"""

tools_selection_questions = (
    "What computational tools should be used for the RNA-seq analysis (include both established and innovative tools)?",
    "For each tool, how will it be specifically applied to identify metronidazole resistance mechanisms?",
    "Which pathway databases and functional annotation resources are most appropriate for Giardia?",
    "What modern approaches (including AI-powered tools) should be used for characterizing putative proteins?",
    "Are there any recent advances in protozoan genomics tools that could benefit this analysis?",
)

tools_selection_prior_summaries = load_summaries(
    discussion_paths=[discussions_phase_to_dir["project_specification"] / "merged.json"])
print(f"Number of prior summaries: {len(tools_selection_prior_summaries)}")


In [None]:
# Tools selection - discussion
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            summaries=tools_selection_prior_summaries,
            agenda=tools_selection_agenda,
            agenda_questions=tools_selection_questions,
            save_dir=discussions_phase_to_dir["tools_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            num_rounds=num_rounds,
        ) for iteration_num in range(num_iterations)
    ])

In [None]:
# Tools selection - merge
tools_selection_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["tools_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(tools_selection_summaries)}")

tools_selection_merge_prompt = create_merge_prompt(
    agenda=tools_selection_agenda,
    agenda_questions=tools_selection_questions,
)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=tools_selection_summaries,
    agenda=tools_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["tools_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

## Implementation

In [None]:
# Implementation agent selection - prompts
implementation_agent_selection_agenda = f"""
{background_prompt}

{experimental_results_prompt}

Based on the project specification and selected tools, your team needs to implement the transcriptomics analysis pipeline.

The analysis requires implementation of several major components:

1. Statistical analysis pipeline (DESeq2 multi-factorial model with interaction terms)
2. Functional annotation pipeline (ClusterProfiler, ReactomePA, pathway enrichment)
3. Putative protein characterization (AlphaFold, RoseTTAFold, domain analysis)
4. Data visualization and reporting (ggplot2, Plotly, comprehensive plots)

Please discuss and select which team member(s) will implement each component. Team members may implement multiple components, and components may be implemented collaboratively.

Consider each team member's expertise when making assignments.
"""

implementation_agent_selection_questions = (
    "Which team member(s) will implement the statistical analysis pipeline (DESeq2, multi-factorial model)?",
    "Which team member(s) will implement the functional annotation pipeline (ClusterProfiler, ReactomePA)?",
    "Which team member(s) will implement the putative protein characterization component (AlphaFold, domain analysis)?",
    "Which team member(s) will implement data visualization and reporting?",
)

implementation_agent_selection_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["team_selection"] / "merged.json",
        discussions_phase_to_dir["project_specification"] / "merged.json",
        discussions_phase_to_dir["tools_selection"] / "merged.json"
    ]
)
print(f"Number of prior summaries: {len(implementation_agent_selection_prior_summaries)}")


In [None]:
# Implementation - discussion
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            summaries=implementation_agent_selection_prior_summaries,
            agenda=implementation_agent_selection_agenda,
            agenda_questions=implementation_agent_selection_questions,
            save_dir=discussions_phase_to_dir["implementation_agent_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        ) for iteration_num in range(num_iterations)
    ])

In [None]:
# Implementation - merge
implementation_agent_selection_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["implementation_agent_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(implementation_agent_selection_summaries)}")

implementation_agent_selection_merge_prompt = create_merge_prompt(
    agenda=implementation_agent_selection_agenda,
    agenda_questions=implementation_agent_selection_questions
)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=implementation_agent_selection_summaries,
    agenda=implementation_agent_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["implementation_agent_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

## Workflow Design

In [None]:
# Workflow design - prompts
workflow_design_agenda = f"""
{background_prompt}

{experimental_results_prompt}

Based on the project specification, selected tools, and implementation assignments, design a detailed step-by-step workflow for the transcriptomics analysis.

The workflow should cover:
1. Data preparation and quality control
2. Statistical analysis (DESeq2 multi-factorial model)
3. Functional annotation (pathway enrichment)
4. Putative protein characterization
5. Visualization and reporting

Provide a clear, modular, and reproducible workflow with inputs, outputs, and quality checks for each step.
"""

workflow_design_questions = (
    "What is the complete step-by-step workflow for the analysis?",
    "What are the inputs and outputs for each major step?",
    "What quality control checks should be performed at each stage?",
    "How will the different analysis components integrate together?",
    "What are the key decision points and how should they be handled?",
)

workflow_design_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["team_selection"] / "merged.json",
        discussions_phase_to_dir["project_specification"] / "merged.json",
        discussions_phase_to_dir["tools_selection"] / "merged.json",
        discussions_phase_to_dir["implementation_agent_selection"] / "merged.json"
    ]
)
print(f"Number of prior summaries: {len(workflow_design_prior_summaries)}")



In [None]:
# Workflow design - discussion
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            agenda=workflow_design_agenda,
            agenda_questions=workflow_design_questions,
            save_dir=discussions_phase_to_dir["workflow_design"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        ) for iteration_num in range(num_iterations)
    ])

In [None]:
# Workflow design - merge
workflow_design_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["workflow_design"].glob("discussion_*.json")))
print(f"Number of summaries: {len(workflow_design_summaries)}")

workflow_design_merge_prompt = create_merge_prompt(
    agenda=workflow_design_agenda,
    agenda_questions=workflow_design_questions,
)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=workflow_design_summaries,
    agenda=workflow_design_merge_prompt,
    save_dir=discussions_phase_to_dir["workflow_design"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

## Writting Scripts

In [None]:
# ========================================
# IMPLEMENTATION PHASES
# ========================================

# DESeq2 Statistical Analysis - prompts
deseq2_analysis_agenda = f"""
{background_prompt}

{experimental_results_prompt}

You are the Bioinformatics Statistician. Based on the workflow design, write a COMPLETE R script that implements the statistical analysis pipeline.

The script must:
1. Load count matrix (experimental_data/A2_count_matrix.txt) and sample metadata
2. Create DESeq2 object with multi-factorial design: ~ genotype + treatment + genotype:treatment
3. Filter low-count genes (≥10 counts in ≥3 samples)
4. Run DESeq2 normalization and analysis
5. Define contrasts to isolate resistance-specific effects
6. Extract significant genes (FDR < 0.05, |log2FC| > 1.5)
7. Perform diagnostic checks
8. Export results to CSV files
9. Include clear comments

Write the complete R script now.
"""

deseq2_analysis_prior_summaries = load_summaries(
    discussion_paths=[
        discussions_phase_to_dir["workflow_design"] / "merged.json",
        discussions_phase_to_dir["tools_selection"] / "merged.json"
    ]
)
print(f"Number of prior summaries: {len(deseq2_analysis_prior_summaries)}")


In [None]:
# DESeq2 Statistical Analysis - implementation
with concurrent.futures.ThreadPoolExecutor() as executor:
    concurrent.futures.wait([
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=team_members[1],  # Bioinformatics Statistician
            summaries=deseq2_analysis_prior_summaries,
            agenda=deseq2_analysis_agenda,
            save_dir=discussions_phase_to_dir["deseq2_analysis"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CONSISTENT_TEMPERATURE,
        ) for iteration_num in range(num_iterations)
    ])


In [None]:
# DESeq2 Statistical Analysis - merge
deseq2_analysis_summaries = load_summaries(
    discussion_paths=list(discussions_phase_to_dir["deseq2_analysis"].glob("discussion_*.json"))
)
print(f"Number of summaries: {len(deseq2_analysis_summaries)}")

deseq2_analysis_merge_prompt = create_merge_prompt(agenda=deseq2_analysis_agenda)

run_meeting(
    meeting_type="individual",
    team_member=team_members[1],  # Bioinformatics Statistician
    summaries=deseq2_analysis_summaries,
    agenda=deseq2_analysis_merge_prompt,
    save_dir=discussions_phase_to_dir["deseq2_analysis"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)


## Virtual Lab Analysis

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

matplotlib.rcParams.update({'font.size': 26})

In [None]:
figure_dir = Path("figures/virtual_lab_analysis")
figure_dir.mkdir(parents=True, exist_ok=True)

phase_to_agent_to_word_count = {}

In [None]:
# Count words that the human user wrote
phase_to_human_words = {
    "team_selection": [
        background_prompt,
        principal_investigator.prompt,
        scientific_critic.prompt,
        team_selection_agenda.replace(f"{background_prompt} ", ""),
    ],
    "project_specification": [
        project_specification_agenda.replace(f"{background_prompt} ", ""),
        *project_specification_questions,
        nanobody_prompt,
    ],
    "tools_selection": [
        tools_selection_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        *tools_selection_questions,
    ],
    "implementation_agent_selection": [
        implementation_agent_selection_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        *implementation_agent_selection_questions,
    ],
    "esm": [
        esm_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        improve_esm_agenda.replace(f" {REWRITE_PROMPT}", ""),
    ],
    "alphafold": [
        alphafold_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        improve_alphafold_agenda.replace(f" {REWRITE_PROMPT}", ""),
    ],
    "rosetta": [
        rosetta_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        improve_rosetta_xml_agenda.replace(f" {REWRITE_PROMPT}", ""),
        improve_rosetta_python_agenda.replace(f" {REWRITE_PROMPT}", ""),
    ],
    "workflow_design": [
        workflow_design_agenda.replace(f"{background_prompt} {nanobody_prompt} ", ""),
        *workflow_design_questions,
    ],
}

for phase, human_words in phase_to_human_words.items():
    phase_to_agent_to_word_count[phase] = {"Human Researcher": len(" ".join(human_words).split())}

In [None]:
# Count words that the LLM agents wrote
for phase_name in ["team_selection", "project_specification", "tools_selection",
                   "implementation_agent_selection", "esm", "alphafold", "rosetta", "workflow_design"]:
    phase_dir = discussions_phase_to_dir[phase_name]

    print(f"Phase: {phase_name}")

    # Load the text written by each agent
    agent_to_text = {}
    for path in phase_dir.glob("*.json"):
        with open(path) as f:
            discussion = json.load(f)

        for message in discussion:
            agent_to_text.setdefault(message["agent"], []).append(message["message"])

    # Count the number of words written by each agent
    for agent, text in agent_to_text.items():
        if agent == "User":
            continue

        agent_to_text[agent] = " ".join(text)
        word_count = len(agent_to_text[agent].split())
        phase_to_agent_to_word_count[phase_name][agent] = word_count

# Print words by phase
for phase in phase_to_agent_to_word_count:
    print(f"Phase: {phase}")
    for agent, word_count in phase_to_agent_to_word_count[phase].items():
        print(f"Number of words written by {agent}: {word_count:,}")
    print()

# Sum word counts across phases
agent_to_word_count = {}
for phase in phase_to_agent_to_word_count:
    for agent, word_count in phase_to_agent_to_word_count[phase].items():
        agent_to_word_count[agent] = agent_to_word_count.get(agent, 0) + word_count

# Total number of words written by each LLM agent
for agent, word_count in agent_to_word_count.items():
    print(f"Total number of words written by {agent}: {word_count:,}")

print()

# Total number of words written by all LLM agents
total_human_words = sum(
    phase_to_agent_to_word_count[phase]["Human Researcher"] for phase in phase_to_agent_to_word_count)
total_agent_words = sum(word_count for agent, word_count in agent_to_word_count.items() if agent != "Human Researcher")

print(f"Total number of words written by Human Researcher: {total_human_words:,}")
print(f"Total number of words written by all LLM agents: {total_agent_words:,}")

In [None]:
agent_to_color = {
    agent: sns.color_palette("tab10", n_colors=len(agent_to_word_count))[i]
    for i, agent in enumerate(agent_to_word_count)
}

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.pie(
    agent_to_word_count.values(),
    labels=agent_to_word_count.keys(),
    autopct="%1.1f%%",
    colors=[agent_to_color[agent] for agent in agent_to_word_count],
)
ax.set_title(f"Words written")
plt.savefig(figure_dir / "total_words_written.pdf", bbox_inches="tight")

In [None]:
for phase in phase_to_agent_to_word_count:
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.pie(
        phase_to_agent_to_word_count[phase].values(),
        labels=phase_to_agent_to_word_count[phase].keys(),
        autopct="%1.1f%%",
        colors=[agent_to_color[agent] for agent in phase_to_agent_to_word_count[phase]],
    )
    ax.set_title(f"Words written in {phase.replace('_', ' ')}")
    plt.savefig(figure_dir / f"{phase}_words_written.pdf", bbox_inches="tight")