The purpose of this notebook is to save the relevant information from the W&B runs that constitute the data for the paper. The saved tables will become the Supplementary Tables.

In [None]:
# Import Required Libraries
import logging
import os
import sys

import pandas as pd

import wandb

sys.path.insert(0, "../..")  # add project_config to path
import project_config

# Logging configuration for the notebook: timestamped INFO-level records that
# include the emitting logger's name. force=True makes basicConfig replace any
# handlers already attached to the root logger, so re-running this cell
# re-applies the settings.
_LOG_SETTINGS = {
    "format": "%(asctime)s %(levelname)-8s [%(name)s] %(message)s",
    "level": logging.INFO,
    "datefmt": "%Y-%m-%d %H:%M:%S",
    "force": True,
}
logging.basicConfig(**_LOG_SETTINGS)

# Module-level logger named after this module / notebook.
logger = logging.getLogger(__name__)

In [None]:
# W&B "entity/project" path whose runs are exported by this notebook.
PROJECT_NAME = "millergw/prostate_met_status"

# Directory that receives every CSV written below (taken from project_config).
RESULTS_DIR = project_config.SUPPLEMENTARY_TABLES_DIR

# Output files, one pair per experiment:
#   *_per_run.csv   -> one row per W&B run
#   *_per_group.csv -> one row per parameter group (summary over replicates)

# Joint simulation
joint_simu_result_savepath = os.path.join(RESULTS_DIR, "joint_simulation_prediction_metrics_per_run.csv")
grouped_joint_simu_result_savepath = os.path.join(RESULTS_DIR, "joint_simulation_prediction_metrics_per_group.csv")

# Single-gene spike-in simulation
single_gene_simu_result_savepath = os.path.join(RESULTS_DIR, "single_gene_spike_in_simulation_prediction_metrics_per_run.csv")
grouped_single_gene_simu_result_savepath = os.path.join(RESULTS_DIR, "single_gene_spike_in_simulation_prediction_metrics_per_group.csv")

# Empirical P1000
p1000_result_savepath = os.path.join(RESULTS_DIR, "p1000_empirical_prediction_metrics_per_run.csv")
grouped_p1000_result_savepath = os.path.join(RESULTS_DIR, "p1000_empirical_prediction_metrics_per_group.csv")

# Create the output directory up front; a no-op when it already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Initialize the W&B API client. The cells below use `api.runs(...)` with a
# tag filter to read the runs of PROJECT_NAME; nothing in this notebook
# section writes tags back to W&B.
api = wandb.Api() 
# Data splits for which performance metrics are recorded.
_SPLITS = ("train", "validation", "test")

# Metric names as they are keyed in the W&B run summary, one entry per
# (metric, split) pair, grouped by metric.
performance_metric_columns_to_retrieve = [
    f"{split}_{metric}"
    for metric in (
        "average_precision_score",
        "roc_auc_score",
        "f1_score",
        "balanced_acc",
        "acc",
        "confusion_matrix",
    )
    for split in _SPLITS
]

# Run identifiers, data / simulation parameters and output location, followed
# by the performance metrics above.
# NOTE(review): in the visible part of this notebook this list is only logged;
# the per-experiment cells pick their fields explicitly.
columns_to_retrieve = [
    "run_id", "run_name", "model_type",
    "datasets",
    "sample_binary", "n_samples", "sigma", "odds_ratio",
    "num_class1_samples", "num_class0_samples", "control_frequency",
    "deltaMuGenes", "mod0_genes", "mod1_genes",
    "save_dir",
    *performance_metric_columns_to_retrieve,
]

logging.info(f"Retrieving data from {len(columns_to_retrieve)} columns: {columns_to_retrieve}")

# Column order of the metrics in the exported tables. Average precision
# appears here under the shorter "*_avg_precision" name, and the confusion
# matrices are left out.
performance_metric_col_order = [
    f"{split}_{metric}"
    for metric in (
        "avg_precision",
        "roc_auc_score",
        "f1_score",
        "balanced_acc",
        "acc",
    )
    for split in _SPLITS
]

In [None]:
def collapse_over_replicates(df, group_by_cols):
    """Average the numeric columns over replicate runs and attach group sizes.

    Args:
        df: DataFrame with one row per run.
        group_by_cols: Column name or list of column names that identify a
            group of replicates (e.g. runs differing only by seed).

    Returns:
        DataFrame with one row per group containing the grouping columns, the
        mean of every numeric column, and a ``count`` column holding the
        number of rows in the group. Non-numeric, non-grouping columns (run
        ids, paths, ...) are dropped because they cannot be averaged.
    """
    # Number of replicate rows in each group.
    df_counts = df.groupby(group_by_cols).size().reset_index(name="count")

    # numeric_only=True: without it, pandas >= 2.0 raises a TypeError as soon
    # as the frame contains a string/object column outside the group keys.
    df_avg = df.groupby(group_by_cols, as_index=False).mean(numeric_only=True)

    return df_avg.merge(df_counts, on=group_by_cols)

def make_grouped_summary_with_mean_and_stdev(df, param_cols, metric_cols):
    """Summarise metrics per parameter group as "mean ± std" strings.

    Args:
        df: DataFrame with one row per run.
        param_cols: List of column names defining a group.
        metric_cols: List of metric column names to summarise.

    Returns:
        DataFrame with one row per group and the columns
        ``param_cols + ["count"] + metric_cols``. ``count`` is the number of
        rows in the group; every metric column holds a string of the form
        ``"<mean> ± <std>"`` with both numbers rounded to 3 decimals.
    """
    grouped = df.groupby(param_cols)

    # Rows per group.
    group_sizes = grouped.size().reset_index(name="count")

    # Mean and standard deviation of each metric; the resulting two-level
    # column index is flattened to "<metric>_mean" / "<metric>_std".
    stats = grouped[metric_cols].agg(["mean", "std"])
    stats.columns = [f"{metric}_{stat_name}" for metric, stat_name in stats.columns]
    stats = stats.reset_index()

    summary_numeric = group_sizes.merge(stats, on=param_cols)

    # Build the display strings, one column per metric.
    formatted_columns = {}
    for metric in metric_cols:
        mean_text = summary_numeric[f"{metric}_mean"].round(3).astype(str)
        std_text = summary_numeric[f"{metric}_std"].round(3).astype(str)
        formatted_columns[metric] = mean_text + " ± " + std_text

    output_columns = param_cols + ["count"] + metric_cols
    return summary_numeric.assign(**formatted_columns)[output_columns]

# Joint sampling

In [None]:
# Export the joint simulation runs: one record per W&B run carrying the
# "2D-simu-20250822" tag, written as per-run and per-group CSV tables.
runs = api.runs(PROJECT_NAME, filters={"tags": "2D-simu-20250822"})
logging.info(f"Working on joint simulation results: {len(runs)} runs found")

all_records = []
for run in runs:
    config = run.config
    summary = run.summary

    # Key order matters: it determines the column order of the "_full" table.
    record = {
        "run_id": run.id,
        "run_name": run.name,
        "model_type": config.get("model_type"),
        "sample_binary": config.get("sample_binary"),
        "n_samples": config.get("num_samples"),
        "datasets": config.get("datasets"),
        "sigma": float(config.get("sigma")),
        "odds_ratio": float(config.get("odds_ratio")),
        "save_dir": config.get("save_dir"),
        "num_class1_samples": config.get("num_class1_samples"),
        "num_class0_samples": config.get("num_class0_samples"),
        "deltaMuGenes": config.get("deltaMuGenes"),
        "mod0_genes": config.get("mod0_genes"),
        "mod1_genes": config.get("mod1_genes"),
    }

    # Performance metrics from the run summary; average precision is stored
    # under the shorter "<split>_avg_precision" column name.
    for split in ("train", "validation", "test"):
        record.update({
            f"{split}_roc_auc_score": summary.get(f"{split}_roc_auc_score"),
            f"{split}_balanced_acc": summary.get(f"{split}_balanced_acc"),
            f"{split}_avg_precision": summary.get(f"{split}_average_precision_score"),
            f"{split}_acc": summary.get(f"{split}_acc"),
            f"{split}_f1_score": summary.get(f"{split}_f1_score"),
        })

    # Locations of the importance files inside the run's save directory.
    # NOTE(review): save_dir is used exactly as logged here, whereas the
    # single-gene and P1000 cells rewrite a "../../results/" prefix to an
    # absolute path - confirm the joint runs do not need the same rewrite.
    for split in ("validation", "test"):
        record[f"{split}_feature_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_feature_importances.csv")
        record[f"{split}_gene_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_importances.csv")

    all_records.append(record)

logging.debug("Convert to a single DataFrame")
df = pd.DataFrame(all_records)

group_by_cols = ["model_type", "odds_ratio", "sigma", "sample_binary", "n_samples"]
logging.debug(f'Add a unique group identifer column by joining together the unique identifiers: {group_by_cols}')


def _joint_group_identifier(row):
    """Build the per-row simulation-setting label."""
    # NOTE(review): model_type is in group_by_cols but not in this label, so
    # runs of different model types share an identifier - confirm intended.
    return f"OR-{row['odds_ratio']}_sigma-{row['sigma']}_nSamples-{row['n_samples']}_sampleBinary-{row['sample_binary']}"


df["group_identifier"] = df.apply(_joint_group_identifier, axis=1)

logging.debug("Changing the column order for the final, column-filtered DataFrame")
# Grouping parameters first, then metrics; the "_full" table additionally
# keeps every remaining column in its original order.
col_order = [*group_by_cols, *performance_metric_col_order]
full_cols = col_order + [name for name in df.columns if name not in col_order]

df_full = df[full_cols].copy()
df = df[col_order]

logging.debug("Make results DF collapsed over replicates")
df_group_summary = make_grouped_summary_with_mean_and_stdev(
    df,
    param_cols=group_by_cols,
    metric_cols=performance_metric_col_order,
)

logging.info(f"Saving the joint simulation results DataFrame to CSV at {joint_simu_result_savepath}")
joint_full_savepath = joint_simu_result_savepath.replace(".csv", "_full.csv")
df_full.to_csv(joint_full_savepath, float_format="%.3f", index=False)
df.to_csv(joint_simu_result_savepath, float_format="%.3f", index=False)
df_group_summary.to_csv(grouped_joint_simu_result_savepath, index=False)

# Quick visual check of both tables.
display(df.round(3).head(2))
display(df_group_summary.head(2))


# Single-gene spike-in simulations

In [None]:
# Export the single-gene spike-in simulation runs: one record per W&B run
# carrying the "1D-simu-20250822" tag, written as per-run and per-group CSVs.
runs = api.runs(PROJECT_NAME, filters={"tags": "1D-simu-20250822"})
logging.info(f"Working on single-gene simulation results: {len(runs)} runs found")

all_records = []
for run in runs:
    config = run.config
    summary = run.summary

    # Key order matters: it determines the column order of the "_full" table.
    record = {
        "run_id": run.id,
        "model_type": config.get("model_type"),
        "datasets": config.get("datasets"),
        "n_features": config.get("n_features"),
        "odds_ratio": float(config.get("odds_ratio")),
        "control_frequency": float(config.get("control_frequency")),
        # Rewrite the relative results prefix logged by the run to the
        # absolute results location (machine-specific path).
        "save_dir": config.get("save_dir").replace("../../results/", "/mnt/disks/gmiller_data1/pnet/results/"),
        "perturbation_suffix": config.get("perturbation_suffix"),
        "perturbed_data_dir": config.get("perturbed_data_dir"),
        # Target file path derived from the perturbed-data directory and suffix.
        "target_f": os.path.join(config.get("perturbed_data_dir"), f"y_{config.get('perturbation_suffix')}.csv"),
    }

    # Performance metrics from the run summary; average precision is stored
    # under the shorter "<split>_avg_precision" column name.
    for split in ("train", "validation", "test"):
        record.update({
            f"{split}_roc_auc_score": summary.get(f"{split}_roc_auc_score"),
            f"{split}_balanced_acc": summary.get(f"{split}_balanced_acc"),
            f"{split}_avg_precision": summary.get(f"{split}_average_precision_score"),
            f"{split}_acc": summary.get(f"{split}_acc"),
            f"{split}_f1_score": summary.get(f"{split}_f1_score"),
        })

    # Locations of the importance files inside the (rewritten) save directory.
    for split in ("validation", "test"):
        record[f"{split}_feature_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_feature_importances.csv")
        record[f"{split}_gene_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_importances.csv")

    all_records.append(record)

logging.debug("Convert to a single DataFrame")
df = pd.DataFrame(all_records)

logging.debug("Changing the column order for the final, column-filtered DataFrame")
# Grouping parameters first, then metrics; the "_full" table additionally
# keeps every remaining column in its original order.
group_by_cols = ["model_type", "n_features", "odds_ratio", "control_frequency"]
col_order = [*group_by_cols, *performance_metric_col_order]
full_cols = col_order + [name for name in df.columns if name not in col_order]

df_full = df[full_cols].copy()
df = df[col_order]

logging.debug("Make results DF collapsed over replicates")
df_group_summary = make_grouped_summary_with_mean_and_stdev(
    df,
    param_cols=group_by_cols,
    metric_cols=performance_metric_col_order,
)

logging.info(f"Saving the single-gene simulation results DataFrame to CSV at {single_gene_simu_result_savepath}")
single_gene_full_savepath = single_gene_simu_result_savepath.replace(".csv", "_full.csv")
df_full.to_csv(single_gene_full_savepath, float_format="%.3f", index=False)
df.to_csv(single_gene_simu_result_savepath, float_format="%.3f", index=False)
df_group_summary.to_csv(grouped_single_gene_simu_result_savepath, index=False)

# Quick visual check of both tables.
display(df.round(3).head(2))
display(df_group_summary.head(10))

# Empirical results: P1000 somatic +/- germline

In [None]:
# Export the empirical P1000 runs: one record per W&B run carrying the
# "p1000-20250822" tag, written as per-run and per-group CSV tables.
runs = api.runs(PROJECT_NAME, filters={"tags": "p1000-20250822"})
logging.info(f"Working on empirical P1000 results: {len(runs)} runs found")

all_records = []
for run in runs:
    config = run.config
    summary = run.summary

    # Key order matters: it determines the column order of the "_full" table.
    record = {
        "run_id": run.id,
        "model_type": config.get("model_type"),
        "datasets": config.get("datasets"),
        # Rewrite the relative results prefix logged by the run to the
        # absolute results location (machine-specific path).
        "save_dir": config.get("save_dir").replace("../../results/", "/mnt/disks/gmiller_data1/pnet/results/"),
        "input_data_dir": config.get("input_data_dir"),
    }

    # Performance metrics from the run summary; average precision is stored
    # under the shorter "<split>_avg_precision" column name.
    for split in ("train", "validation", "test"):
        record.update({
            f"{split}_roc_auc_score": summary.get(f"{split}_roc_auc_score"),
            f"{split}_balanced_acc": summary.get(f"{split}_balanced_acc"),
            f"{split}_avg_precision": summary.get(f"{split}_average_precision_score"),
            f"{split}_acc": summary.get(f"{split}_acc"),
            f"{split}_f1_score": summary.get(f"{split}_f1_score"),
        })

    # Locations of the importance files inside the (rewritten) save directory.
    for split in ("validation", "test"):
        record[f"{split}_feature_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_feature_importances.csv")
        record[f"{split}_gene_importances_path"] = os.path.join(record["save_dir"], f"{split}_gene_importances.csv")

    all_records.append(record)

logging.debug("Convert to a single DataFrame")
df = pd.DataFrame(all_records)

logging.debug("Changing the column order for the final, column-filtered DataFrame")
# Grouping parameters first, then metrics; the "_full" table additionally
# keeps every remaining column in its original order.
group_by_cols = ["model_type", "datasets"]
col_order = [*group_by_cols, *performance_metric_col_order]
full_cols = col_order + [name for name in df.columns if name not in col_order]

df_full = df[full_cols].copy()
df = df[col_order]

logging.debug("Make results DF collapsed over replicates")
df_group_summary = make_grouped_summary_with_mean_and_stdev(
    df,
    param_cols=group_by_cols,
    metric_cols=performance_metric_col_order,
)

logging.info(f"Saving the empirical P1000 results DataFrame to CSV at {p1000_result_savepath}")
p1000_full_savepath = p1000_result_savepath.replace(".csv", "_full.csv")
df_full.to_csv(p1000_full_savepath, float_format="%.3f", index=False)
df.to_csv(p1000_result_savepath, float_format="%.3f", index=False)
df_group_summary.to_csv(grouped_p1000_result_savepath, index=False)

# Quick visual check of both tables.
display(df.round(3).head(2))
display(df_group_summary.head(2))

In [None]:
# Sanity check: print the (rows, columns) shape of whichever tables
# df_full, df and df_group_summary currently refer to, in that order.
for table in (df_full, df, df_group_summary):
    print(table.shape)