In [None]:
import os

import pandas as pd
import yaml


# Constructing single-gene spike-in simulated data
First, run the script `src/data_processing/make_perturbed_genotype_datasets.py`. This will produce a folder containing all the perturbed dataset versions.
Here, we build two config files so that we can run a wandb sweep on the perturbed data:
1. a sweep config YAML
2. a YAML that specifies the perturbed data and perturbed target pairs (analogous to the `data.yaml` we use when running on the empirical P1000 data).

### Extended control_freqs
Step 1: update the dictionary of perturbed_data file names

Step 2: create new wandb sweep YAML file

In [None]:
# ---------------------------------------------------------------------------
# Experiment selection: the cells below read WANDB_RUN and write the two
# config files named here.
# ---------------------------------------------------------------------------

# Active settings: OR x control_freq ranges on perturbed somatic_mut
WANDB_RUN = "hvu4lf2q"
perturbed_data_config_path = (
    "/mnt/disks/gmiller_data1/pnet-simu-private/configs/perturbed_data-somatic_mut-or_x_control_freqs.yaml"
)
sweep_config_path = (
    "/mnt/disks/gmiller_data1/pnet-simu-private/configs/sweep_perturbed_data-somatic_mut-or_x_control_freqs.yaml"
)

# ---------------------------------------------------------------------------
# Settings of earlier runs, kept commented out for reference.
# ---------------------------------------------------------------------------

# Reduced number of features in the perturbed somatic_mut DF, to test the
# importance of the initial feature selection in preprocessing.
# WANDB_RUN = "8b3qnf2v"  # (N.B.: actually RR not OR)
# perturbed_data_config_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/perturbed_data-somatic_mut-n_features.yaml"
# sweep_config_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/sweep_perturbed_data-somatic_mut-n_features.yaml"
#
# Alternative sweep config for the same run:
# sweep_config_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/sweep_perturbed_data-somatic_mut-n_features-only3seeds.yaml"

# Extended control_freqs on perturbed somatic_mut
# WANDB_RUN = "xiq2m609"  # (N.B.: actually RR not OR)
# perturbed_data_config_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/perturbed_data-somatic_mut-extended_control_freqs.yaml"
# sweep_config_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/sweep_perturbed_data-somatic_mut-extended_control_freqs.yaml"

# Initial OR x control_freq ranges on perturbed somatic_mut (N.B.: actually RR not OR)
# WANDB_RUN = "mqivqsdi"

In [None]:
# Build the YAML that maps every perturbation suffix to its perturbed
# data/target file pair, for the run selected by WANDB_RUN.

# Path to the perturbation summary CSV. Given as an absolute path (the same
# location the sweep-config cell reads) so this cell does not depend on the
# kernel's working directory.
summary_csv_path = f"/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-{WANDB_RUN}/perturbation_summary_{WANDB_RUN}.csv"

# Load the perturbation summary CSV
summary_df = pd.read_csv(summary_csv_path)

# Key for each row: base name of the data file, without its extension and
# with every "somatic_mut_" occurrence removed.
perturbation_suffixes = [
    os.path.splitext(os.path.basename(data_file))[0].replace("somatic_mut_", "")
    for data_file in summary_df["out_data_file"]
]

# A dict keeps only the last entry for a repeated key, which would silently
# drop datasets from the YAML. Fail loudly instead.
suffix_counts = pd.Series(perturbation_suffixes, dtype=object).value_counts()
duplicated_suffixes = sorted(suffix_counts[suffix_counts > 1].index)
if duplicated_suffixes:
    raise ValueError(
        f"Duplicate perturbation suffixes in {summary_csv_path}: {duplicated_suffixes}"
    )

# Create a dictionary for the YAML file (only file base names are stored)
perturbed_data_dict = {
    "perturbed_somatic_mut": {
        suffix: {
            "data_file": os.path.basename(data_file),
            "target_file": os.path.basename(target_file),
        }
        for suffix, data_file, target_file in zip(
            perturbation_suffixes,
            summary_df["out_data_file"],
            summary_df["out_target_file"],
        )
    }
}

# Save the dictionary to a YAML file
with open(perturbed_data_config_path, "w") as yaml_file:
    yaml.dump(perturbed_data_dict, yaml_file, default_flow_style=False)

print(f"perturbed data pairs YAML file saved to {perturbed_data_config_path}")

In [None]:
# Create wandb sweep YAML config file for the run selected by WANDB_RUN.

# --- Perturbed data produced under WANDB_RUN ---
perturbed_data_wandb_id = WANDB_RUN  # The wandb run ID for the perturbed data
perturbed_data_dir = f"/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-{WANDB_RUN}"
data_split_dir = f"/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-{WANDB_RUN}"
summary_csv_path = (
    f"/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-{WANDB_RUN}/perturbation_summary_{WANDB_RUN}.csv"
)

# --- Input data the perturbations were derived from ---
input_data_wandb_id = "u5yt90p1"  # The wandb run ID for the input data (not sure if I am using any)
input_data_dir = "/mnt/disks/gmiller_data1/pnet_germline/processed/wandb-group-data_prep_germline_tier12_and_somatic/converted-IDs-to-somatic_imputed-germline_True_imputed-somatic_False_paired-samples-True/wandb-run-id-u5yt90p1"
input_data_config_f = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/data.yaml"

summary_df = pd.read_csv(summary_csv_path)

# One grid value per row of the summary: the data file's base name without
# extension and with "somatic_mut_" removed.
perturbation_suffixes = [
    os.path.splitext(os.path.basename(data_file))[0].replace("somatic_mut_", "")
    for data_file in summary_df["out_data_file"]
]

# Parameters held constant across the sweep (emitted as {"value": ...}).
fixed_parameters = {
    "datasets": "somatic_mut",
    "perturbed_data_dir": perturbed_data_dir,
    "perturbed_data_wandb_id": perturbed_data_wandb_id,
    "perturbed_data_config_f": perturbed_data_config_path,
    "data_config_f": input_data_config_f,
    "wandb_group": "single_gene_perturbation_002",
    "wandb_project": "prostate_met_status",
    "input_data_wandb_id": input_data_wandb_id,
    "input_data_dir": input_data_dir,
    "cpus": 2,
}

# Parameters the grid iterates over (emitted as {"values": [...]}).
swept_parameters = {
    "evaluation_set": ["validation"],
    "model_type": ["rf", "pnet"],
    "seed": [123, 42, 155],
    "perturbation_suffix": perturbation_suffixes,
}

sweep_config = {
    "program": "/mnt/disks/gmiller_data1/pnet-simu-private/src/pnet/run_model_on_perturbed_data.py",
    "method": "grid",
    "parameters": {
        **{name: {"value": setting} for name, setting in fixed_parameters.items()},
        **{name: {"values": options} for name, options in swept_parameters.items()},
    },
}

# yaml.dump sorts mapping keys by default, so insertion order above does not
# affect the written file.
with open(sweep_config_path, "w") as f:
    yaml.dump(sweep_config, f)
print(f"wandb sweep YAML file saved to {sweep_config_path}")

### First sweep: reasonable range of OR and control_frequencies on somatic_mut (N.B.: this run, `mqivqsdi`, actually used RR, not OR)

In [None]:
# Create a perturbed_data.yaml config file for the perturbed data+target combinations indexed by the suffix of the data file
import os

import pandas as pd
import yaml

# Path to the perturbation summary CSV. Given as an absolute path (the same
# location the sweep-config cell for this run reads) so this cell does not
# depend on the kernel's working directory.
summary_csv_path = "/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-mqivqsdi/perturbation_summary_mqivqsdi.csv"

# Load the perturbation summary CSV
summary_df = pd.read_csv(summary_csv_path)

# Key for each row: base name of the data file, without its extension and
# with every "somatic_mut_" occurrence removed.
perturbation_suffixes = [
    os.path.splitext(os.path.basename(data_file))[0].replace("somatic_mut_", "")
    for data_file in summary_df["out_data_file"]
]

# A dict keeps only the last entry for a repeated key, which would silently
# drop datasets from the YAML. Fail loudly instead.
suffix_counts = pd.Series(perturbation_suffixes, dtype=object).value_counts()
duplicated_suffixes = sorted(suffix_counts[suffix_counts > 1].index)
if duplicated_suffixes:
    raise ValueError(
        f"Duplicate perturbation suffixes in {summary_csv_path}: {duplicated_suffixes}"
    )

# Create a dictionary for the YAML file (only file base names are stored)
perturbed_data_dict = {
    "perturbed_somatic_mut": {
        suffix: {
            "data_file": os.path.basename(data_file),
            "target_file": os.path.basename(target_file),
        }
        for suffix, data_file, target_file in zip(
            perturbation_suffixes,
            summary_df["out_data_file"],
            summary_df["out_target_file"],
        )
    }
}

# Save the dictionary to a YAML file
output_yaml_path = "/mnt/disks/gmiller_data1/pnet-simu-private/configs/perturbed_data.yaml"
with open(output_yaml_path, "w") as yaml_file:
    yaml.dump(perturbed_data_dict, yaml_file, default_flow_style=False)

print(f"YAML file saved to {output_yaml_path}")

In [None]:
# Create wandb sweep YAML config file for the first sweep (run mqivqsdi).
# `os` is imported here alongside pandas and yaml: the cell uses os.path below
# and previously relied on an earlier cell having imported it.
import os

import pandas as pd
import yaml

summary_csv_path = (
    "/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-mqivqsdi/perturbation_summary_mqivqsdi.csv"
)

data_split_dir = "/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-mqivqsdi"
perturbed_data_dir = "/mnt/disks/gmiller_data1/pnet_germline/processed/perturbed_genotype_datasets/p1000_somatic_mut/wandb-run-id-mqivqsdi"
perturbed_data_wandb_id = "mqivqsdi"  # The wandb run ID for the perturbed data
input_data_wandb_id = "u5yt90p1"  # The wandb run ID for the input data (not sure if I am using any)
input_data_dir = "/mnt/disks/gmiller_data1/pnet_germline/processed/wandb-group-data_prep_germline_tier12_and_somatic/converted-IDs-to-somatic_imputed-germline_True_imputed-somatic_False_paired-samples-True/wandb-run-id-u5yt90p1"

df = pd.read_csv(summary_csv_path)
sweep_config = {
    "program": "run_model_on_perturbed_data.py",
    "method": "grid",
    "parameters": {
        "datasets": {"value": "somatic_mut"},
        # The variables below are already strings, so they are passed directly
        # rather than through f"{...}" wrappers.
        "perturbed_data_dir": {"value": perturbed_data_dir},
        "perturbed_data_wandb_id": {"value": perturbed_data_wandb_id},
        "perturbed_data_config_f": {"value": "/mnt/disks/gmiller_data1/pnet-simu-private/configs/perturbed_data.yaml"},
        "data_config_f": {"value": "/mnt/disks/gmiller_data1/pnet-simu-private/configs/data.yaml"},
        "evaluation_set": {"values": ["validation"]},
        "model_type": {"values": ["pnet", "rf"]},
        "wandb_group": {"value": "pnet_perturbed_data"},
        "wandb_project": {"value": "prostate_met_status"},
        "seed": {"values": [123, 42, 155, 23, 356]},
        "input_data_wandb_id": {"value": input_data_wandb_id},
        "input_data_dir": {"value": input_data_dir},
        "cpus": {"value": 2},
        # One grid value per row of the summary: the data file's base name
        # without extension and with "somatic_mut_" removed.
        "perturbation_suffix": {
            "values": [
                os.path.splitext(os.path.basename(data_file))[0].replace("somatic_mut_", "")
                for data_file in df["out_data_file"]
            ]
        },
    }
}

# NOTE(review): this output path is relative to the kernel's working
# directory, unlike the absolute config paths used elsewhere in the notebook.
# Confirm the intended location before running from another directory.
with open("../configs/sweep_perturbed_somatic_mut.yaml", "w") as f:
    yaml.dump(sweep_config, f)