# Programmatically generate samples.json files

The pipeline expects a file `snakemake/config/samples.json` that defines which data it'll download and process. This notebook illustrates how to programmatically generate such files. 

In [3]:
import json
import pandas as pd
import logging
import pathlib as pl
import requests

## Set up template files

In [4]:
# Remote locations of the metadata tables (jump-cellpainting/datasets repository).
plate_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/plate.csv.gz"
well_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/well.csv.gz"
compound_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/compound.csv.gz"
orf_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/orf.csv.gz"

# Local directory in which the downloaded tables are cached.
meta_dir = pl.Path("../metadata/")
meta_dir.mkdir(parents=True, exist_ok=True)

plate_path = meta_dir.joinpath("plate.csv.gz")
well_path = meta_dir.joinpath("well.csv.gz")
compound_path = meta_dir.joinpath("compound.csv.gz")
orf_path = meta_dir.joinpath("orf.csv.gz")

# Table name -> (local cache path, remote URL).
meta_links = {
    "plate": (plate_path, plate_link),
    "well": (well_path, well_link),
    "compound": (compound_path, compound_link),
    "orf": (orf_path, orf_link),
}

# Download every table that is not cached locally yet.
for (path, link) in meta_links.values():
    if not path.is_file():
        response = requests.get(link, timeout=60)
        # Fail loudly on HTTP errors. Without this check an error page would be
        # saved as "*.csv.gz" and, because of the is_file() guard above, never
        # be downloaded again.
        response.raise_for_status()
        path.write_bytes(response.content)

plate = pd.read_csv(meta_links["plate"][0])
well = pd.read_csv(meta_links["well"][0])
compound = pd.read_csv(meta_links["compound"][0])
orf = pd.read_csv(meta_links["orf"][0])

# Template for the S3 location of a plate's profile file; the placeholders are
# filled with str.format-style keyword arguments.
profile_formatter = "".join(
    [
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/profiles/",
        "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet",
    ]
)

# Template for the S3 location of a plate's load-data file.
loaddata_formatter = "".join(
    [
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/load_data_csv/",
        "{Metadata_Batch}/{Metadata_Plate}/load_data_with_illum.parquet",
    ]
)

# Combine plate-level and well-level metadata; with no explicit keys, pandas
# joins on the columns the two tables have in common.
df = pd.merge(plate, well)
df.head(3)

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
0,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A02,JCP2022_033924
1,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A03,JCP2022_085227
2,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A04,JCP2022_033924


## Example 1A: Minimal config file for testing: 1 well per plate
This selection corresponds to roughly 200 GB of data.

In [28]:
def get_n_random_wells(group, n, random_state=None):
    """Return the rows of ``group`` that belong to randomly drawn wells.

    Up to ``n`` rows are sampled from ``group`` without replacement, and every
    row whose ``Metadata_Well`` matches one of the sampled rows is returned.
    If a well occurs on several rows, all of its rows are kept, so the result
    can hold more than ``n`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Table with a ``Metadata_Well`` column.
    n : int
        Maximum number of rows to draw; capped at ``len(group)``.
    random_state : int, numpy.random.Generator or None, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        selection reproducible; ``None`` (default) draws a new random sample
        on every call.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``group``, in their original order.
    """
    # Cap the draw so that groups smaller than ``n`` do not make sample() raise.
    n_draws = min(len(group), n)
    selected_wells = group.sample(n=n_draws, random_state=random_state)
    return group[group["Metadata_Well"].isin(selected_wells["Metadata_Well"])]

# Draw one random well from every (source, plate) combination.
plate_groups = df.groupby(["Metadata_Source", "Metadata_Plate"], group_keys=False)
sampled_df = plate_groups.apply(get_n_random_wells, n=1)
sampled_df

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
35993,source_1,Batch4_20221012,UL000081,COMPOUND,L11,JCP2022_011032
37689,source_1,Batch4_20221012,UL000083,COMPOUND,R12,JCP2022_096694
24519,source_1,Batch3_20221010,UL000087,COMPOUND,R30,JCP2022_079144
25927,source_1,Batch3_20221010,UL000089,COMPOUND,Q12,JCP2022_112138
27030,source_1,Batch3_20221010,UL000091,COMPOUND,I11,JCP2022_014752
...,...,...,...,...,...,...
1089538,source_9,20211103-Run16,GR00004417,COMPOUND,R46,JCP2022_107288
1090108,source_9,20211103-Run16,GR00004418,COMPOUND,AC40,JCP2022_112057
1091493,source_9,20211103-Run16,GR00004419,COMPOUND,A33,JCP2022_027256
1093178,source_9,20211103-Run16,GR00004420,COMPOUND,AC38,JCP2022_020625


In [29]:
# Columns that identify a single well in the config file.
sample_columns = ["Metadata_Source", "Metadata_Batch", "Metadata_Plate", "Metadata_Well"]
selected_df = sampled_df.loc[:, sample_columns]

# Serialise with pandas and parse the result back to get a list of plain
# record dictionaries, one per selected row.
data_dict = json.loads(selected_df.to_json(orient="records"))
structured_data = {"samples": data_dict}
structured_json_str = json.dumps(structured_data, indent=4)

with open("../snakemake/config/samples.json", "w") as config_file:
    config_file.write(structured_json_str)

## Example 1B: 5 wells per plate, 3 batches per source, 3 plates per batch 

In [27]:
# For every source, take the first 3 batches and, within each batch, the first
# 3 plates (both in order of appearance), then draw 5 random wells per plate.
#
# The loop variables deliberately avoid the name `plate`, which holds the plate
# metadata table loaded earlier in the notebook. Boolean masks are used instead
# of f-string queries so identifiers containing quotes cannot break the filter.
plate_samples = []

for source_name in df["Metadata_Source"].unique():
    source_rows = df[df["Metadata_Source"] == source_name]
    for batch_name in source_rows["Metadata_Batch"].unique()[:3]:
        batch_rows = source_rows[source_rows["Metadata_Batch"] == batch_name]
        for plate_name in batch_rows["Metadata_Plate"].unique()[:3]:
            plate_rows = batch_rows[batch_rows["Metadata_Plate"] == plate_name]
            plate_samples.append(get_n_random_wells(plate_rows, 5))

sampled_df = pd.concat(plate_samples)
sampled_df

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
1164,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,T16,JCP2022_033924
435,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,D23,JCP2022_033924
201,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,AD19,JCP2022_033924
999,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,P35,JCP2022_033924
807,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,L27,JCP2022_033924
...,...,...,...,...,...,...
957695,source_9,20210831-Run7,GR00003332,COMPOUND,X11,JCP2022_058747
957056,source_9,20210831-Run7,GR00003332,COMPOUND,J44,JCP2022_095076
957712,source_9,20210831-Run7,GR00003332,COMPOUND,X28,JCP2022_038790
957497,source_9,20210831-Run7,GR00003332,COMPOUND,T05,JCP2022_075623


In [26]:
# Restrict the sampled rows to the columns that identify a well.
selected_df = sampled_df.loc[
    :, ["Metadata_Source", "Metadata_Batch", "Metadata_Plate", "Metadata_Well"]
]

# pandas produces the JSON records; parsing them back gives plain dictionaries
# that can be nested under the "samples" key.
json_str = selected_df.to_json(orient="records")
structured_data = {"samples": json.loads(json_str)}
data_dict = structured_data["samples"]
structured_json_str = json.dumps(structured_data, indent=4)

with open("../snakemake/config/samples.json", "w") as samples_file:
    samples_file.write(structured_json_str)

## Example 2: Generate source-stratified config for testing

In [16]:
# One row per (source, batch, plate) with the number of distinct wells on that
# plate; only plates with more than 250 distinct wells are kept.
grouped = df.groupby(["Metadata_Source", "Metadata_Batch", "Metadata_Plate"])
collapsed_df = (
    grouped["Metadata_Well"]
    .nunique()
    .rename("Unique_Well_Count")
    .reset_index()
    .query("Unique_Well_Count > 250")
)
collapsed_df

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Unique_Well_Count
0,source_1,Batch1_20221004,UL000109,1472
1,source_1,Batch1_20221004,UL001641,1472
2,source_1,Batch1_20221004,UL001643,1472
3,source_1,Batch1_20221004,UL001645,1472
4,source_1,Batch1_20221004,UL001651,1472
...,...,...,...,...
2373,source_9,20211103-Run16,GR00004417,1536
2374,source_9,20211103-Run16,GR00004418,1536
2375,source_9,20211103-Run16,GR00004419,1536
2376,source_9,20211103-Run16,GR00004420,1536


In [13]:
total_desired_wells = 150000  # Adjust this number as needed
num_sources = collapsed_df["Metadata_Source"].nunique()
# Every source gets the same share of the requested total.
wells_per_source = total_desired_wells // num_sources

# For each source, take plates in table order until the source's quota is
# reached or exceeded. The plate that crosses the quota is included.
source_samples = []

for _, source_plates in collapsed_df.groupby("Metadata_Source", sort=False):
    # Wells already collected from this source *before* each plate is added.
    wells_before = source_plates["Unique_Well_Count"].cumsum().shift(fill_value=0)
    keep = wells_before < wells_per_source
    # The first plate of a source is always taken, whatever the quota is.
    keep.iloc[0] = True
    source_samples.append(source_plates[keep])

# A single concat avoids growing a DataFrame row by row, which is quadratic
# and turns every column into object dtype.
sampled_df = pd.concat(source_samples, ignore_index=True)

sampled_df.groupby("Metadata_Source")['Unique_Well_Count'].sum()

Metadata_Source
source_1     13130
source_10    12672
source_11    12667
source_13    12672
source_2     12663
source_3     12672
source_4     12671
source_5     12672
source_6     12672
source_7     12666
source_8     12672
source_9     13824
Name: Unique_Well_Count, dtype: int64

In [15]:
# Whole plates are selected here, so a record carries no well identifier.
plate_columns = ["Metadata_Source", "Metadata_Batch", "Metadata_Plate"]
selected_df = sampled_df.loc[:, plate_columns]

# Turn the rows into plain record dictionaries via pandas' JSON writer.
json_str = selected_df.to_json(orient="records")
data_dict = json.loads(json_str)
structured_data = dict(samples=data_dict)
structured_json_str = json.dumps(structured_data, indent=4)

with open("../snakemake/config/samples.json", "w") as out_file:
    out_file.write(structured_json_str)