# Prepare well-resolution TARGET2 CellProfiler features

In [25]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Not used but needed for downloads from Broad S3
import pyarrow
import fsspec
import s3fs

# ignore mix type warnings from pandas
import warnings

warnings.filterwarnings("ignore")

In [9]:
def summarize_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize NaN, positive infinity, and negative infinity occurrences in a DataFrame.

    :param df: pandas DataFrame to summarize.
    :return: pandas DataFrame summarizing NaN, positive infinity, and negative infinity counts.
    """
    nan_counts = df.isna().sum().sum()
    pos_inf_counts = (df == np.inf).sum().sum()
    neg_inf_counts = (df == -np.inf).sum().sum()

    summary_df = pd.DataFrame({
        'NaN_count': [nan_counts],
        'Pos_inf_count': [pos_inf_counts],
        'Neg_inf_count': [neg_inf_counts]
    })

    return summary_df

def impute_source(source, df):
    df_source = df[df["Metadata_Source"] == source]

    # Separate numeric and non-numeric columns
    numeric_cols = df_source.select_dtypes(include=[np.number]).columns
    non_numeric_cols = df_source.select_dtypes(exclude=[np.number]).columns
    df_numeric = df_source[numeric_cols]
    df_non_numeric = df_source[non_numeric_cols]

    df_numeric = df_numeric.replace([np.inf, -np.inf], np.nan)

    imputer = KNNImputer()

    # doesn't actually update per-column because the imputer does full-df, but that'd be slower so we don't
    with tqdm(total=len(df_numeric), desc=f"Imputing Source: {source}") as pbar:
        imputed_data = imputer.fit_transform(df_numeric)
        pbar.update(len(df_numeric))

    df_imputed_numeric = pd.DataFrame(imputed_data, columns=numeric_cols)

    # Concatenate imputed numeric columns with non-numeric columns
    df_imputed = pd.concat([df_non_numeric.reset_index(drop=True), df_imputed_numeric.reset_index(drop=True)], axis=1)

    return df_imputed

In [10]:
plates = pd.read_csv("../metadata/plate.csv.gz")
wells = pd.read_csv("../metadata/well.csv.gz")
compounds = pd.read_csv("../metadata/compound.csv.gz")

In [11]:
profile_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/profiles/"
    "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet"
)

loaddata_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/load_data_csv/"
    "{Metadata_Batch}/{Metadata_Plate}/load_data_with_illum.parquet"
)

In [12]:
target2_plates = plates.query("Metadata_PlateType == 'TARGET2'")

In [13]:
target2_plates

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType
161,source_10,2021_08_03_U2OS_48_hr_run12,Dest210726-160150,TARGET2
180,source_10,2021_08_09_U2OS_48_hr_run13,Dest210727-153003,TARGET2
195,source_10,2021_08_12_U2OS_48_hr_run15,Dest210803-153958,TARGET2
213,source_10,2021_08_17_U2OS_48_hr_run16,Dest210809-134534,TARGET2
231,source_10,2021_08_20_U2OS_48_hr_run17,Dest210810-173723,TARGET2
...,...,...,...,...
2315,source_9,20210915-Run10,GR00003310,TARGET2
2324,source_9,20210918-Run11,GR00004371,TARGET2
2341,source_9,20211013-Run14,GR00003283,TARGET2
2354,source_9,20211102-Run15,GR00004395,TARGET2


## Download well-resolution CellProfiler features for all TARGET2 plates

In [19]:
%%time

try:
    target2_features = pd.read_parquet("../data/target2_wellres_features.parquet")
except:

    dframes = []
    columns = None # so we get all features

    def fetch_parquet(row):
        s3_path = profile_formatter.format(**row._asdict())
        return pd.read_parquet(s3_path, storage_options={"anon": True}, columns=columns)

    with ThreadPoolExecutor() as executor:
        dframes = list(tqdm(executor.map(fetch_parquet, target2_plates.itertuples(index=False)), total=len(target2_plates)))

    target2_features = pd.concat(dframes)
    target2_features.to_parquet("../data/target2_wellres_features.parquet")

target2_features


  0%|          | 0/141 [00:00<?, ?it/s]

CPU times: user 1min 45s, sys: 28.2 s, total: 2min 13s
Wall time: 1min 18s


Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,source_10,Dest210726-160150,A01,3846.4,8135.6,594.88,544.30,506.29,454.05,550.31,...,52.879,49.352,50.861,51.543,50.538,51.884,52.263,52.625,52.144,53.302
1,source_10,Dest210726-160150,A02,3484.2,7088.1,541.40,539.61,458.50,455.57,499.72,...,62.107,55.949,59.508,60.461,59.221,60.675,61.428,62.315,61.100,62.480
2,source_10,Dest210726-160150,A03,3281.9,6740.3,540.91,564.09,458.88,483.44,498.92,...,54.887,50.217,52.887,53.876,52.707,53.984,54.283,55.613,54.511,55.356
3,source_10,Dest210726-160150,A04,3476.5,7233.5,554.90,491.65,472.11,406.53,512.92,...,56.545,52.062,56.238,57.078,55.913,57.400,57.872,58.614,57.834,58.898
4,source_10,Dest210726-160150,A05,3592.1,7581.8,565.46,555.42,480.46,468.08,522.43,...,64.136,58.298,62.002,63.093,61.794,63.153,63.637,64.359,63.523,64.394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,source_9,GR00004409,Z44,2501.1,4571.1,560.30,560.26,493.83,493.32,526.64,...,54.051,53.101,51.344,51.742,51.299,51.507,51.875,53.164,51.923,52.834
1532,source_9,GR00004409,Z45,2497.0,4529.4,555.90,563.43,491.44,495.61,523.31,...,52.398,50.418,49.035,49.288,49.087,49.531,49.877,51.324,49.960,51.313
1533,source_9,GR00004409,Z46,2470.0,4479.2,548.36,578.95,482.96,512.79,515.18,...,60.019,58.125,56.818,57.189,56.923,57.286,57.917,59.310,57.726,59.307
1534,source_9,GR00004409,Z47,2725.4,5025.1,553.86,565.84,485.54,495.00,519.22,...,52.872,51.857,49.610,49.757,49.552,50.035,50.196,51.243,50.180,51.654


## Add batch info again

In [20]:
target2_features = target2_features.merge(
    target2_plates,
    left_on=["Metadata_Source", "Metadata_Plate"],
    right_on=["Metadata_Source", "Metadata_Plate"],
    how="left"
).drop("Metadata_PlateType", axis=1)

### Impute missing

In [21]:
summarize_missing_values(target2_features)

Unnamed: 0,NaN_count,Pos_inf_count,Neg_inf_count
0,5261,33,0


In [24]:
%%time

sources = target2_features["Metadata_Source"].unique()
imputed_dfs = []

# Using ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=len(sources)) as executor:
    # Future to source mapping
    future_to_source = {executor.submit(impute_source, source, target2_features): source for source in sources}

    # Collecting results
    for future in as_completed(future_to_source):
        source = future_to_source[future]
        try:
            imputed_df = future.result()
            imputed_dfs.append(imputed_df)
        except Exception as e:
            print(f"Source {source} generated an exception: {e}")

target2_features_imputed = pd.concat(imputed_dfs, ignore_index=True)

Imputing Source: source_13:   0%|          | 0/2283 [00:00<?, ?it/s]

Imputing Source: source_10:   0%|          | 0/2302 [00:00<?, ?it/s]

Imputing Source: source_8:   0%|          | 0/1536 [00:00<?, ?it/s]

Imputing Source: source_11:   0%|          | 0/2684 [00:00<?, ?it/s]

Imputing Source: source_7:   0%|          | 0/2688 [00:00<?, ?it/s]

Imputing Source: source_2:   0%|          | 0/3828 [00:00<?, ?it/s]

Imputing Source: source_6:   0%|          | 0/8064 [00:00<?, ?it/s]

Imputing Source: source_3:   0%|          | 0/9599 [00:00<?, ?it/s]

Imputing Source: source_4:   0%|          | 0/8442 [00:00<?, ?it/s]

Imputing Source: source_5:   0%|          | 0/9214 [00:00<?, ?it/s]

Imputing Source: source_9:   0%|          | 0/13824 [00:00<?, ?it/s]

CPU times: user 3min 9s, sys: 10.1 s, total: 3min 19s
Wall time: 1min 33s


In [26]:
summarize_missing_values(target2_features_imputed)

Unnamed: 0,NaN_count,Pos_inf_count,Neg_inf_count
0,0,0,0


In [27]:
target2_features_imputed

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,source_8,A1170384,A01,J1,3683.200000,7920.000000,570.710000,565.870000,482.460000,478.170000,...,346.150000,332.330000,333.190000,340.070000,334.370000,339.540000,341.740000,348.540000,343.390000,348.400000
1,source_8,A1170384,A02,J1,3519.000000,7427.500000,574.880000,586.630000,488.560000,502.490000,...,339.000000,322.480000,324.810000,330.830000,326.090000,330.720000,333.160000,340.850000,334.480000,340.000000
2,source_8,A1170384,A03,J1,3551.500000,7493.600000,548.580000,547.250000,463.350000,461.680000,...,376.540000,358.890000,358.720000,365.590000,358.390000,365.570000,368.440000,376.590000,369.090000,375.590000
3,source_8,A1170384,A04,J1,3494.400000,7360.200000,561.720000,580.460000,477.120000,494.980000,...,374.860000,354.350000,355.720000,362.750000,356.280000,362.630000,366.280000,373.050000,366.880000,374.710000
4,source_8,A1170384,A05,J1,3625.800000,7620.300000,552.490000,576.340000,466.990000,489.870000,...,389.630000,375.510000,371.510000,378.740000,371.830000,378.270000,380.940000,387.840000,382.240000,388.370000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64459,source_13,CP-CC9-R7-29,P20,20221120_Run6,2735.141361,5259.052356,679.311932,572.702122,608.692202,502.787820,...,7.437796,7.252381,6.907879,7.043682,6.922703,7.048655,7.116519,7.300999,7.128542,7.313541
64460,source_13,CP-CC9-R7-29,P21,20221120_Run6,2492.848859,4878.090779,647.097909,524.178232,578.300856,458.030418,...,7.030951,6.443306,6.660565,6.791647,6.668570,6.780820,6.862487,7.036637,6.880732,6.992730
64461,source_13,CP-CC9-R7-29,P22,20221120_Run6,2701.863939,5179.042455,653.885934,579.227366,584.200512,509.318159,...,7.896472,7.706000,7.378235,7.537031,7.396785,7.522082,7.606077,7.795566,7.614485,7.796514
64462,source_13,CP-CC9-R7-29,P23,20221120_Run6,2755.432136,5216.364957,646.266520,572.009049,576.276940,501.680011,...,5.636992,5.548971,5.338362,5.446202,5.354443,5.438343,5.490677,5.610534,5.494503,5.608818


## Add metadata features

In [28]:
wc = wells.merge(compounds, on="Metadata_JCP2022", how="left")
target2_with_metadata = target2_features_imputed.merge(wc, on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"], how="left")


In [29]:
target2_with_metadata

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,...,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI
0,source_8,A1170384,A01,J1,3683.200000,7920.000000,570.710000,565.870000,482.460000,478.170000,...,340.070000,334.370000,339.540000,341.740000,348.540000,343.390000,348.400000,JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,source_8,A1170384,A02,J1,3519.000000,7427.500000,574.880000,586.630000,488.560000,502.490000,...,330.830000,326.090000,330.720000,333.160000,340.850000,334.480000,340.000000,JCP2022_050797,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,source_8,A1170384,A03,J1,3551.500000,7493.600000,548.580000,547.250000,463.350000,461.680000,...,365.590000,358.390000,365.570000,368.440000,376.590000,369.090000,375.590000,JCP2022_050997,LPYXWGMUVRGUOY-UHFFFAOYSA-N,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,source_8,A1170384,A04,J1,3494.400000,7360.200000,561.720000,580.460000,477.120000,494.980000,...,362.750000,356.280000,362.630000,366.280000,373.050000,366.880000,374.710000,JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,source_8,A1170384,A05,J1,3625.800000,7620.300000,552.490000,576.340000,466.990000,489.870000,...,378.740000,371.830000,378.270000,380.940000,387.840000,382.240000,388.370000,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64459,source_13,CP-CC9-R7-29,P20,20221120_Run6,2735.141361,5259.052356,679.311932,572.702122,608.692202,502.787820,...,7.043682,6.922703,7.048655,7.116519,7.300999,7.128542,7.313541,JCP2022_032357,HSUGRBWQSSZJOP-UHFFFAOYSA-N,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
64460,source_13,CP-CC9-R7-29,P21,20221120_Run6,2492.848859,4878.090779,647.097909,524.178232,578.300856,458.030418,...,6.791647,6.668570,6.780820,6.862487,7.036637,6.880732,6.992730,JCP2022_047545,KXBDTLQSDKGAEB-UHFFFAOYSA-N,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
64461,source_13,CP-CC9-R7-29,P22,20221120_Run6,2701.863939,5179.042455,653.885934,579.227366,584.200512,509.318159,...,7.537031,7.396785,7.522082,7.606077,7.795566,7.614485,7.796514,JCP2022_043099,JZFPYUNJRRFVQU-UHFFFAOYSA-N,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
64462,source_13,CP-CC9-R7-29,P23,20221120_Run6,2755.432136,5216.364957,646.266520,572.009049,576.276940,501.680011,...,5.446202,5.354443,5.438343,5.490677,5.610534,5.494503,5.608818,JCP2022_050516,LNFZRMDSZJCZTG-UHFFFAOYSA-N,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...


## Get standardized compound metadata

In [30]:
target2_compound_metadata = pd.read_csv(
    "../metadata/repurposing_samples_20200324_standardized.csv.gz",
    compression="gzip"
)
target2_compound_metadata = target2_compound_metadata[["InChIKey_standardized", "SMILES_standardized", "InChI_standardized", "pubchem_cid", "pert_iname"]].drop_duplicates()
target2_compound_metadata.pubchem_cid = pd.to_numeric(target2_compound_metadata.pubchem_cid, errors="coerce").fillna(0).astype(int)

print(target2_compound_metadata.shape)
target2_compound_metadata.head(3)

(6783, 5)


Unnamed: 0,InChIKey_standardized,SMILES_standardized,InChI_standardized,pubchem_cid,pert_iname
0,VMWNQDUVQKEIOC-UHFFFAOYSA-N,CN1CCc2cccc3c2C1Cc1ccc(O)c(O)c1-3,InChI=1S/C17H17NO2/c1-18-8-7-10-3-2-4-12-15(10...,6005,(R)-(-)-apomorphine
4,HJORMJIFDVBMOB-UHFFFAOYSA-N,COc1ccc(C2CNC(=O)C2)cc1OC1CCCC1,InChI=1S/C16H21NO3/c1-19-14-7-6-11(12-9-16(18)...,448055,(R)-(-)-rolipram
7,KPYSYYIEGFHWSV-UHFFFAOYSA-N,NCC(CC(=O)O)c1ccc(Cl)cc1,InChI=1S/C10H12ClNO2/c11-9-3-1-7(2-4-9)8(6-12)...,6918881,(R)-baclofen


## Add Broad TARGET2 and CLUE MOA info

In [31]:
target_moa = pd.read_csv(
    "https://raw.githubusercontent.com/jump-cellpainting/JUMP-MOA/master/JUMP-MOA_compound_metadata.tsv",
    sep="\t"
)
target_moa.pubchem_cid = pd.to_numeric(target_moa.pubchem_cid, errors="coerce").fillna(0).astype(int)

print(target_moa.shape)
target_moa.head(3)

(91, 8)


Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,moa,pert_type,control_type,smiles
0,BRD-K80935598-001-01-1,ZYVXTMKTGDARKR-UHFFFAOYSA-N,AZ191,72716071,DYRK inhibitor,trt,,COc1cc(ccc1Nc1nccc(n1)-c1cn(C)c2cnccc12)N1CCN(...
1,BRD-K85776940-001-01-9,ODADKLYLWWCHNB-LDYBVBFYSA-N,delta-Tocotrienol,5282350,HMGCR inhibitor,trt,,CC(C)=CCC\C(C)=C\CC\C(C)=C\CC[C@]1(C)CCc2cc(O)...
2,BRD-K25611237-001-02-1,QDBVSOZTVKXUES-UHFFFAOYSA-N,ML324,44143209,histone lysine demethylase inhibitor,trt,,CN(C)CCCNC(=O)c1ccc(cc1)-c1cc(O)c2ncccc2c1


In [32]:
clue_moa = pd.read_csv(
    "../metadata/repurposing_drugs_20180907.txt",
    sep="\t"
)
clue_moa

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,
2,A-1120,Preclinical,retinoid receptor ligand,RBP4,,
3,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,
4,A-33903,Phase 2,,,,
...,...,...,...,...,...,...
6120,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6121,80841-78-7,Preclinical,,,,
6122,9-aminoacridine,Preclinical,,,,
6123,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,


## Merge and deal with overlapping columns

In [33]:
# first create empty pert_iname df, then merge in metadata
df = pd.DataFrame(index=target2_compound_metadata["pert_iname"].str.lower())

target_moa["pert_iname"] = target_moa["pert_iname"].str.lower()
target_moa.set_index("pert_iname", drop=False, inplace=True, verify_integrity=True)

clue_moa["pert_iname"] = clue_moa["pert_iname"].str.lower()
clue_moa.set_index("pert_iname", drop=False, inplace=True, verify_integrity=True)


In [34]:
df_target = df.merge(target_moa, how="left", right_index=True, left_index=True)
df_target_clue = df_target.merge(clue_moa, how="left", right_index=True, left_index=True)

def merge_moa(row):
    moa_x = row["moa_x"]
    moa_y = row["moa_y"]

    if pd.isna(moa_x) and pd.isna(moa_y):
        return np.nan

    moas = set([moa for moa in [moa_x, moa_y] if not pd.isna(moa)])
    return "|".join(moas) if moas else np.nan

df_target_clue["moa"] = df_target_clue.apply(merge_moa, axis=1)
df_target_clue = df_target_clue[[
    "clinical_phase",
    "target",
    "disease_area",
    "indication",
    "moa",
]]

# Remove all NaN-rows for more efficient downstream 
df_target_clue = df_target_clue.dropna(how="all")

# Add index back as column so we can merge on it
df_target_clue.reset_index(inplace=True)

df_target_clue = df_target_clue.drop_duplicates()

df_target_clue

Unnamed: 0,pert_iname,clinical_phase,target,disease_area,indication,moa
0,"1,12-besm",Phase 2,,,,polyamine biosynthesis inhibitor
1,"1,2,3,4,5,6-hexabromocyclohexane",Preclinical,JAK2,,,JAK inhibitor
2,"1,3-dipropyl-8-phenylxanthine",Preclinical,,,,adenosine receptor antagonist
3,"1,4-butanediol",Phase 1,MAN1B1|PLA2G2A|PLA2G2E,,,benzodiazepine receptor agonist|gamma hydroxyb...
4,"1,5-dicaffeoylquinic-acid",Phase 1,,,,
...,...,...,...,...,...,...
5924,zotarolimus,Launched,FKBP1A,cardiology,coronary artery restenosis,mTOR inhibitor
5925,zotepine,Launched,ADRA2B|DRD2|DRD3|DRD4|HRH1|HTR1A|HTR1B|HTR1D|H...,neurology/psychiatry,schizophrenia,dopamine receptor antagonist|serotonin recepto...
5926,zoxazolamine,Phase 2,,,,myorelaxant
5927,zstk-474,Phase 1/Phase 2,PIK3CB|PIK3CD|PIK3CG,,,PI3K inhibitor


## Merge into used compounds

In [35]:
target2_compound_metadata["pert_iname"] = target2_compound_metadata["pert_iname"].str.lower()
target2_compound_metadata = target2_compound_metadata.drop_duplicates()
target2_compound_metadata = target2_compound_metadata.merge(
    df_target_clue,
    how="left",
    left_on="pert_iname",
    right_on="pert_iname"
)

# Have to add 'Metadata' here so that PyCytoMiner doesn't drop the cols later
target2_compound_metadata.columns = [f"Metadata_{col}" if "Metadata_" not in col else col for col in target2_compound_metadata.columns]

target2_compound_metadata

Unnamed: 0,Metadata_InChIKey_standardized,Metadata_SMILES_standardized,Metadata_InChI_standardized,Metadata_pubchem_cid,Metadata_pert_iname,Metadata_clinical_phase,Metadata_target,Metadata_disease_area,Metadata_indication,Metadata_moa
0,VMWNQDUVQKEIOC-UHFFFAOYSA-N,CN1CCc2cccc3c2C1Cc1ccc(O)c(O)c1-3,InChI=1S/C17H17NO2/c1-18-8-7-10-3-2-4-12-15(10...,6005,(r)-(-)-apomorphine,,,,,
1,HJORMJIFDVBMOB-UHFFFAOYSA-N,COc1ccc(C2CNC(=O)C2)cc1OC1CCCC1,InChI=1S/C16H21NO3/c1-19-14-7-6-11(12-9-16(18)...,448055,(r)-(-)-rolipram,,,,,
2,KPYSYYIEGFHWSV-UHFFFAOYSA-N,NCC(CC(=O)O)c1ccc(Cl)cc1,InChI=1S/C10H12ClNO2/c11-9-3-1-7(2-4-9)8(6-12)...,6918881,(r)-baclofen,,,,,
3,HJORMJIFDVBMOB-UHFFFAOYSA-N,COc1ccc(C2CNC(=O)C2)cc1OC1CCCC1,InChI=1S/C16H21NO3/c1-19-14-7-6-11(12-9-16(18)...,158758,(s)-(+)-rolipram,,,,,
4,OUPXSLGGCPUZJJ-UHFFFAOYSA-N,CC(C)CC(NC(=O)CN(C)C(=O)C(Cc1ccccc1)NC(=O)C(Cc...,InChI=1S/C64H100N18O15S/c1-38(2)34-46(57(89)74...,163829,"[sar9,met(o2)11]-substance-p",Preclinical,TACR1,,,tachykinin antagonist
...,...,...,...,...,...,...,...,...,...,...
6778,YGCODSQDUUUKIV-UHFFFAOYSA-N,N=c1[nH]c2cc(Cl)ccc2o1,InChI=1S/C7H5ClN2O/c8-4-1-2-6-5(3-4)10-7(9)11-...,6103,zoxazolamine,Phase 2,,,,myorelaxant
6779,QZWYXEBIQWJXAR-UHFFFAOYSA-N,O=C1N=C2C=CC=CN2C12Cc1ccccc1C2,InChI=1S/C15H12N2O/c18-14-15(17-8-4-3-7-13(17)...,10220323,zset1446,,,,,
6780,HGVNLRPZOWWDKD-UHFFFAOYSA-N,FC(F)c1nc2ccccc2n1-c1nc(N2CCOCC2)nc(N2CCOCC2)n1,InChI=1S/C19H21F2N7O2/c20-15(21)16-22-13-3-1-2...,11647372,zstk-474,Phase 1/Phase 2,PIK3CB|PIK3CD|PIK3CG,,,PI3K inhibitor
6781,WFPIAZLQTJBIFN-UHFFFAOYSA-N,OCCN1CCN(CCC=C2c3ccccc3Sc3ccc(Cl)cc32)CC1,InChI=1S/C22H25ClN2OS/c23-17-7-8-22-20(16-17)1...,5311507,zuclopenthixol,Launched,ADRA1A|ADRA2A|DRD1|DRD2|DRD5|HRH1|HTR2A,neurology/psychiatry,schizophrenia|bipolar disorder,dopamine receptor antagonist


In [36]:
target2_with_metadata

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,...,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI
0,source_8,A1170384,A01,J1,3683.200000,7920.000000,570.710000,565.870000,482.460000,478.170000,...,340.070000,334.370000,339.540000,341.740000,348.540000,343.390000,348.400000,JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,source_8,A1170384,A02,J1,3519.000000,7427.500000,574.880000,586.630000,488.560000,502.490000,...,330.830000,326.090000,330.720000,333.160000,340.850000,334.480000,340.000000,JCP2022_050797,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,source_8,A1170384,A03,J1,3551.500000,7493.600000,548.580000,547.250000,463.350000,461.680000,...,365.590000,358.390000,365.570000,368.440000,376.590000,369.090000,375.590000,JCP2022_050997,LPYXWGMUVRGUOY-UHFFFAOYSA-N,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,source_8,A1170384,A04,J1,3494.400000,7360.200000,561.720000,580.460000,477.120000,494.980000,...,362.750000,356.280000,362.630000,366.280000,373.050000,366.880000,374.710000,JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,source_8,A1170384,A05,J1,3625.800000,7620.300000,552.490000,576.340000,466.990000,489.870000,...,378.740000,371.830000,378.270000,380.940000,387.840000,382.240000,388.370000,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64459,source_13,CP-CC9-R7-29,P20,20221120_Run6,2735.141361,5259.052356,679.311932,572.702122,608.692202,502.787820,...,7.043682,6.922703,7.048655,7.116519,7.300999,7.128542,7.313541,JCP2022_032357,HSUGRBWQSSZJOP-UHFFFAOYSA-N,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
64460,source_13,CP-CC9-R7-29,P21,20221120_Run6,2492.848859,4878.090779,647.097909,524.178232,578.300856,458.030418,...,6.791647,6.668570,6.780820,6.862487,7.036637,6.880732,6.992730,JCP2022_047545,KXBDTLQSDKGAEB-UHFFFAOYSA-N,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
64461,source_13,CP-CC9-R7-29,P22,20221120_Run6,2701.863939,5179.042455,653.885934,579.227366,584.200512,509.318159,...,7.537031,7.396785,7.522082,7.606077,7.795566,7.614485,7.796514,JCP2022_043099,JZFPYUNJRRFVQU-UHFFFAOYSA-N,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
64462,source_13,CP-CC9-R7-29,P23,20221120_Run6,2755.432136,5216.364957,646.266520,572.009049,576.276940,501.680011,...,5.446202,5.354443,5.438343,5.490677,5.610534,5.494503,5.608818,JCP2022_050516,LNFZRMDSZJCZTG-UHFFFAOYSA-N,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...


In [37]:
target2_complete = target2_with_metadata.merge(
    target2_compound_metadata,
    left_on="Metadata_InChIKey",
    right_on="Metadata_InChIKey_standardized",
    how="left"
).drop_duplicates(subset=["Metadata_Source", "Metadata_Plate", "Metadata_Well"])

target2_complete

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,...,Metadata_InChIKey_standardized,Metadata_SMILES_standardized,Metadata_InChI_standardized,Metadata_pubchem_cid,Metadata_pert_iname,Metadata_clinical_phase,Metadata_target,Metadata_disease_area,Metadata_indication,Metadata_moa
0,source_8,A1170384,A01,J1,3683.200000,7920.000000,570.710000,565.870000,482.460000,478.170000,...,KBPLFHHGFOOTCA-UHFFFAOYSA-N,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",957,1-octanol,Phase 2,GJA1|GJA10|GJA3|GJA4|GJA5|GJA8|GJA9|GJB1|GJB2|...,,,
1,source_8,A1170384,A02,J1,3519.000000,7427.500000,574.880000,586.630000,488.560000,502.490000,...,LOUPRKONTZGTKE-UHFFFAOYSA-N,C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...,441074,quinidine,Launched,KCNA5|KCNA7|KCNH1|KCNH2|KCNH5|KCNK1|KCNK6|SCN5...,infectious disease|cardiology,malaria|atrial fibrillation (AF)|ventricular a...,sodium channel blocker
3,source_8,A1170384,A03,J1,3551.500000,7493.600000,548.580000,547.250000,463.350000,461.680000,...,LPYXWGMUVRGUOY-UHFFFAOYSA-N,OCC(O)c1oc(O)c(O)c1O,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...,0,ascorbic-acid,Launched,ALKBH2|ALKBH3|BBOX1|DBH|EGLN1|EGLN2|EGLN3|KDM5...,endocrinology,scurvy,antioxidant
4,source_8,A1170384,A04,J1,3494.400000,7360.200000,561.720000,580.460000,477.120000,494.980000,...,YGSDEFSMJLZEOE-UHFFFAOYSA-N,O=C(O)c1ccccc1O,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",53629521,bismuth-subsalicylate,Launched,,gastroenterology,diarrhea,antacid
6,source_8,A1170384,A05,J1,3625.800000,7620.300000,552.490000,576.340000,466.990000,489.870000,...,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,C[S+](C)[O-],InChI=1S/C2H6OS/c1-4(2)3/h1-2H3,21584481,dmso,Preclinical,,,,control vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67313,source_13,CP-CC9-R7-29,P20,20221120_Run6,2735.141361,5259.052356,679.311932,572.702122,608.692202,502.787820,...,HSUGRBWQSSZJOP-UHFFFAOYSA-N,COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)=O)cc1,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,39186,diltiazem,Launched,CACNA1C|CACNA1S|CACNA2D1|CACNG1|HTR3A|KCNA5,cardiology,hypertension|angina pectoris,calcium channel blocker
67314,source_13,CP-CC9-R7-29,P21,20221120_Run6,2492.848859,4878.090779,647.097909,524.178232,578.300856,458.030418,...,KXBDTLQSDKGAEB-UHFFFAOYSA-N,C=CC(=O)Nc1cccc(N=c2[nH]c(=Nc3ccc(OCCOC)cc3)[n...,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...,59174488,avl-292,Phase 2,BTK|YES1,,,Bruton's tyrosine kinase (BTK) inhibitor
67315,source_13,CP-CC9-R7-29,P22,20221120_Run6,2701.863939,5179.042455,653.885934,579.227366,584.200512,509.318159,...,JZFPYUNJRRFVQU-UHFFFAOYSA-N,O=C(O)c1ccc[nH]c1=Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",4488,niflumic-acid,Launched,ANO1|CLCN1|CLCNKA|CLCNKB|KCNQ1|PLA2G1B|PLA2G4A...,rheumatology|neurology/psychiatry,joint pain|muscle pain,cyclooxygenase inhibitor
67316,source_13,CP-CC9-R7-29,P23,20221120_Run6,2755.432136,5216.364957,646.266520,572.009049,576.276940,501.680011,...,LNFZRMDSZJCZTG-UHFFFAOYSA-N,Cc1ccc(NC(=O)Nc2cc(C(F)(F)F)ccc2F)cc1Nc1ccc2c(...,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...,59397065,gnf-5837,Preclinical,NTRK1|NTRK2|NTRK3,,,growth factor receptor inhibitor


## Merge in microscope config

In [38]:
microscope_config = pd.read_csv("../metadata/microscope_config.csv")
microscope_config["Metadata_Source"] = [f"source_{s}" for s in microscope_config.Metadata_Source.unique()]
microscope_config

Unnamed: 0,Metadata_Source,Metadata_Microscope_Name,Metadata_Widefield_vs_Confocal,Metadata_Excitation_Type,Metadata_Objective_NA,Metadata_N_Brightfield_Planes_Min,Metadata_N_Brightfield_Planes_Max,Metadata_Distance_Between_Z_Microns,Metadata_Sites_Per_Well,Metadata_Filter_Configuration
0,source_1,Opera Phenix,Widefield,Laser,1.0,1,1,,4,H
1,source_2,CV8000,Confocal,Laser,1.0,3,3,8.0,6,A
2,source_3,Opera Phenix,Widefield,Laser,1.0,0,3,5.0,9,B
3,source_4,Opera Phenix,Widefield,Laser,1.0,3,3,5.0,9,B
4,source_5,CV8000,Confocal,Laser,0.75,3,3,5.0,9,C
5,source_6,CV8000,Confocal,Laser,0.75,3,3,5.0,9,A
6,source_7,CV7000,Confocal,Laser,0.75,0,0,,9,D
7,source_8,ImageXpress Micro Confocal,Confocal,LED,0.75,0,0,3.0,9,E
8,source_9,Opera Phenix,Widefield,Laser,0.8,0,0,,4,H
9,source_10,CV8000,Confocal,Laser,0.75,3,3,5.0,6,A


In [39]:
target2_complete = target2_complete.merge(
    microscope_config,
    left_on="Metadata_Source",
    right_on="Metadata_Source",
    how="left"
)

## Save to disk

In [40]:
target2_complete.to_parquet("../data/target2_wellres_featuresimputed_druginfoadded.parquet")