In [1]:
import pandas as pd
import scanpy as sc
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the per-replicate sensitivity table. Its `condition` column is split on
# "::" below: the first piece is used as the compound's broad_id and the second
# piece is parsed as a float dosage.
logfold_changes = pd.read_csv("/vevo/umair/data/sens-pred/sens-prism-sec/logfold-changes.csv")
condition_parts = [condition.split("::") for condition in logfold_changes["condition"]]
logfold_changes["broad_id"] = [parts[0] for parts in condition_parts]
logfold_changes["dosage"] = [float(parts[1]) for parts in condition_parts]

# Collapse replicates: one mean growth_rate per (compound, dosage, cell line).
replicate_keys = ["broad_id", "dosage", "cell_line"]
logfold_changes = (
    logfold_changes
    .groupby(replicate_keys)["growth_rate"]
    .mean()
    .reset_index()
)
logfold_changes

Unnamed: 0,broad_id,dosage,cell_line,growth_rate
0,BRD-A00077618-236-07-6,0.00061,ACH-000007,0.155931
1,BRD-A00077618-236-07-6,0.00061,ACH-000008,0.184596
2,BRD-A00077618-236-07-6,0.00061,ACH-000011,-0.129952
3,BRD-A00077618-236-07-6,0.00061,ACH-000012,0.264335
4,BRD-A00077618-236-07-6,0.00061,ACH-000013,0.284202
...,...,...,...,...
6039912,BRD-U45393375-000-01-6,10.00000,ACH-001239,0.187106
6039913,BRD-U45393375-000-01-6,10.00000,ACH-001306,1.033793
6039914,BRD-U45393375-000-01-6,10.00000,ACH-001307,0.429469
6039915,BRD-U45393375-000-01-6,10.00000,ACH-001318,0.417842


In [3]:
# Load compound information and fix SMILES strings.
# A `smiles` entry may hold several ", "-separated strings; only the first is
# kept. De-duplication runs AFTER that normalization so that rows which differed
# only in the discarded alternatives collapse into a single row. Otherwise a
# compound could appear more than once here and multiply its measurements when
# this table is joined on broad_id.
smiles_df = (
    pd.read_csv("/vevo/umair/data/sens-pred/sens-prism-sec/secondary-screen-replicate-collapsed-treatment-info.csv")[["broad_id", "smiles"]]
    .assign(smiles=lambda frame: frame["smiles"].apply(lambda x: x.split(", ")[0]))
    .drop_duplicates()
)

# Morgan fingerprint generator shared by every call to smiles_to_morgan
# (radius 2; the fingerprint size is not set here, so the generator's default applies).
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
def smiles_to_morgan(smiles):
    """Convert a SMILES string into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single compound.

    Returns
    -------
    numpy.ndarray
        The result of ``mfpgen.GetFingerprintAsNumPy`` for the parsed molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of raising;
    # fail here with the offending string rather than with an opaque argument
    # error from the fingerprint generator.
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES string: {smiles!r}")
    return mfpgen.GetFingerprintAsNumPy(mol)

# Fingerprint every compound once; each cell of `morgan_fp` holds the array
# returned by smiles_to_morgan for that row's SMILES.
smiles_df["morgan_fp"] = smiles_df["smiles"].map(smiles_to_morgan)
smiles_df

Unnamed: 0,broad_id,smiles,morgan_fp
0,BRD-A25234499-001-19-1,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,BRD-A70858459-001-01-7,C[C@]12CCC3C(CCc4cc(OC(=O)N(CCCl)CCCl)ccc34)C1...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16,BRD-A74914197-001-02-9,Nc1nc(N)c2nc(CC(CC#C)c3ccc(cc3)C(=O)N[C@@H](CC...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24,BRD-K02113016-001-19-6,Fc1ccc(Cc2n[nH]c(=O)c3ccccc23)cc1C(=O)N1CCN(CC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
32,BRD-K02130563-001-11-4,Cc1[nH]c2ccccc2c1CCNCc1ccc(\C=C\C(=O)NO)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
12936,BRD-K91543828-001-02-0,OC[C@H]1O[C@H](C[C@@H]1O)n1cnc2[C@H](O)CNC=Nc12,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12960,BRD-K96344439-002-02-7,CCc1ccc(Cc2ccc3CO[C@]4(O[C@H](CO)[C@@H](O)[C@H...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12984,BRD-K99879819-001-02-1,C[C@H]1Oc2cc(cnc2N)-c2c(CN(C)C(=O)c3ccc(F)cc13...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12992,BRD-K06519765-065-01-6,CC[C@]1(O)C[C@@H]2C[N@](C1)CCc1c([nH]c3ccccc13...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [4]:
# Attach SMILES and fingerprints to every measurement. The left join keeps all
# measurements; compounds without an entry in smiles_df get NaN in the
# smiles / morgan_fp columns.
df = logfold_changes.merge(smiles_df, on="broad_id", how="left")

# A left join preserves the row count only when each matched broad_id occurs
# once in smiles_df; any growth means measurements were silently duplicated.
assert len(df) == len(logfold_changes), (
    "merge duplicated measurements: smiles_df has several rows for at least one broad_id"
)
df

Unnamed: 0,broad_id,dosage,cell_line,growth_rate,smiles,morgan_fp
0,BRD-A00077618-236-07-6,0.00061,ACH-000007,0.155931,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,BRD-A00077618-236-07-6,0.00061,ACH-000008,0.184596,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,BRD-A00077618-236-07-6,0.00061,ACH-000011,-0.129952,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,BRD-A00077618-236-07-6,0.00061,ACH-000012,0.264335,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,BRD-A00077618-236-07-6,0.00061,ACH-000013,0.284202,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
6039912,BRD-U45393375-000-01-6,10.00000,ACH-001239,0.187106,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039913,BRD-U45393375-000-01-6,10.00000,ACH-001306,1.033793,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039914,BRD-U45393375-000-01-6,10.00000,ACH-001307,0.429469,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039915,BRD-U45393375-000-01-6,10.00000,ACH-001318,0.417842,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Keep only measurements whose cell line appears in the ModelID column of the
# obs table stored in ccle.h5ad.
valid_cell_lines = (
    sc.read_h5ad("/vevo/umair/data/sens-pred/embs/ccle.h5ad")
    .obs["ModelID"]
    .tolist()
)
has_valid_cell_line = df["cell_line"].isin(valid_cell_lines)
df = df.loc[has_valid_cell_line]
df

Unnamed: 0,broad_id,dosage,cell_line,growth_rate,smiles,morgan_fp
0,BRD-A00077618-236-07-6,0.00061,ACH-000007,0.155931,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,BRD-A00077618-236-07-6,0.00061,ACH-000011,-0.129952,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,BRD-A00077618-236-07-6,0.00061,ACH-000012,0.264335,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,BRD-A00077618-236-07-6,0.00061,ACH-000013,0.284202,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,BRD-A00077618-236-07-6,0.00061,ACH-000015,-0.081515,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
6039911,BRD-U45393375-000-01-6,10.00000,ACH-001192,,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039912,BRD-U45393375-000-01-6,10.00000,ACH-001239,0.187106,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039913,BRD-U45393375-000-01-6,10.00000,ACH-001306,1.033793,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039914,BRD-U45393375-000-01-6,10.00000,ACH-001307,0.429469,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# Drop every row with a missing value in ANY column. This removes missing
# growth rates, and also measurements whose compound got no smiles / morgan_fp
# from the left join. The index is then renumbered from 0.
df = df.dropna(axis="index", how="any")
df = df.reset_index(drop=True)
df

Unnamed: 0,broad_id,dosage,cell_line,growth_rate,smiles,morgan_fp
0,BRD-A00077618-236-07-6,0.00061,ACH-000007,0.155931,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,BRD-A00077618-236-07-6,0.00061,ACH-000011,-0.129952,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,BRD-A00077618-236-07-6,0.00061,ACH-000012,0.264335,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,BRD-A00077618-236-07-6,0.00061,ACH-000013,0.284202,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,BRD-A00077618-236-07-6,0.00061,ACH-000015,-0.081515,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
4508093,BRD-U45393375-000-01-6,10.00000,ACH-001128,0.788063,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508094,BRD-U45393375-000-01-6,10.00000,ACH-001239,0.187106,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508095,BRD-U45393375-000-01-6,10.00000,ACH-001306,1.033793,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508096,BRD-U45393375-000-01-6,10.00000,ACH-001307,0.429469,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Standardize the dosage column with a StandardScaler fitted on all remaining rows.
# NOTE(review): the fitted scaler is not saved in the cells visible here —
# confirm whether downstream code needs it to transform new dosages.
scaler = StandardScaler()
dosage_values = df[["dosage"]]
scaler.fit(dosage_values)
df["dosage"] = scaler.transform(dosage_values)
df

Unnamed: 0,broad_id,dosage,cell_line,growth_rate,smiles,morgan_fp
0,BRD-A00077618-236-07-6,-0.500688,ACH-000007,0.155931,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,BRD-A00077618-236-07-6,-0.500688,ACH-000011,-0.129952,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,BRD-A00077618-236-07-6,-0.500688,ACH-000012,0.264335,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,BRD-A00077618-236-07-6,-0.500688,ACH-000013,0.284202,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,BRD-A00077618-236-07-6,-0.500688,ACH-000015,-0.081515,Nc1nc(O)c2nc(Br)n([C@@H]3O[C@@H]4COP(O)(=O)O[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
4508093,BRD-U45393375-000-01-6,2.625970,ACH-001128,0.788063,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508094,BRD-U45393375-000-01-6,2.625970,ACH-001239,0.187106,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508095,BRD-U45393375-000-01-6,2.625970,ACH-001306,1.033793,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4508096,BRD-U45393375-000-01-6,2.625970,ACH-001307,0.429469,NCC=C.ClCC1CO1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Write the processed table to disk as a pickle file.
dataset_path = "/vevo/umair/data/sens-pred/mlp-data/dataset.pkl"
df.to_pickle(dataset_path)