In [130]:
import pandas as pd
import numpy as np
import os
import cv2
import tqdm
import glob

In [131]:
 moas = ['Aurora kinase inhibitor', 'tubulin polymerization inhibitor', 'JAK inhibitor', 'protein synthesis inhibitor', 'HDAC inhibitor', 
        'topoisomerase inhibitor', 'PARP inhibitor', 'ATPase inhibitor', 'retinoid receptor agonist', 'HSP inhibitor']

In [132]:
# Load the first-batch (SPECS935 v1) compound table from the PharmBio lab:
# the compounds that were used to perturb cells for HCI.
compounds = pd.read_csv(
    '/home/jovyan/scratch-shared/erik/from_phil/specs935-v1-compounds.csv',
    sep=',',
)
compounds.shape

(935, 39)

In [133]:
# Load the metadata of every compound listed on CLUE (gene expression data
# from LINCS). Source: https://clue.io/
clue_compounds = pd.read_csv(
    '/home/jovyan/scratch-shared/erik/from_phil/clue_compoundinfo_beta.txt',
    sep="\t",
)

## Clue Compounds

In [134]:
# Display the CLUE compound table (pandas truncates the middle rows).
clue_compounds

Unnamed: 0,pert_id,cmap_name,target,moa,canonical_smiles,inchi_key,compound_aliases
0,BRD-A08715367,L-theanine,,,CCNC(=O)CCC(N)C(O)=O,DATAGRPVKZEWHA-UHFFFAOYSA-N,l-theanine
1,BRD-A12237696,L-citrulline,,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N,l-citrulline
2,BRD-A18795974,BRD-A18795974,,,CCCN(CCC)C1CCc2ccc(O)cc2C1,BLYMJBIZMIGWFK-UHFFFAOYSA-N,7-hydroxy-DPAT
3,BRD-A27924917,BRD-A27924917,,,NCC(O)(CS(O)(=O)=O)c1ccc(Cl)cc1,WBSMZVIMANOCNX-UHFFFAOYSA-N,2-hydroxysaclofen
4,BRD-A35931254,BRD-A35931254,,,CN1CCc2cccc-3c2C1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-UHFFFAOYSA-N,r(-)-apomorphine
...,...,...,...,...,...,...,...
39316,BRD-K62685538,triptorelin,GNRHR,Gonadotropin releasing factor hormone receptor...,CC(C)C[C@H](NC(=O)[C@@H](Cc1c[nH]c2ccccc12)NC(...,VXKHXGOKWPXYNA-PGBVPBMZSA-N,
39317,BRD-K62221994,T-98475,GNRHR,Gonadotropin releasing factor hormone receptor...,CC(C)OC(=O)c1cn(Cc2c(F)cccc2F)c3sc(c(CN(C)Cc4c...,RANJJVIMTOIWIN-UHFFFAOYSA-N,
39318,BRD-K53397409,benzoic-acid,RAB9A,"Precursor for food preservatives, plasticizers...",OC(=O)c1ccccc1,WPYMKLBDIGXBTP-UHFFFAOYSA-N,
39319,BRD-A62182663,YK-4279,DHX9,Binding of RNA helicase A to the transcription...,COc1ccc(cc1)C(=O)CC1(O)C(=O)Nc2c1c(Cl)ccc2Cl,HLXSCTYHLQHQDJ-UHFFFAOYSA-N,


In [135]:
# Column dtypes and non-null counts; shows how many rows carry a target/MoA annotation.
clue_compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39321 entries, 0 to 39320
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pert_id           39321 non-null  object
 1   cmap_name         39321 non-null  object
 2   target            8046 non-null   object
 3   moa               8046 non-null   object
 4   canonical_smiles  33531 non-null  object
 5   inchi_key         26838 non-null  object
 6   compound_aliases  855 non-null    object
dtypes: object(7)
memory usage: 2.1+ MB


Looks like 39321 different molecules, but we only have 8046 with an identifiable MoA

In [136]:
# Remove the literal "BRD-" prefix that the Broad Institute puts on its
# pert_ids, so they can be compared with the prefix-less CUSTOMER_ID values.
# An anchored regex is used instead of str.lstrip('BRD-'): lstrip treats its
# argument as a set of characters and would also eat leading B/R/D/- characters
# of the identifier itself (e.g. 'BRD-B1234' -> '1234', 'DMSO' -> 'MSO').
# The anchored replacement is also idempotent, so re-running the cell is safe.
clue_compounds['pert_id'] = clue_compounds['pert_id'].str.replace(r'^BRD-', '', regex=True)

In [137]:
# First ten rows, to check that the "BRD-" prefix is gone from pert_id.
clue_compounds.iloc[:10]

Unnamed: 0,pert_id,cmap_name,target,moa,canonical_smiles,inchi_key,compound_aliases
0,A08715367,L-theanine,,,CCNC(=O)CCC(N)C(O)=O,DATAGRPVKZEWHA-UHFFFAOYSA-N,l-theanine
1,A12237696,L-citrulline,,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N,l-citrulline
2,A18795974,BRD-A18795974,,,CCCN(CCC)C1CCc2ccc(O)cc2C1,BLYMJBIZMIGWFK-UHFFFAOYSA-N,7-hydroxy-DPAT
3,A27924917,BRD-A27924917,,,NCC(O)(CS(O)(=O)=O)c1ccc(Cl)cc1,WBSMZVIMANOCNX-UHFFFAOYSA-N,2-hydroxysaclofen
4,A35931254,BRD-A35931254,,,CN1CCc2cccc-3c2C1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-UHFFFAOYSA-N,r(-)-apomorphine
5,A39230911,chlorphensin,,,NC(=O)OCC(O)COc1ccc(Cl)cc1,SKPLBLUECSEIFO-UHFFFAOYSA-N,chlorphenesin-carbamate
6,A77577770,BRD-A77577770,,,CCCCCCCCCCCCCCCC(=O)OC(CC(O)=O)C[N+](C)(C)C,XOMRRQXKHMYMOC-UHFFFAOYSA-O,palmitoylcarnitine
7,A86415025,BRD-A86415025,,,C(C(N1CCCCC1)c1ccccc1)c1ccccc1,JQWJJJYHVHNXJH-UHFFFAOYSA-N,"1-(1,2-diphenylethyl)piperidine-(+/-)"
8,K05674516,PSI-7976,,,CC(C)OC(=O)[C@H](C)N[P@@](=O)(OC[C@H]1O[C@@H](...,TTZHDVOVKQGIBA-YBSJRAAASA-N,sofosbuvir
9,K10673031,S-isopropylisothiourea,,,CC(C)SC(N)=N,XSSNABKEYXKKMK-UHFFFAOYSA-N,s-isopropylisothiourea


In [138]:
# Keep only the pert_ids, as a plain Python list.
# NOTE: this rebinds `clue_compounds` from a DataFrame to a list, so re-running
# this cell (or any earlier DataFrame cell) without reloading the table fails.
clue_compounds = clue_compounds['pert_id'].tolist()

## SPECS V1 only Compounds

In [139]:
# Keep the SPECS v1 compounds whose CUSTOMER_ID appears among the CLUE
# pert_ids, renumbering the rows from 0.
compounds_wge = compounds.loc[
    compounds["CUSTOMER_ID"].isin(clue_compounds)
].reset_index(drop=True)

In [140]:
# Display the SPECS v1 compounds that are also present on CLUE.
compounds_wge

Unnamed: 0,Library,Compound ID,Batch nr,CUSTOMER_ID,MOLFORMULA,MOLWEIGHT,NAME,VENDOR,ADD_INFO,SMILES,...,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,TOX21_ID,DILIST_ID,DILIst Classification,Routs of Administration,selected_mechanism of action (MoA),selected_mechanism
0,SPECS,CBK290537,BJ1898259,A39052811,C21H25ClFN3O3.C6H8O7,614.03,Mosapride citrate,Selleck Chemicals,112885-42-4,CCOc1cc(N)c(Cl)cc1C(=O)NCC1CN(Cc2ccc(F)cc2)CCO1,...,119583.0,144205427.0,A,A,Tox21_111874,,,,serotonin receptor agonist,aryl hydrocarbon receptor agonist
1,SPECS,CBK200938,BJ1897571,A84465106,C20H24N2O6,388.42,Nisoldipine,MedChemExpress,"CAS 63675-72-9, target Calcium Channel",O=C(C1=C(C)NC(C)=C(C(OCC(C)C)=O)C1C2=CC=CC=C2[...,...,4499.0,144205804.0,F,Fc,Tox21_112251,,,,calcium channel blocker,aryl hydrocarbon receptor agonist
2,SPECS,CBK290948,BJ1899032,K05977823,C20H23N3O2S,369.49,Tenovin-1,Selleck Chemicals,380315-80-0,CC(=O)Nc1ccc(NC(=S)NC(=O)c2ccc(cc2)C(C)(C)C)cc1,...,1013376.0,144206362.0,W,,Tox21_112809,,,,SIRT inhibitor|TP53 activator,aryl hydrocarbon receptor agonist
3,SPECS,CBK200855,BJ1895358,K09255212,C9H5ClINO,305.50,CLIOQUINOL,Microsource Discovery Systems,130-26-7,Oc1c(I)cc(Cl)c2cccnc12,...,2788.0,144203967.0,A,A,Tox21_110416,,,,chelating agent,aryl hydrocarbon receptor agonist
4,SPECS,CBK015802,BJ1895142,K17075857,C9H5Cl2NO,214.05,CHLOROXINE,Microsource Discovery Systems,773-76-2,Oc1c(Cl)cc(Cl)c2cccnc12,...,2722.0,144205047.0,A,A,Tox21_111494,,,,opioid receptor antagonist,aryl hydrocarbon receptor agonist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,SPECS,CBK291076,BJ1895025,K30126819,C4H8O6S4.2Na,326.34,DIMESNA,Microsource Discovery Systems,16208-51-8,OS(=O)(=O)CCSSCCS(O)(=O)=O,...,,,,,,,,,tubulin polymerization inhibitor,tubulin polymerization inhibitor
640,SPECS,CBK307747,BJ1894364,K61195623,C18H20O5,316.36,Combretastatin-A4,Axon Medchem BV,CAS 117048-59-6,COc1ccc(\C=C/c2cc(OC)c(OC)c(OC)c2)cc1O,...,,,,,,,,,tubulin polymerization inhibitor,tubulin polymerization inhibitor
641,SPECS,CBK307964,BJ1895783,K59753975,C43H55N5O7.H2SO4,852.03,Vindesine Sulfate,Avachem Scientific,59917-39-4,CC[C@]1(O)C[C@H]2CN(C1)CCc1c([nH]c3ccccc13)[C@...,...,,,,,,,,,tubulin polymerization inhibitor,tubulin polymerization inhibitor
642,SPECS,CBK309391,BJ1897986,K78567475,C42H68N6O6S,785.11,Dolastin-10,Shanghai PI Chemicals Ltd,NotKnown,CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@H]...,...,,,,,,,,,tubulin polymerization inhibitor,tubulin polymerization inhibitor


This means that there are 644 compounds in SPECS V1 that have both induced gene expression profiles and HCI data.

In [141]:
# First five overlapping compounds.
compounds_wge.head(5)

Unnamed: 0,Library,Compound ID,Batch nr,CUSTOMER_ID,MOLFORMULA,MOLWEIGHT,NAME,VENDOR,ADD_INFO,SMILES,...,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,TOX21_ID,DILIST_ID,DILIst Classification,Routs of Administration,selected_mechanism of action (MoA),selected_mechanism
0,SPECS,CBK290537,BJ1898259,A39052811,C21H25ClFN3O3.C6H8O7,614.03,Mosapride citrate,Selleck Chemicals,112885-42-4,CCOc1cc(N)c(Cl)cc1C(=O)NCC1CN(Cc2ccc(F)cc2)CCO1,...,119583.0,144205427.0,A,A,Tox21_111874,,,,serotonin receptor agonist,aryl hydrocarbon receptor agonist
1,SPECS,CBK200938,BJ1897571,A84465106,C20H24N2O6,388.42,Nisoldipine,MedChemExpress,"CAS 63675-72-9, target Calcium Channel",O=C(C1=C(C)NC(C)=C(C(OCC(C)C)=O)C1C2=CC=CC=C2[...,...,4499.0,144205804.0,F,Fc,Tox21_112251,,,,calcium channel blocker,aryl hydrocarbon receptor agonist
2,SPECS,CBK290948,BJ1899032,K05977823,C20H23N3O2S,369.49,Tenovin-1,Selleck Chemicals,380315-80-0,CC(=O)Nc1ccc(NC(=S)NC(=O)c2ccc(cc2)C(C)(C)C)cc1,...,1013376.0,144206362.0,W,,Tox21_112809,,,,SIRT inhibitor|TP53 activator,aryl hydrocarbon receptor agonist
3,SPECS,CBK200855,BJ1895358,K09255212,C9H5ClINO,305.5,CLIOQUINOL,Microsource Discovery Systems,130-26-7,Oc1c(I)cc(Cl)c2cccnc12,...,2788.0,144203967.0,A,A,Tox21_110416,,,,chelating agent,aryl hydrocarbon receptor agonist
4,SPECS,CBK015802,BJ1895142,K17075857,C9H5Cl2NO,214.05,CHLOROXINE,Microsource Discovery Systems,773-76-2,Oc1c(Cl)cc(Cl)c2cccnc12,...,2722.0,144205047.0,A,A,Tox21_111494,,,,opioid receptor antagonist,aryl hydrocarbon receptor agonist


In [142]:
# Number of compounds per selected MoA in the full SPECS v1 table.
compounds.loc[compounds["moa"].isin(moas), "moa"].value_counts()

HDAC inhibitor                      33
topoisomerase inhibitor             32
HSP inhibitor                       24
protein synthesis inhibitor         23
JAK inhibitor                       22
PARP inhibitor                      21
Aurora kinase inhibitor             20
tubulin polymerization inhibitor    20
retinoid receptor agonist           20
ATPase inhibitor                    19
Name: moa, dtype: int64

In [143]:
# Same count, restricted to the compounds that exist in both SPECS v1 and CLUE,
# i.e. those usable for both morphological and transcriptomic profiles.
compounds_wge.loc[compounds_wge["moa"].isin(moas), "moa"].value_counts()

HDAC inhibitor                      25
topoisomerase inhibitor             23
PARP inhibitor                      18
tubulin polymerization inhibitor    16
JAK inhibitor                       15
retinoid receptor agonist           14
HSP inhibitor                       13
Aurora kinase inhibitor             12
ATPase inhibitor                    12
protein synthesis inhibitor          8
Name: moa, dtype: int64

# Compounds from Specs1K-v2

In [144]:
# Load the second-batch (SPECS1K-v2) compound table from the PharmBio lab:
# the compounds that were used to perturb cells for HCI.
compounds2 = pd.read_csv(
    '/home/jovyan/scratch-shared/erik/from_phil/SPECS1K-v2.csv',
    sep=',',
)

In [145]:
# First five rows of the SPECS1K-v2 table.
compounds2.head(5)

Unnamed: 0,Compound ID,Batch nr,CUSTOMER_ID,MOLFORMULA,MOLWEIGHT,NAME_x,VENDOR,ADD_INFO,SMILES_x,IUPAC_NAME,...,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,NAME_y,SMILES_y,TOX21_ID,Selected_mechanism of action (MoA),selected_mechanism
0,CBK042036,BJ1899146,K95309561,C18H18O2,266.34,dienestrol,TargetMol,CAS 84-17-3,C\C=C(c1ccc(O)cc1)\C(c1ccc(O)cc1)=C\C,"4-[(E,1Z)-1-ethylidene-2-(4-hydroxyphenyl)but-...",...,84-17-3,667476.0,170465362.0,A,,Dienestrol,Oc1ccc(cc1)C(=C\C)/C(=C/C)c2ccc(O)cc2,Tox21_110378_1,estrogen receptor agonist,agonists of the antioxidant response element (...
1,CBK290570,BJ1895339,K92428153,C23H31NO7,433.51,MYCOPHENOLATE MOFETIL,Microsource Discovery Systems,115007-34-6,COc1c(C)c2COC(=O)c2c(O)c1C\C=C(/C)CCC(=O)OCCN1...,2-morpholinoethyl (E)-6-(4-hydroxy-6-methoxy-7...,...,128794-94-5,5281078.0,170464859.0,A,,Mycophenolate mofetil,Oc3c1C(=O)OCc1c(C)c(OC)c3C\C=C(/C)CCC(=O)OCCN2...,Tox21_111686_1,dehydrogenase inhibitor|inositol monophosphata...,agonists of the antioxidant response element (...
2,CBK307944,BJ1895660,K82908348,C23H20N2O4S,420.49,darglitazone,Sanbio BV,CAS 141200-24-0,O=C(CCC1=C(C)OC(C2=CC=CC=C2)=N1)C3=CC=C(CC4SC(...,5-[[4-[3-(5-methyl-2-phenyl-oxazol-4-yl)propan...,...,141200-24-0,60870.0,170466177.0,A,A,Darglitazone,Cc3oc(nc3CCC(=O)c2ccc(CC1SC(=O)NC1=O)cc2)c4ccccc4,Tox21_113876,PPAR receptor antagonist,agonists of the antioxidant response element (...
3,CBK016703,BJ1894591,K82236179,C19H12O6,336.3,DICUMAROL,Microsource Discovery Systems,66-76-2 (acid),Oc1c(Cc2c(O)c3ccccc3oc2=O)c(=O)oc2ccccc12,4-hydroxy-3-[(4-hydroxy-2-oxo-chromen-3-yl)met...,...,66-76-2,54676038.0,170465275.0,F,,Dicumarol,OC=3c4ccccc4OC(=O)C=3CC1=C(O)c2ccccc2OC1=O,Tox21_110357_1,NADPH inhibitor,agonists of the antioxidant response element (...
4,CBK011717,BJ1894651,K82103381,C8H8N4.HCl,196.64,HYDRALAZINE HYDROCHLORIDE,Microsource Discovery Systems,86-54-4,NNc1nncc2ccccc12,phthalazin-1-ylhydrazine,...,304-20-1,9351.0,144212813.0,A,A,Hydralazine hydrochloride,Cl.NNc2nncc1ccccc12,Tox21_302496,vasodilator,agonists of the antioxidant response element (...


In [146]:
# Column dtypes and non-null counts of the SPECS1K-v2 table.
compounds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 37 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Compound ID                         999 non-null    object 
 1   Batch nr                            999 non-null    object 
 2   CUSTOMER_ID                         999 non-null    object 
 3   MOLFORMULA                          999 non-null    object 
 4   MOLWEIGHT                           999 non-null    float64
 5   NAME_x                              995 non-null    object 
 6   VENDOR                              999 non-null    object 
 7   ADD_INFO                            875 non-null    object 
 8   SMILES_x                            999 non-null    object 
 9   IUPAC_NAME                          993 non-null    object 
 10  STEREOCHEMISTRY                     251 non-null    object 
 11  pert_iname                          925 non-n

In [147]:
# (rows, columns) of the SPECS1K-v2 table.
compounds2.shape

(999, 37)

In [148]:
# Inspect the first 500 vendor-style names (NAME_x).
compounds2["NAME_x"].iloc[:500]

0                         dienestrol
1              MYCOPHENOLATE MOFETIL
2                       darglitazone
3                          DICUMAROL
4          HYDRALAZINE HYDROCHLORIDE
                   ...              
495    Tenofovir Disoproxil Fumarate
496                     ZOLMITRIPTAN
497               CEFMETAZOLE SODIUM
498                  Propentofylline
499           TIAPRIDE HYDROCHLORIDE
Name: NAME_x, Length: 500, dtype: object

In [149]:
# Inspect the second name column (NAME_y); note that it contains NaN for some rows.
compounds2.NAME_y

0                     Dienestrol
1          Mycophenolate mofetil
2                   Darglitazone
3                      Dicumarol
4      Hydralazine hydrochloride
                 ...            
994                          NaN
995                          NaN
996                          NaN
997                          NaN
998                          NaN
Name: NAME_y, Length: 999, dtype: object

In [150]:
# Inspect the first 500 pert_iname values (lower-case, hyphenated names).
compounds2["pert_iname"].iloc[:500]

0                 dienestrol
1      mycophenolate-mofetil
2               darglitazone
3                 dicoumarol
4                hydralazine
               ...          
495     tenofovir-disoproxil
496             zolmitriptan
497              cefmetazole
498          propentofylline
499                 tiapride
Name: pert_iname, Length: 500, dtype: object

In [151]:
# Inspect the first 50 IUPAC names.
compounds2["IUPAC_NAME"].iloc[:50]

0     4-[(E,1Z)-1-ethylidene-2-(4-hydroxyphenyl)but-...
1     2-morpholinoethyl (E)-6-(4-hydroxy-6-methoxy-7...
2     5-[[4-[3-(5-methyl-2-phenyl-oxazol-4-yl)propan...
3     4-hydroxy-3-[(4-hydroxy-2-oxo-chromen-3-yl)met...
4                              phthalazin-1-ylhydrazine
5             [(E)-(5-nitro-2-furyl)methyleneamino]urea
6     1-[(2,4-Dichlorophenyl)methyl]-1H-indazole-3-c...
7     1-[(E)-(5-nitro-2-furyl)methyleneamino]imidazo...
8                                  3,5-dinitrobenzamide
9                 3-[(E)-phenylazo]pyridine-2,6-diamine
10    5-fluoro-1-[(4R,5R)-4-hydroxy-5-(hydroxymethyl...
11    (E)-N-[(4-hydroxy-3-methoxy-phenyl)methyl]-8-m...
12    2-[3-[4-(3-chlorophenyl)piperazin-1-yl]propyl]...
13    N-(4-Chlorophenyl)-4-(4-pyridinylmethyl)-1-pht...
14           4-[1-ethyl-2-(4-hydroxyphenyl)butyl]phenol
15    4-[3-(difluoromethyl)-5-(3-fluoro-4-methoxy-ph...
16                    4-(dichlorosulfamoyl)benzoic acid
17    3-[(3,5-Dimethyl-1H-pyrrol-2-yl)methylene]

In [152]:
# Inspect the first 100 NAME_y values.
compounds2["NAME_y"].iloc[:100]

0                        Dienestrol
1             Mycophenolate mofetil
2                      Darglitazone
3                         Dicumarol
4         Hydralazine hydrochloride
                  ...              
95                     Lubiprostone
96                Megestrol acetate
97                      Trepibutone
98    Trifluoperazine hydrochloride
99                  Monensin sodium
Name: NAME_y, Length: 100, dtype: object

# Combining Specs1K-v2 and Specs935

In [182]:
# Compare the column names of the two tables position by position. `compounds`
# has one extra leading column, which `[1:]` skips. Some names differ (e.g.
# SMILES vs SMILES_x); this could be fixed but does not affect the steps below.
# The pairing is purely positional, so once the two schemas diverge the printed
# pairs no longer refer to the same field.
for name_pair in zip(compounds.columns[1:], compounds2.columns):
    print(name_pair)

('Compound ID', 'Compound ID')
('Batch nr', 'Batch nr')
('CUSTOMER_ID', 'CUSTOMER_ID')
('MOLFORMULA', 'MOLFORMULA')
('MOLWEIGHT', 'MOLWEIGHT')
('NAME', 'NAME_x')
('VENDOR', 'VENDOR')
('ADD_INFO', 'ADD_INFO')
('SMILES', 'SMILES_x')
('IUPAC_NAME', 'IUPAC_NAME')
('STEREOCHEMISTRY', 'STEREOCHEMISTRY')
('pert_iname', 'pert_iname')
('clinical_phase', 'clinical_phase')
('moa', 'moa')
('target', 'target')
('disease_area', 'disease_area')
('indication', 'indication')
('SAMPLE_ID', 'SAMPLE_ID')
('PROTOCOL_NAME', 'PROTOCOL_NAME')
('SAMPLE_DATA_TYPE', 'SAMPLE_DATA_TYPE')
('ASSAY_OUTCOME', 'ASSAY_OUTCOME')
('CHANNEL_OUTCOME', 'CHANNEL_OUTCOME')
('AC50', 'AC50')
('EFFICACY', 'EFFICACY')
('REPRODUCIBILITY', 'REPRODUCIBILITY')
('CURVE_RANK', 'CURVE_RANK')
('FLAG', 'FLAG')
('CAS', 'CAS')
('PUBCHEM_CID', 'PUBCHEM_CID')
('PUBCHEM_SID', 'PUBCHEM_SID')
('PURITY_RATING', 'PURITY_RATING')
('PURITY_RATING_4M', 'PURITY_RATING_4M')
('TOX21_ID', 'NAME_y')
('DILIST_ID', 'SMILES_y')
('DILIst Classification', 'TOX2

In [183]:
# Identifier, MoA, name and structure columns to carry over from each table.
# The two tables name some of these differently (NAME vs NAME_x/NAME_y,
# SMILES vs SMILES_x/SMILES_y).
a = compounds.loc[
    :, ["CUSTOMER_ID", "Batch nr", "Compound ID", "moa", "NAME", "pert_iname", "SMILES"]
]
b = compounds2.loc[
    :,
    ["CUSTOMER_ID", "Batch nr", "Compound ID", "moa", "NAME_x", "NAME_y",
     "pert_iname", "SMILES_x", "SMILES_y"],
]

In [184]:
# Stack both SPECS selections into one large dataframe. Columns that exist in
# only one of the inputs (NAME/SMILES vs NAME_x/NAME_y/SMILES_x/SMILES_y) are
# filled with NaN for the rows of the other input.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of `a` and `b` are both kept, so the labels repeat and label-based
# lookups (.loc) become ambiguous.
frames = [a, b]
compounds1_2 = pd.concat(frames, ignore_index=True)
compounds1_2.shape

(1934, 11)

Shape looks reasonable. Compounds1 (935) + Compounds2 (999) = 1934

## Looking at Proteomics Data

In [185]:
# Load the proteomics compound table (semicolon-separated file).
proteomics = pd.read_csv(
    '/home/jovyan/Tomics-CP-Chem-MoA/06_Proteomics_Models/bro.csv',
    sep=';',
)

In [186]:
# Column dtypes and non-null counts of the proteomics compound table.
proteomics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 875 entries, 0 to 874
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Compound Name            875 non-null    object
 1   Primary Target           875 non-null    object
 2   Secondary Target         363 non-null    object
 3   SMILES                   875 non-null    object
 4   MW                       875 non-null    object
 5   Formula                  875 non-null    object
 6   Screening Concentration  875 non-null    object
dtypes: object(7)
memory usage: 48.0+ KB


In [187]:
# First 50 proteomics compounds.
proteomics.head(50)

Unnamed: 0,Compound Name,Primary Target,Secondary Target,SMILES,MW,Formula,Screening Concentration
0,SR 142948,NTSR1,NTSR2,COc1cccc(c1c1cc(nn1c1ccc(cc1C(C)C)C(=O)N(CCCN(...,68538,C39H51N5O6,10uM
1,UK 356618,MMP3,MMP13;MMP9,ONC(=O)C[C@H](C(=O)N[C@@H](C(C)(C)C)C(=O)N[C@@...,55732,C34H43N3O4,10uM
2,JW 480,NCEH1,,O=C(Oc1ccccc1C(C)C)NCCc1ccc2c(c1)cccc2,33317,C22H23NO2,10uM
3,ML-265,PKM,,Nc1cccc(c1)Cn1ncc2c(c1=O)n(C)c1c2sc(c1)S(=O)C,37207,C17H16N4O2S2,10uM
4,PF-04418948,PTGER2,,COc1ccc2c(c1)ccc(c2)OCC1(CN(C1)C(=O)c1ccc(cc1)...,40913,C23H20FNO5,10uM
5,Ezatiostat,GSTP1,,CCOC(=O)[C@H](CCC(=O)N[C@H](C(=O)N[C@H](c1cccc...,52922,C27H35N3O6S,10uM
6,Nitisinone,HPD,,O=C1CCCC(=O)C1C(=O)c1ccc(cc1[N+](=O)[O-])C(F)(F)F,32905,C14H10F3NO5,10uM
7,"4,4'-Dichlorobenzil",CES1,CES5A;CES3;CES2,O=C(C(=O)c1ccc(cc1)Cl)c1ccc(cc1)Cl,27799,C14H8Cl2O2,10uM
8,Pentadecanoyl EA,GPR55,,CCCCCCCCCCCCCCC(=O)NCCO,28527,C17H35NO2,10uM
9,Ascomycin,FKBP1A,,CC[C@@H]1/C=C(\C)/C[C@H](C)C[C@H](OC)[C@H]2O[C...,79148,C43H69NO12,10uM


In [191]:
# Proteomics compounds whose SMILES string appears anywhere in the combined
# SPECS table. compounds1_2 stores structures in three columns: "SMILES" (rows
# from SPECS v1) and "SMILES_x"/"SMILES_y" (rows from SPECS1K-v2). All three
# are checked; testing only the _x/_y columns can never match a v1 compound.
# NOTE: this is an exact string comparison, so two different spellings of the
# same molecule's SMILES do not match.
proteomics_SPECS1 = proteomics[
    proteomics["SMILES"].isin(compounds1_2["SMILES"])
    | proteomics["SMILES"].isin(compounds1_2["SMILES_x"])
    | proteomics["SMILES"].isin(compounds1_2["SMILES_y"])
].reset_index(drop=True)


In [192]:
# Proteomics compounds whose name appears in any name column of the combined
# SPECS table. compounds1_2 stores names in "NAME" (rows from SPECS v1),
# "NAME_x"/"NAME_y" (rows from SPECS1K-v2) and "pert_iname" (both). "NAME" is
# included so v1 compounds can match on their vendor name, not only pert_iname.
# NOTE: the comparison is exact and case-sensitive.
proteomics_SPECS = proteomics[
    proteomics["Compound Name"].isin(compounds1_2["NAME"])
    | proteomics["Compound Name"].isin(compounds1_2["NAME_x"])
    | proteomics["Compound Name"].isin(compounds1_2["NAME_y"])
    | proteomics["Compound Name"].isin(compounds1_2["pert_iname"])
].reset_index(drop=True)


In [193]:
# Display the proteomics compounds matched by SMILES string.
proteomics_SPECS1

Unnamed: 0,Compound Name,Primary Target,Secondary Target,SMILES,MW,Formula,Screening Concentration
0,PX-12,TXNRD1,,CCC(SSC1=NC=CN1)C,18804,C7H12N2S2,10uM
1,Seratrodast,TBXA2R,,CC(C1=O)=C(C(C(C)=C1C)=O)C(C2=CC=CC=C2)CCCCCC(...,35418,C22H26O4,10uM
2,Testolactone,CYP19A1,,O=C1CC[C@]2([H])[C@@](CC[C@]3([H])[C@@]4(C)C=C...,30017,C19H24O3,10uM


Matching on SMILES strings does not work well (we get three hits instead of the 50 found by name). However, those three hits are compounds not found among the 50, which gives me some hope that more of the proteomics compounds overlap with SPECS 1 and 2 than the name matching shows.

In [161]:
# (rows, columns) of the name-matched proteomics compounds.
proteomics_SPECS.shape

(50, 8)

In [162]:
# Load the instance metadata of the Broad LINCS P100 dataset (GSE101406), tab-separated.
P100 = pd.read_csv(
    '/home/jovyan/Tomics-CP-Chem-MoA/06_Proteomics_Models/GSE101406_Broad_LINCS_P100_inst_info.txt',
    sep="\t",
)


In [163]:
# Remove the literal "BRD-" prefix from pert_id so the values can be compared
# with the prefix-less CUSTOMER_ID column of the SPECS tables.
# An anchored regex is used instead of str.lstrip('BRD-'): lstrip treats its
# argument as a set of characters and would also eat leading B/R/D/- characters
# of the identifier itself (e.g. 'BRD-B1234' -> '1234', 'DMSO' -> 'MSO').
P100['pert_id'] = P100['pert_id'].str.replace(r'^BRD-', '', regex=True)

In [164]:
# Collapse to one row per perturbagen: keep the first instance of each pert_id.
P100 = P100.drop_duplicates(subset=['pert_id'], keep='first')

In [165]:
# (rows, columns) after de-duplication, i.e. the number of distinct pert_ids.
P100.shape

(90, 12)

In [166]:
# Display the de-duplicated P100 instance table.
P100

Unnamed: 0,inst_id,cell_id,det_plate,det_well,lsm_id,pert_dose,pert_dose_unit,pert_id,pert_time,pert_time_unit,pert_iname,pert_type
0,PA5-11373-001A01,A375,P-0022,A1,LSM-36361,0.0,uM,K08970894,3,h,DMSO,ctl_vehicle
54,PA5-11373-004A04,A375,P-0022,A4,LSM-2135,3.0,uM,K64634304,3,h,tretinoin,trt_cp
72,PA5-11373-007A07,A375,P-0022,A7,LSM-42755,10.0,uM,A81177136,3,h,KN-62,trt_cp
90,PA5-11373-010A10,A375,P-0022,A10,LSM-1141,1.0,uM,K87737963,3,h,CYT387,trt_cp
106,PA5-11373-013B01,A375,P-0022,B1,LSM-6348,5.0,uM,K37798499,3,h,etoposide,trt_cp
...,...,...,...,...,...,...,...,...,...,...,...,...
1624,PA5-EC63-076G04,A375,P-0017,G4,LSM-4571,10.0,uM,K26818574,3,h,BIX-01294,trt_cp
1642,PA5-EC63-079G07,A375,P-0017,G7,LSM-6338,1.0,uM,K83189926,3,h,UNC-1215,trt_cp
1660,PA5-EC63-085H01,A375,P-0017,H1,LSM-36364,3.0,uM,A65730376,3,h,epz004777,trt_cp
1678,PA5-EC63-091H07,A375,P-0017,H7,LSM-36369,12.0,uM,K90860366,3,h,1271738-62-5,trt_cp


In [167]:
# Load the instance metadata of the Broad LINCS GCP dataset (GSE101406), tab-separated.
GCP = pd.read_csv(
    '/home/jovyan/Tomics-CP-Chem-MoA/06_Proteomics_Models/GSE101406_Broad_LINCS_GCP_inst_info 2.txt',
    sep="\t",
)


In [168]:
# Remove the literal "BRD-" prefix from pert_id so the values can be compared
# with the prefix-less CUSTOMER_ID column of the SPECS tables.
# An anchored regex is used instead of str.lstrip('BRD-'): lstrip treats its
# argument as a set of characters and would also eat leading B/R/D/- characters
# of the identifier itself (e.g. 'BRD-B1234' -> '1234', 'DMSO' -> 'MSO').
GCP['pert_id'] = GCP['pert_id'].str.replace(r'^BRD-', '', regex=True)

In [169]:
# (rows, columns) before de-duplication: one row per measured instance.
GCP.shape

(1721, 12)

In [170]:
# Collapse to one row per perturbagen: keep the first instance of each pert_id.
GCP = GCP.drop_duplicates(subset=['pert_id'], keep='first')

In [171]:
# (rows, columns) after de-duplication, i.e. the number of distinct pert_ids.
GCP.shape

(90, 12)

In [172]:
# Display the de-duplicated GCP instance table.
GCP

Unnamed: 0,inst_id,cell_id,det_plate,det_well,lsm_id,pert_dose,pert_dose_unit,pert_id,pert_time,pert_time_unit,pert_iname,pert_type
0,GA5-11373-001A01,A375,G-0022,A1,LSM-36361,0.00,uM,K08970894,24,h,DMSO,ctl_vehicle
54,GA5-11373-004A04,A375,G-0022,A4,LSM-2135,3.00,uM,K64634304,24,h,tretinoin,trt_cp
72,GA5-11373-007A07,A375,G-0022,A7,LSM-42755,10.00,uM,A81177136,24,h,KN-62,trt_cp
90,GA5-11373-010A10,A375,G-0022,A10,LSM-1141,1.00,uM,K87737963,24,h,CYT387,trt_cp
108,GA5-11373-013B01,A375,G-0022,B1,LSM-6348,5.00,uM,K37798499,24,h,etoposide,trt_cp
...,...,...,...,...,...,...,...,...,...,...,...,...
1634,GA5-421DF-079G07,A375,G-0028,G7,LSM-42768,2.50,uM,K25412176,24,h,GSK-2110183,trt_cp
1652,GA5-421DF-085H01,A375,G-0028,H1,LSM-3347,0.75,uM,A11170096,24,h,pravastatin,trt_cp
1670,GA5-421DF-091H07,A375,G-0028,H7,LSM-4256,0.25,uM,K54997624,24,h,alpelisib,trt_cp
1688,GA5-421DF-094H10,A375,G-0028,H10,LSM-5975,10.00,uM,K93023739,24,h,IKK-inhibitor-X,trt_cp


In [173]:
# SPECS compounds (v1 + v2) whose CUSTOMER_ID matches a GCP pert_id.
combined_GCP = compounds1_2.loc[
    compounds1_2["CUSTOMER_ID"].isin(GCP["pert_id"])
].reset_index(drop=True)

In [174]:
# Display the SPECS compounds that also appear in the GCP dataset.
combined_GCP

Unnamed: 0,CUSTOMER_ID,Batch nr,Compound ID,moa,NAME,pert_iname,NAME_x,NAME_y
0,K80738081,BJ1897518,CBK041257,cytochrome P450 inhibitor|SIRT activator,Resveratrol,resveratrol,,
1,K37798499,BJ1897223,CBK041182,topoisomerase inhibitor,Etoposide,etoposide,,
2,K56343971,BJ1897084,CBK277976,RAF inhibitor,Vemurafenib,vemurafenib,,
3,K81418486,BJ1898152,CBK201016,HDAC inhibitor,"Vorinostat (SAHA, MK0683)",vorinostat,,
4,K17743125,BJ1898165,CBK277957,HDAC inhibitor,Belinostat (PXD101),belinostat,,
5,K52313696,BJ1898688,CBK277961,HDAC inhibitor,CI994 (Tacedinaline),tacedinaline,,
6,K61688984,BJ1898934,CBK303885,HDAC inhibitor,RGFP966,RGFP966,,
7,K53972329,BJ1898256,CBK277968,JAK inhibitor,Ruxolitinib (INCB018424),ruxolitinib,,
8,K87737963,BJ1898441,CBK278067,JAK inhibitor,Momelotinib (CYT387),cyt387,,
9,K79254416,BJ1895936,CBK201329,DNA methyltransferase inhibitor,decitabine,decitabine,,


In [175]:
# SPECS compounds (v1 + v2) whose CUSTOMER_ID matches a P100 pert_id.
combined_P100 = compounds1_2.loc[
    compounds1_2["CUSTOMER_ID"].isin(P100["pert_id"])
].reset_index(drop=True)

In [176]:
# Display the SPECS compounds that also appear in the P100 dataset.
combined_P100

Unnamed: 0,CUSTOMER_ID,Batch nr,Compound ID,moa,NAME,pert_iname,NAME_x,NAME_y
0,K80738081,BJ1897518,CBK041257,cytochrome P450 inhibitor|SIRT activator,Resveratrol,resveratrol,,
1,K37798499,BJ1897223,CBK041182,topoisomerase inhibitor,Etoposide,etoposide,,
2,K56343971,BJ1897084,CBK277976,RAF inhibitor,Vemurafenib,vemurafenib,,
3,K81418486,BJ1898152,CBK201016,HDAC inhibitor,"Vorinostat (SAHA, MK0683)",vorinostat,,
4,K17743125,BJ1898165,CBK277957,HDAC inhibitor,Belinostat (PXD101),belinostat,,
5,K52313696,BJ1898688,CBK277961,HDAC inhibitor,CI994 (Tacedinaline),tacedinaline,,
6,K61688984,BJ1898934,CBK303885,HDAC inhibitor,RGFP966,RGFP966,,
7,K53972329,BJ1898256,CBK277968,JAK inhibitor,Ruxolitinib (INCB018424),ruxolitinib,,
8,K87737963,BJ1898441,CBK278067,JAK inhibitor,Momelotinib (CYT387),cyt387,,
9,K79254416,BJ1895936,CBK201329,DNA methyltransferase inhibitor,decitabine,decitabine,,
