In [1]:
import pandas as pd
import numpy as np

from glob import glob

# Parse data

In [2]:
drugs = pd.DataFrame(columns=["drug", "SMILES"])

f = open("../data/smi_files/drugs.ism")
i = 0
for line in f:
    line = line.rstrip().split("\t")
    drugs.loc[i] = [line[1], line[0]]
    i += 1
f.close()
drugs.to_csv("../data/drugs.csv")

In [3]:
drugs.head()

Unnamed: 0,drug,SMILES
0,Acarbose,CC1C(NC2C(O)C(O)C(O)C(CO)=C2)C(O)C(O)C(OC3C(O)...
1,Acebutolol,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O
2,Aceclofenac,OC(COC(Cc1c(Nc2c(Cl)cccc2Cl)cccc1)=O)=O
3,Acenocoumarol,CC(CC(C1=C(O)c(c2OC1=O)cccc2)c3ccc([N+](=O)[O-...
4,Acetaminophen,CC(Nc1ccc(O)cc1)=O


### Targets known previously

In [4]:
drug_targets = pd.DataFrame(columns=["drug", "target", "pred"])

i=0
f = open("../data/old_targets.txt")
for line in f:
    d, t = line.split("\t")
    for target in t.split(";"):
        drug_targets.loc[i] = [d, target.strip(), "old"]
        i += 1
f.close()

drug_targets.head()

Unnamed: 0,drug,target,pred
0,Acarbose,AMY2A,old
1,Acarbose,GAA,old
2,Acarbose,AMY1A,old
3,Acarbose,SI,old
4,Acarbose,MGAM,old


### Targets found by Lounkine et al.

In [5]:
f = open("../data/new_targets.txt")
for line in f:
    d, t = line.split("\t")
    for target in t.split(";"):
        drug_targets.loc[i] = [d, target.strip(), "new"]
        i += 1
f.close()
drug_targets.tail()

Unnamed: 0,drug,target,pred
3884,Trazodone,ADRA2C,new
3885,Trazodone,DRD3,new
3886,Tripelennamine,ADRA1A,new
3887,Verapamil,ADRA2C,new
3888,Zafirlukast,PDE4D,new


In [6]:
#limit data to targets we are intrested in
targets = []
for i in glob("../data/targets/*"):
    name = i.split("/")[-1][:-4]
    targets.append(name)

In [7]:
idx = drug_targets.apply(lambda x: x["target"] in targets, axis=1)
drug_targets = drug_targets[idx]

In [8]:
drug_targets = pd.concat([drug_targets, pd.read_csv("../data/drugbank_targets.csv", index_col=0)])
drug_targets = drug_targets.drop_duplicates()
drug_targets.head()

Unnamed: 0,drug,target,pred
5,Acebutolol,ADRB2,old
6,Acebutolol,ADRB1,old
9,Acetaminophen,PTGS1,old
17,Acetaminophen,PTGS2,old
40,Acetophenazine,DRD1,old


In [9]:
np.sum(drug_targets["pred"] == "new")

150

In [10]:
np.sum(drug_targets["pred"] == "old")

1079

In [11]:
np.sum(drug_targets["pred"] == "drugbank")

82

In [12]:
drug_targets.to_csv("../data/drug_targets.csv")

# Find drugs' ChEMBL IDs

In [13]:
# data were obtained from ChEMBL with SMILES search
chembl_drugs = pd.read_csv("../data/chembl_drugs_data.csv", index_col=0)
chembl_drugs.shape

(1877, 4)

In [14]:
chembl_drugs.fillna("", inplace=True)
chembl_drugs.head()

Unnamed: 0,CHEMBLID,SYNONYMS,FULL_MOLFORMULA,CANONICAL_SMILES
0,CHEMBL190,Theophylline,C7H8N4O2,CN1C(=O)N(C)c2nc[nH]c2C1=O
1,CHEMBL181568,,C25H43NO18,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...
2,CHEMBL404271,,C25H43NO18,C[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O)[C@@H](O...
3,CHEMBL1566,Precose | Acarbose | BAY-G-5421 | BAY-G 5421 |...,C25H43NO18,C[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O)[C@@H](O...
4,CHEMBL1363358,,C25H43NO18,C[C@H]1OC(OC2C(O)[C@@H](O)C(OC3C(O)[C@@H](O)C(...


In [15]:
from pybel import readstring

drugs["fp"] = drugs.apply(lambda x: readstring("smi", x[1]).calcfp(), axis=1)    # FP2 fingerprint
drugs["form"] = drugs.apply(lambda x: readstring("smi", x[1]).formula, axis=1)   # formula

In [16]:
id_drug = []
id_chembl = []
syn = []

In [17]:
for idx, row in chembl_drugs.iterrows():
    fp1 = readstring("smi", row["CANONICAL_SMILES"]).calcfp()
    form1 = readstring("smi", row["CANONICAL_SMILES"]).formula
    for idx2, drug in drugs.iterrows():
        if (fp1 | drug[2]) == 1.0 and (form1 == drug[3]):            # structures match
            if str.lower(drug[0]) in str.lower(row["SYNONYMS"]):     # names match
                id_drug.append(idx2)
                id_chembl.append(idx)
                syn.append(True)
            else: 
                id_drug.append(idx2)
                id_chembl.append(idx)
                syn.append(False)

In [18]:
found = pd.DataFrame({"drug_id": id_drug, "chembl_id": id_chembl, "synonym": syn})
print found.shape
found.head()

(1588, 3)


Unnamed: 0,chembl_id,drug_id,synonym
0,0,593,True
1,1,0,False
2,2,0,False
3,3,0,True
4,4,0,False


In [19]:
#drugs with more than one matching compound
tmp = found[found["synonym"]].groupby("drug_id").count()[["chembl_id", "synonym"]]
print tmp[tmp["chembl_id"] > 1].shape
tmp[tmp["chembl_id"] > 1].head()

(85, 2)


Unnamed: 0_level_0,chembl_id,synonym
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1
28,2,2
40,2,2
48,3,3
57,2,2
76,2,2


In [20]:
drug_idx = tmp[tmp["chembl_id"] == 1].index
drug_idx

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            644, 645, 646, 647, 649, 650, 651, 652, 653, 655],
           dtype='int64', name=u'drug_id', length=554)

In [21]:
drugs.loc[drug_idx].head()

Unnamed: 0_level_0,drug,SMILES,fp,form
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Acarbose,CC1C(NC2C(O)C(O)C(O)C(CO)=C2)C(O)C(O)C(OC3C(O)...,"33565184, 77725696, 262144, 514, 3019898913, 4...",C25H43NO18
1,Acebutolol,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,"1600, 2147483648, 393219, 541098752, 276824192...",C18H28N2O4
2,Aceclofenac,OC(COC(Cc1c(Nc2c(Cl)cccc2Cl)cccc1)=O)=O,"570557440, 8192, 2494466, 33288, 4256, 3355444...",C16H13Cl2NO4
3,Acenocoumarol,CC(CC(C1=C(O)c(c2OC1=O)cccc2)c3ccc([N+](=O)[O-...,"67109888, 33554432, 84297730, 1660952704, 1250...",C19H15NO6
4,Acetaminophen,CC(Nc1ccc(O)cc1)=O,"512, 0, 393217, 32768, 128, 262144, 0, 0, 0, 5...",C8H9NO2


In [22]:
chembl_idx = pd.Index(found[found["synonym"]].set_index("drug_id").loc[drug_idx]["chembl_id"])
chembl_idx

Int64Index([   3,    6,    8,    9,   11,   16,   18,   20,   21,   24,
            ...
            1845, 1847, 1851, 1852, 1863, 1865, 1867, 1868, 1870, 1876],
           dtype='int64', name=u'chembl_id', length=554)

In [23]:
chembl_drugs.loc[chembl_idx].head()

Unnamed: 0_level_0,CHEMBLID,SYNONYMS,FULL_MOLFORMULA,CANONICAL_SMILES
chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,CHEMBL1566,Precose | Acarbose | BAY-G-5421 | BAY-G 5421 |...,C25H43NO18,C[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O)[C@@H](O...
6,CHEMBL642,Sectral | IL-17803A | Acebutolol | M&B-17803A ...,C18H28N2O4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
8,CHEMBL93645,Aceclofenac,C16H13Cl2NO4,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl
9,CHEMBL397420,Nicoumalone | Acenocoumarin | Acenocoumarol,C19H15NO6,CC(=O)CC(C1=C(O)c2ccccc2OC1=O)c3ccc(cc3)[N+](=...
11,CHEMBL112,Acephen | Neopap | Datril | APAP | Ofirmev | I...,C8H9NO2,CC(=O)Nc1ccc(O)cc1


In [24]:
found_drugs = drugs.loc[drug_idx][["drug", "SMILES"]]
found_drugs["chembl_id"] = np.array(chembl_drugs.loc[chembl_idx, "CHEMBLID"])
found_drugs.head()

Unnamed: 0_level_0,drug,SMILES,chembl_id
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Acarbose,CC1C(NC2C(O)C(O)C(O)C(CO)=C2)C(O)C(O)C(OC3C(O)...,CHEMBL1566
1,Acebutolol,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,CHEMBL642
2,Aceclofenac,OC(COC(Cc1c(Nc2c(Cl)cccc2Cl)cccc1)=O)=O,CHEMBL93645
3,Acenocoumarol,CC(CC(C1=C(O)c(c2OC1=O)cccc2)c3ccc([N+](=O)[O-...,CHEMBL397420
4,Acetaminophen,CC(Nc1ccc(O)cc1)=O,CHEMBL112


In [25]:
found_drugs.shape

(554, 3)

In [26]:
uncl_drug_idx = tmp[tmp["chembl_id"] > 1].index
print drugs.loc[uncl_drug_idx][["drug"]].shape
drugs.loc[uncl_drug_idx][["drug"]].head()

(85, 1)


Unnamed: 0_level_0,drug
drug_id,Unnamed: 1_level_1
28,Amlodipine
40,Apomorphine
48,Atenolol
57,Baclofen
76,Bicalutamide


In [27]:
uncl_chembl_idx = found[found["synonym"]].set_index("drug_id").loc[uncl_drug_idx]["chembl_id"]
uncl_chembl_idx = uncl_chembl_idx.reset_index()
print uncl_chembl_idx.shape
uncl_chembl_idx.head()

(204, 2)


Unnamed: 0,drug_id,chembl_id
0,28,63
1,28,64
2,40,100
3,40,101
4,48,122


In [28]:
found_drugs_multi = drugs.loc[pd.Index(uncl_chembl_idx["drug_id"])][["drug", "SMILES"]]
found_drugs_multi["chembl_id"] = np.array(chembl_drugs.loc[pd.Index(uncl_chembl_idx["chembl_id"]), "CHEMBLID"])
found_drugs_multi.head()

Unnamed: 0,drug,SMILES,chembl_id
28,Amlodipine,CCOC(C(C(c1ccccc1Cl)C(C(OC)=O)=C(C)N2)=C2COCCN)=O,CHEMBL1491
28,Amlodipine,CCOC(C(C(c1ccccc1Cl)C(C(OC)=O)=C(C)N2)=C2COCCN)=O,CHEMBL2111097
40,Apomorphine,[H]C1(c(c2c3c(ccc(O)c3O)C1)c4ccc2)N(C)CC4,CHEMBL53
40,Apomorphine,[H]C1(c(c2c3c(ccc(O)c3O)C1)c4ccc2)N(C)CC4,CHEMBL416288
48,Atenolol,CC(NCC(COc1ccc(CC(=O)N)cc1)O)C,CHEMBL24


In [29]:
found_drugs = pd.concat([found_drugs, found_drugs_multi])
found_drugs.tail()

Unnamed: 0_level_0,drug,SMILES,chembl_id
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
648,Zileuton,CC(c1sc2c(cccc2)c1)N(C(=O)N)O,CHEMBL93
648,Zileuton,CC(c1sc2c(cccc2)c1)N(C(=O)N)O,CHEMBL1164975
648,Zileuton,CC(c1sc2c(cccc2)c1)N(C(=O)N)O,CHEMBL1743359
654,Zopiclone,CN1CCN(C(OC2c(c3C(=O)N2c4ccc(Cl)cn4)nccn3)=O)CC1,CHEMBL135400
654,Zopiclone,CN1CCN(C(OC2c(c3C(=O)N2c4ccc(Cl)cn4)nccn3)=O)CC1,CHEMBL1522


In [30]:
d = list(found_drugs["drug"].unique())
not_found = drugs[drugs.apply(lambda x: x["drug"] not in d, axis=1)]
not_found.head()

Unnamed: 0,drug,SMILES,fp,form
65,Benzonatate,CCCCNc1ccc(cc1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC,"512, 3221225472, 393216, 33280, 268435616, 524...",C30H53NO11
121,Chlorphenesin,OCC(COc1ccc(Cl)cc1)O,"131072, 0, 131072, 512, 8388608, 1048576, 0, 8...",C9H11ClO3
151,Croconazole hydrochloride,C=C(n1cncc1)c2c(OCc3cc(Cl)ccc3)cccc2,"132096, 1141637120, 2152071176, 79692544, 2147...",C18H15ClN2O
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,"1024, 131072, 16908290, 2, 0, 1179648, 2147483...",C18H24O2


In [31]:
tmp = found.set_index("drug_id").loc[not_found.index]
tmp

Unnamed: 0_level_0,chembl_id,synonym
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1
65,,
121,326.0,False
151,406.0,False
218,616.0,False
218,617.0,False
218,618.0,False
218,619.0,False
218,620.0,False
218,621.0,False
218,622.0,False


In [32]:
tmp2 = tmp.groupby(tmp.index).count()
tmp2[tmp2["chembl_id"] == 1]

Unnamed: 0_level_0,chembl_id,synonym
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1
121,1,1
151,1,1
279,1,1
328,1,1
359,1,1
514,1,1
642,1,1


In [33]:
idx = tmp2[tmp2["chembl_id"] == 1].index
tmp.loc[idx]

Unnamed: 0_level_0,chembl_id,synonym
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1
121,326.0,False
151,406.0,False
279,770.0,False
328,901.0,False
359,985.0,False
514,1439.0,False
642,1826.0,False


In [34]:
found_drugs_nonsym = drugs.loc[idx][["drug", "SMILES"]]
found_drugs_nonsym["chembl_id"] = np.array(chembl_drugs.loc[pd.Index(tmp.loc[idx]["chembl_id"]), "CHEMBLID"])
found_drugs_nonsym

Unnamed: 0_level_0,drug,SMILES,chembl_id
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
121,Chlorphenesin,OCC(COc1ccc(Cl)cc1)O,CHEMBL388751
151,Croconazole hydrochloride,C=C(n1cncc1)c2c(OCc3cc(Cl)ccc3)cccc2,CHEMBL27289
279,Genistein,Oc1ccc(C(C(=O)c2c(cc(O)cc2O)O3)=C3)cc1,CHEMBL44
328,Lafutidine,O=C(NCC=CCOc1nccc(CN2CCCCC2)c1)CS(Cc3occc3)=O,CHEMBL3191588
359,Medroxyprogesterone,[H]C1(C2([H])C(C(C)(C(=C3)C(C)C2)CCC3=O)([H])C...,CHEMBL1390
514,Rabeprazole sodium,COCCCOc1ccnc(CS(c2[nH]c3c(cccc3)n2)=O)c1C,CHEMBL1615209
642,Warfarin,CC(CC(C(C(=O)c1c(cccc1)O2)=C2O)c3ccccc3)=O,CHEMBL7252


In [35]:
found_drugs = pd.concat([found_drugs, found_drugs_nonsym])
len(found_drugs["drug"].unique())

646

In [36]:
idx = tmp2[tmp2["chembl_id"] > 1].index
tmp.loc[idx]

Unnamed: 0_level_0,chembl_id,synonym
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1
218,616.0,False
218,617.0,False
218,618.0,False
218,619.0,False
218,620.0,False
218,621.0,False
218,622.0,False
225,640.0,False
225,641.0,False
225,642.0,False


In [37]:
idx_drug_tc = tmp.loc[idx].index
drugs.loc[idx_drug_tc]

Unnamed: 0,drug,SMILES,fp,form
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,"1024, 1107296256, 212994, 536937344, 108213043...",C30H26O12
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,"1024, 131072, 16908290, 2, 0, 1179648, 2147483...",C18H24O2
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,"1024, 131072, 16908290, 2, 0, 1179648, 2147483...",C18H24O2
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,"1024, 131072, 16908290, 2, 0, 1179648, 2147483...",C18H24O2


In [38]:
idx_chembl_tc = pd.Index(tmp.loc[idx]["chembl_id"])
chembl_drugs.loc[idx_chembl_tc]

Unnamed: 0_level_0,CHEMBLID,SYNONYMS,FULL_MOLFORMULA,CANONICAL_SMILES
chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
616.0,CHEMBL38714,Procyanidin B2,C30H26O12,O[C@@H]1Cc2c(O)cc(O)c([C@@H]3[C@@H](O)[C@H](Oc...
617.0,CHEMBL81753,,C30H26O12,OC1Cc2c(O)cc(O)c([C@@H]3C(O)[C@H](Oc4cc(O)cc(O...
618.0,CHEMBL504937,Procyanidin B1,C30H26O12,O[C@H]1Cc2c(O)cc(O)c([C@@H]3[C@@H](O)[C@H](Oc4...
619.0,CHEMBL501490,,C30H26O12,O[C@H]1Cc2c(O)cc(O)c([C@H]3[C@H](O)[C@H](Oc4cc...
620.0,CHEMBL447373,,C30H26O12,O[C@@H]1Cc2c(O)cc(O)c([C@H]3[C@H](O)[C@H](Oc4c...
621.0,CHEMBL1253314,,C30H26O12,OC1Cc2c(O)cc(O)c(C3C(O)C(Oc4cc(O)cc(O)c34)c5cc...
622.0,CHEMBL1590914,,C30H26O12,O[C@@H]1Cc2c(O)cc(O)c([C@H]3[C@H](O)[C@H](Oc4c...
640.0,CHEMBL412,,C18H24O2,CC12CCC3C(CCc4cc(O)ccc34)C1CCC2O
641.0,CHEMBL135,ECP | Alora | Esclim | Aquagen | Climara | Div...,C18H24O2,C[C@]12CC[C@H]3[C@@H](CCc4cc(O)ccc34)[C@@H]1CC...
642.0,CHEMBL286452,Alfatradiol | Estradiol sulfate,C18H24O2,C[C@]12CC[C@H]3[C@@H](CCc4cc(O)ccc34)[C@@H]1CC...


In [39]:
found_drugs_tc = drugs.loc[idx_drug_tc][["drug", "SMILES"]]
found_drugs_tc["chembl_id"] = np.array(chembl_drugs.loc[idx_chembl_tc, "CHEMBLID"])
found_drugs_tc

Unnamed: 0,drug,SMILES,chembl_id
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL38714
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL81753
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL504937
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL501490
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL447373
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL1253314
218,Epicatechin,C1(C(O)C(c2ccc(O)c(O)c2)Oc3c1c(O)cc(O)c3)c4c5c...,CHEMBL1590914
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL412
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL135
225,Estradiol beta,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL286452


In [40]:
found_drugs = pd.concat([found_drugs, found_drugs_tc])
found_drugs.drop_duplicates(inplace=True)
found_drugs.shape, len(found_drugs["drug"].unique())

((788, 3), 652)

In [41]:
d = list(found_drugs["drug"].unique())
for i in drugs["drug"]:
    if i not in d:
        print i

Benzonatate
Gamma hydroxybutyrate
Pantoprazole sodium
Vancomycin


For those compounds we searched ChEMBL manually and found out that SMILES differ between data suorces.

In [42]:
#Benzonatate
ds = readstring("smi", "CCCCCNc1ccc(C(OCCOCCOCCOCCOCCOCCOCCOCCOCCOCC)=O)cc1")   #Lounkine paper
dc = readstring("smi", "CCCCNc1ccc(cc1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC")     #chembl
dd = readstring("smi", "CCCCNC1=CC=C(C=C1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC")  #drugbank
print ds.calcfp() | dc.calcfp()
print ds.calcfp() | dd.calcfp()
print dd.calcfp() | dc.calcfp()

0.958333333333
0.958333333333
1.0


For Benzonatate we use SMILES from ChEMBL
________________________________________________________

Gamma hydroxybutyrate == Oxybate in ChEMBL

Structures match, but protonation states differ
_____________________________________________

In [43]:
#Pantoprazole sodium
ds = readstring("smi", "COc1ccnc(CS(c2[nH]c3c(ccc(OC(F)F)c3)n2)=O)c1OC")       #Lounkine paper
dc = readstring("smi", "COc1ccnc(C[S+]([O-])c2nc3cc(OC(F)F)ccc3[nH]2)c1OC")    #chembl
dd = readstring("smi", "COC1=C(OC)C(CS(=O)C2=NC3=C(N2)C=C(OC(F)F)C=C3)=NC=C1") #drugbank
print ds.calcfp() | dc.calcfp()
print ds.calcfp() | dd.calcfp()
print dd.calcfp() | dc.calcfp()

0.802469135802
1.0
0.802469135802


In [44]:
#Vancomycin
ds = readstring("smi", "CNC(C(NC1C(=O)NC(CC(=O)N)C(=O)NC2c(cc3Oc(c(cc4C1O)Cl)cc4)cc(Oc5c(Cl)cc(cc5)C(O)C(NC(=O)C6NC2=O)C(=O)NC(C(=O)O)c(c7c8cc6ccc8O)cc(O)cc7O)c3OC9C(OC%10OC(C)C(O)C(N)(C)C%10)C(O)C(O)C(CO)O9)=O)CC(C)C")   #Lounkine paper
dc = readstring("smi", "CN[C@H](CC(C)C)C(=O)N[C@@H]1[C@H](O)c2ccc(Oc3cc4cc(Oc5ccc(cc5Cl)[C@@H](O)[C@@H]6NC(=O)[C@H](NC(=O)[C@@H]4NC(=O)[C@H](CC(=O)N)NC1=O)c7ccc(O)c(c7)c8c(O)cc(O)cc8[C@H](NC6=O)C(=O)O)c3O[C@@H]9O[C@H](CO)[C@@H](O)[C@H](O)[C@H]9O[C@H]%10C[C@](C)(N)[C@H](O)[C@H](C)O%10)c(Cl)c2")    #chembl
dd = readstring("smi", "CN[C@H](CC(C)C)C(=O)N[C@@H]1[C@H](O)C2=CC=C(OC3=C(O[C@@H]4O[C@H](CO)[C@@H](O)[C@H](O)[C@H]4O[C@H]4C[C@](C)(N)[C@H](O)[C@H](C)O4)C4=CC(=C3)[C@@H](NC(=O)[C@H](CC(N)=O)NC1=O)C(=O)N[C@@H]1C3=CC(=C(O)C=C3)C3=C(O)C=C(O)C=C3[C@H](NC(=O)[C@@H](NC1=O)[C@H](O)C1=CC(Cl)=C(O4)C=C1)C(O)=O)C(Cl)=C2")    #drugbank
print ds.calcfp() | dc.calcfp()
print ds.calcfp() | dd.calcfp()
print dd.calcfp() | dc.calcfp()

1.0
1.0
1.0


In [45]:
tmp = pd.DataFrame({"drug": ["Benzonatate", "Gamma hydroxybutyrate", "Pantoprazole sodium", "Vancomycin"],
                    "SMILES": ["CCCCNc1ccc(cc1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC",
                               "OCCCC(=O)[O-]",
                               "COc1ccnc(CS(c2[nH]c3c(ccc(OC(F)F)c3)n2)=O)c1OC",
                               "CNC(C(NC1C(=O)NC(CC(=O)N)C(=O)NC2c(cc3Oc(c(cc4C1O)Cl)cc4)cc(Oc5c(Cl)cc(cc5)C(O)C(NC(=O)C6NC2=O)C(=O)NC(C(=O)O)c(c7c8cc6ccc8O)cc(O)cc7O)c3OC9C(OC%10OC(C)C(O)C(N)(C)C%10)C(O)C(O)C(CO)O9)=O)CC(C)C"],
                    "chembl_id": ["CHEMBL1374379", "CHEMBL1342", "CHEMBL1502", "CHEMBL262777"]})
found_drugs = pd.concat([found_drugs, tmp])

In [46]:
print found_drugs.shape
found_drugs.tail(20)

(792, 3)


Unnamed: 0_level_0,SMILES,chembl_id,drug
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
225,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL412,Estradiol beta
225,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL135,Estradiol beta
225,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL286452,Estradiol beta
225,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL1395196,Estradiol beta
225,[H]C1(C([H])(C2([H])CC3)CCc4c2ccc(O)c4)C3(C)C(...,CHEMBL1555366,Estradiol beta
227,CCC(NCCNC(CO)CC)CO,CHEMBL25539,Ethambutol dihydrochloride
227,CCC(NCCNC(CO)CC)CO,CHEMBL44884,Ethambutol dihydrochloride
227,CCC(NCCNC(CO)CC)CO,CHEMBL43824,Ethambutol dihydrochloride
227,CCC(NCCNC(CO)CC)CO,CHEMBL1201318,Ethambutol dihydrochloride
280,CC(=CCCC(=CCCC(=CCCC(=CCCC(=O)C)C)C)C)C,CHEMBL79686,Geranylgeranylacetone


In [47]:
found_drugs.reset_index(drop=True, inplace=True)
found_drugs.tail()

Unnamed: 0,SMILES,chembl_id,drug
787,c(cccc1)(c(CC(C(=O)O)NC(=O)CCC(N)C(=O)O)c[nH]2...,CHEMBL3137306,Timodepressin
788,CCCCNc1ccc(cc1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC,CHEMBL1374379,Benzonatate
789,OCCCC(=O)[O-],CHEMBL1342,Gamma hydroxybutyrate
790,COc1ccnc(CS(c2[nH]c3c(ccc(OC(F)F)c3)n2)=O)c1OC,CHEMBL1502,Pantoprazole sodium
791,CNC(C(NC1C(=O)NC(CC(=O)N)C(=O)NC2c(cc3Oc(c(cc4...,CHEMBL262777,Vancomycin


In [48]:
found_drugs = found_drugs[["drug", "chembl_id", "SMILES"]]
found_drugs.tail()

Unnamed: 0,drug,chembl_id,SMILES
787,Timodepressin,CHEMBL3137306,c(cccc1)(c(CC(C(=O)O)NC(=O)CCC(N)C(=O)O)c[nH]2...
788,Benzonatate,CHEMBL1374379,CCCCNc1ccc(cc1)C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOC
789,Gamma hydroxybutyrate,CHEMBL1342,OCCCC(=O)[O-]
790,Pantoprazole sodium,CHEMBL1502,COc1ccnc(CS(c2[nH]c3c(ccc(OC(F)F)c3)n2)=O)c1OC
791,Vancomycin,CHEMBL262777,CNC(C(NC1C(=O)NC(CC(=O)N)C(=O)NC2c(cc3Oc(c(cc4...


In [49]:
found_drugs.to_csv("../data/drugs_chembl.csv")

# Trivial hits

In [50]:
target_data = pd.read_csv("../data/cleaned_data_smiles.csv",
                          index_col=0)[["name", "ingredient_cmpd_chemblid"]].drop_duplicates()
target_data.columns = ["target", "chembl_id"]
target_data.head()

Unnamed: 0,target,chembl_id
0,AVPR1A,CHEMBL59740
1,AVPR1A,CHEMBL334082
2,AVPR2,CHEMBL334082
3,AVPR1A,CHEMBL333135
4,AVPR2,CHEMBL333135


In [51]:
trivial = pd.merge(found_drugs, target_data)

In [52]:
trivial.head(20)

Unnamed: 0,drug,chembl_id,SMILES,target
0,Acebutolol,CHEMBL642,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,ADRB1
1,Acetophenazine,CHEMBL1085,CC(c1ccc2c(N(CCCN3CCN(CCO)CC3)c4c(cccc4)S2)c1)=O,AR
2,Alosetron,CHEMBL1110,Cn1c2c(cccc2)c3c1CCN(Cc4nc[nH]c4C)C3=O,HTR3A
3,Alosetron,CHEMBL1110,Cn1c2c(cccc2)c3c1CCN(Cc4nc[nH]c4C)C3=O,HTR2B
4,Alprazolam,CHEMBL661,Cc1n(c2nn1)c3c(cc(Cl)cc3)C(c4ccccc4)=NC2,GABRA1
5,Amiodarone,CHEMBL633,CCCCc1oc2c(cccc2)c1C(c3cc(I)c(OCCN(CC)CC)c(I)c...,DRD3
6,Amiodarone,CHEMBL633,CCCCc1oc2c(cccc2)c1C(c3cc(I)c(OCCN(CC)CC)c(I)c...,ADRA2A
7,Amiodarone,CHEMBL633,CCCCc1oc2c(cccc2)c1C(c3cc(I)c(OCCN(CC)CC)c(I)c...,HTR2C
8,Amiodarone,CHEMBL633,CCCCc1oc2c(cccc2)c1C(c3cc(I)c(OCCN(CC)CC)c(I)c...,CHRM1
9,Amiodarone,CHEMBL633,CCCCc1oc2c(cccc2)c1C(c3cc(I)c(OCCN(CC)CC)c(I)c...,KCNH2


In [53]:
trivial.shape, trivial[["drug", "target"]].drop_duplicates().shape

((1107, 4), (1050, 2))

In [54]:
trivial.to_csv("../data/trivial_hits.csv")

# Hits with low activity

Drugs with activity better than 30$\mu M$, but worse than 1$\mu M$

In [55]:
raw_data = pd.read_csv("../data/raw_data.csv", index_col=0)
raw_data.head()

Unnamed: 0,bioactivity_type,ingredient_cmpd_chemblid,name,operator,target_chemblid,uniprot_name,units,value
0,IC50,CHEMBL59740,AVPR1A,=,CHEMBL2868,V1AR_RAT,nM,320.0
1,Inhibition,CHEMBL59740,AVPR1A,=,CHEMBL2868,V1AR_RAT,%,65.0
2,IC50,CHEMBL119671,AVPR1A,=,CHEMBL2868,V1AR_RAT,nM,2800.0
3,IC50,CHEMBL334082,AVPR1A,=,CHEMBL2868,V1AR_RAT,nM,1000.0
4,IC50,CHEMBL331098,AVPR1A,=,CHEMBL2868,V1AR_RAT,nM,2400.0


In [56]:
chembl_found = pd.merge(found_drugs, raw_data, left_on="chembl_id", right_on="ingredient_cmpd_chemblid")
chembl_found.head()

Unnamed: 0,drug,chembl_id,SMILES,bioactivity_type,ingredient_cmpd_chemblid,name,operator,target_chemblid,uniprot_name,units,value
0,Acebutolol,CHEMBL642,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,Log Ki,CHEMBL642,HTR1A,=,CHEMBL273,5HT1A_RAT,Unspecified,5.0
1,Acebutolol,CHEMBL642,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,IC50,CHEMBL642,ADRB1,=,CHEMBL213,ADRB1_HUMAN,nM,731.0
2,Acebutolol,CHEMBL642,CCCC(Nc1ccc(OCC(CNC(C)C)O)c(C(=O)C)c1)=O,Ki,CHEMBL642,ADRB1,=,CHEMBL213,ADRB1_HUMAN,nM,422.0
3,Acetaminophen,CHEMBL112,CC(Nc1ccc(O)cc1)=O,IC50,CHEMBL112,PTGS2,=,CHEMBL230,PGH2_HUMAN,nM,141000.0
4,Acetaminophen,CHEMBL112,CC(Nc1ccc(O)cc1)=O,IC50,CHEMBL112,PTGS2,=,CHEMBL230,PGH2_HUMAN,nM,141253.75


In [57]:
chembl_found = chembl_found[["name", "target_chemblid", "drug", "ingredient_cmpd_chemblid", "bioactivity_type",
                             "operator", "value", "units"]]
chembl_found.columns = ["target", "target_chemblid", "drug", "drug_chemblid", "bioactivity_type", "operator",
                        "value", "units"]
chembl_found.head()

Unnamed: 0,target,target_chemblid,drug,drug_chemblid,bioactivity_type,operator,value,units
0,HTR1A,CHEMBL273,Acebutolol,CHEMBL642,Log Ki,=,5.0,Unspecified
1,ADRB1,CHEMBL213,Acebutolol,CHEMBL642,IC50,=,731.0,nM
2,ADRB1,CHEMBL213,Acebutolol,CHEMBL642,Ki,=,422.0,nM
3,PTGS2,CHEMBL230,Acetaminophen,CHEMBL112,IC50,=,141000.0,nM
4,PTGS2,CHEMBL230,Acetaminophen,CHEMBL112,IC50,=,141253.75,nM


In [58]:
chembl_found.shape

(17293, 8)

In [59]:
chembl_known = chembl_found.apply(lambda x: ((drug_targets["drug"] == x["drug"]) &
                                              (drug_targets["target"] == x["target"])).any(), axis=1)

In [60]:
chembl_triv = chembl_found.apply(lambda x: ((trivial["drug"] == x["drug"]) &
                                              (trivial["target"] == x["target"])).any(), axis=1)

In [61]:
chembl_possible_hits = chembl_found[(~chembl_known) & (~chembl_triv)].drop_duplicates()
chembl_possible_hits.head()

Unnamed: 0,target,target_chemblid,drug,drug_chemblid,bioactivity_type,operator,value,units
0,HTR1A,CHEMBL273,Acebutolol,CHEMBL642,Log Ki,=,5.0,Unspecified
6,CNR1,CHEMBL218,Acetaminophen,CHEMBL112,Activity,=,13.0,%
7,ESR1,CHEMBL206,Acetaminophen,CHEMBL112,Potency,Unspecified,31622.8,nM
9,NR3C1,CHEMBL2034,Acetazolamide,CHEMBL20,Potency,Unspecified,25118.9,nM
10,CHRM1,CHEMBL276,Acetohexamide,CHEMBL1589,Potency,=,14125.4,nM


In [62]:
chembl_possible_hits.shape

(1713, 8)

In [63]:
chembl_possible_hits["bioactivity_type"].unique()

array(['Log Ki', 'Activity', 'Potency', 'IC50', 'Ki', 'Inhibition',
       'Inotropic effect', 'ED50', 'EC50', 'K0.5', 'Enhancement',
       'Mean enhancement of binding', 'I', 'RBA', 'Rmax', 'KB',
       'max activation', 'EC150', 'IC20', 'Fold change', 'Emax', 'pKx',
       'Weight', 'pKD', 'Kd', 'DR10', 'ID50', 'Ratio', 'ED30', 'KL',
       'Log 1/Kd', 'pKB', 'FC', 'Binding', 'Relative activity', 'log(RBA)',
       'Binding activity', 'Log EC50', 'Change', 'Ke'], dtype=object)

In [64]:
idx_type = ((chembl_possible_hits["bioactivity_type"] == "IC50") |
            (chembl_possible_hits["bioactivity_type"] == "EC50") | 
            (chembl_possible_hits["bioactivity_type"] == "Ki") |
            (chembl_possible_hits["bioactivity_type"] == "Kd"))
idx_value = (chembl_possible_hits["value"] < 30000) 
idx_units = (chembl_possible_hits["units"] == "nM")
idx_op = ((chembl_possible_hits["operator"] == "=") | (chembl_possible_hits["operator"] == "<"))

In [65]:
chemb_low_activity = chembl_possible_hits[idx_type & idx_value & idx_units & idx_op]
chemb_low_activity.head()

Unnamed: 0,target,target_chemblid,drug,drug_chemblid,bioactivity_type,operator,value,units
65,SLC6A4,CHEMBL228,Ambroxol,CHEMBL153479,IC50,=,3391.0,nM
66,SLC6A4,CHEMBL228,Ambroxol,CHEMBL153479,Ki,=,1802.0,nM
70,MAOA,CHEMBL1951,Amiloride,CHEMBL945,IC50,=,3909.0,nM
72,CHRM2,CHEMBL211,Amiodarone,CHEMBL633,IC50,=,5080.0,nM
73,CHRM2,CHEMBL211,Amiodarone,CHEMBL633,Ki,=,1806.0,nM


In [66]:
chemb_low_activity.to_csv("../data/chembl_low_activity_hits.csv")