In [1]:
%load_ext autoreload

In [2]:
import json
import gzip

import pandas as pd

### InChIKey to SMILES dictionary

In [3]:
# InChIKey -> SMILES lookup table, cached on disk as gzip-compressed JSON.
# If the cache file is missing it is rebuilt from the two source maps
# (InChIKey -> index and index -> SMILES) and then written to save_path.
save_path = "../../data/Hansaim/integrated_InChiKey2SMILES.json.gz"
try:
    with gzip.open(save_path) as cache_file:
        k2s = json.load(cache_file)
except FileNotFoundError:
    with gzip.open("../../data/Hansaim/integrated_InChiKey2Index.json.gz") as src:
        k2i = json.load(src)
    with gzip.open("../../data/Hansaim/integrated_Index2SMILES.json.gz") as src:
        i2s = json.load(src)

    # The index is converted to str before the lookup in i2s.
    k2s = {inchikey: i2s[str(index)] for inchikey, index in k2i.items()}

    # Persist the joined map so the next run takes the fast path above.
    with gzip.open(save_path, "wb") as cache_file:
        cache_file.write(json.dumps(k2s).encode())

### Test the dictionary

In [4]:
from itertools import islice

from rdkit import Chem

# Sanity check on the first (up to) three entries: regenerate the InChIKey
# from the stored SMILES so it can be compared by eye with the dictionary key.
# islice walks the dict lazily instead of materialising the full key list on
# every access, and simply stops early if k2s holds fewer than three entries.
for inchikey, smi in islice(k2s.items(), 3):
    print(inchikey, smi)
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None when RDKit cannot parse the string.
        print("RDKit could not parse this SMILES")
    else:
        print(Chem.inchi.MolToInchiKey(mol))
    print("=" * 40)

AAAAEENPAALFRN-UHFFFAOYSA-N COc1cc(C(C)C)c(Oc2cnc(NCCS(=O)(=O)C)nc2N)cc1I
AAAAEENPAALFRN-UHFFFAOYSA-N
AAAAKTROWFNLEP-UHFFFAOYSA-N CC1CNC(=O)c2[nH]c3ccc(cc3c12)C(=O)N(C)C
AAAAKTROWFNLEP-UHFFFAOYSA-N
AAAATQFUBIBQIS-IRXDYDNUSA-N CC(=O)NC[C@@H]1OC(=O)N2[C@H]1Cc3cc(ccc23)c4cccnc4
AAAATQFUBIBQIS-IRXDYDNUSA-N


### Load activity file

* pIC50 file

In [5]:
# pIC50 activity table: tab-separated (read_table's default delimiter), with the
# first column used as the row index.
picf_path = "../../data/Hansaim/integrated_pic50.tsv.gz"
pic_act_df = pd.read_table(picf_path, index_col=0)

In [6]:
# Preview the first rows of the pIC50 table to check the parsed columns.
pic_act_df.head()

Unnamed: 0,InChIKey,UniProt,Activity_type,Relation,Activity_value
0,LMSNSIQAWQSERP-UHFFFAOYSA-N,P08908,pIC50,=,7.199971
1,GKKAQRFMUGUZQS-UHFFFAOYSA-N,P08908,pIC50,=,7.59998
2,FKWIKKLASAZVLL-UHFFFAOYSA-N,P08908,pIC50,=,7.199971
3,IIYLGSKFTYDEID-UHFFFAOYSA-N,P08908,pIC50,=,6.299998
4,YOPSWVMUSWBXTO-UHFFFAOYSA-N,P08908,pIC50,=,6.699992


* pKd file

In [7]:
# pKd activity table: tab-separated (read_table's default delimiter), with the
# first column used as the row index.
pkdf_path = "../../data/Hansaim/integrated_pkd.tsv.gz"
pkd_act_df = pd.read_table(pkdf_path, index_col=0)

In [8]:
# Preview the first rows of the pKd table to check the parsed columns.
pkd_act_df.head()

Unnamed: 0,InChIKey,UniProt,Activity_type,Relation,Activity_value
0,UHTHHESEBZOYNR-UHFFFAOYSA-N,Q9Y4K4,pKd,=,6.346787
1,UIARLYUEJFELEN-LROUJFHJSA-N,P0C264,pKd,=,6.920819
2,GCIKSSRWRFVXBI-UHFFFAOYSA-N,P0C264,pKd,=,6.275724
3,GCIKSSRWRFVXBI-UHFFFAOYSA-N,P30530,pKd,=,6.677781
4,JOOXLOJCABQBSG-UHFFFAOYSA-N,P00519(Y253F)-phosphorylated,pKd,=,7.251812


* pKi file

In [9]:
# pKi activity table: tab-separated (read_table's default delimiter), with the
# first column used as the row index.
pkif_path = "../../data/Hansaim/integrated_pki.tsv.gz"
pki_act_df = pd.read_table(pkif_path, index_col=0)

In [10]:
# Preview the first rows of the pKi table to check the parsed columns.
pki_act_df.head()

Unnamed: 0,InChIKey,UniProt,Activity_type,Relation,Activity_value
0,UZWDCWONPYILKI-UHFFFAOYSA-N,Q2M2I8,pKi,=,6.992124
1,UZWDCWONPYILKI-UHFFFAOYSA-N,Q86TW2,pKi,=,7.400008
2,UZWDCWONPYILKI-UHFFFAOYSA-N,Q9NSY1,pKi,=,4.251567
3,UZWDCWONPYILKI-UHFFFAOYSA-N,Q13557,pKi,=,7.71108
4,UZWDCWONPYILKI-UHFFFAOYSA-N,Q13555,pKi,=,7.74739


### Load required UniProt ID list

In [11]:
# The negatives CSV is read without a header row, so its columns are labelled
# by position (0, 1, 2, ...).
nega_df = pd.read_csv(
    "../../data/Hansaim/JAKs_negatives.csv",
    header=None,
)
nega_df.head()

Unnamed: 0,0,1,2,3
0,A1Z199,A1Z199_HUMAN,BCR/ABL fusion,BCR/ABL p210 fusion protein
1,P11274,BCR_HUMAN,BCR,Breakpoint cluster region protein
2,Q12830,BPTF_HUMAN,BPTF,Nucleosome-remodeling factor subunit BPTF
3,P08962,CD63_HUMAN,CD63,CD63 antigen
4,Q9BXF3,CECR2_HUMAN,CECR2,Cat eye syndrome critical region protein 2


In [12]:
# Column 0 of the negatives table as a plain Python list; this is the list the
# "UniProt" column of each activity table is matched against.
nega_list = nega_df[0].tolist()
print(nega_list)

['A1Z199', 'P11274', 'Q12830', 'P08962', 'Q9BXF3', 'O94907', 'P97772', 'Q9NPC2', 'Q9Y6K9', 'Q9GZQ6', 'Q9Y5X5', 'Q460N5', 'P11103', 'O77746', 'P11541', 'O97554', 'O02768', 'P23907', 'Q9HUN3', 'Q9HBX9', 'Q9P0U3', 'Q9HC62', 'P21675', 'Q9NQB0', 'Q61143']


### Find active compounds with target in the list

In [13]:
# Accumulator for the InChIKeys selected by the filter cells that follow.
active_inchikeys = []

##### pIC50 >= 4.0

In [14]:
# Keep pIC50 rows whose (upper-cased) UniProt ID is in nega_list, whose relation
# is one of "=", ">", ">=", ">>", and whose value is at least 4.0 (inclusive).
# Note: despite its name, lt_ic50 is a ">=" mask.
in_list = pic_act_df["UniProt"].str.upper().isin(nega_list)
relation = pic_act_df["Relation"].isin(["=", ">", ">=", ">>"])
lt_ic50 = pic_act_df["Activity_value"].ge(4.0)
active_inchikeys.extend(
    pic_act_df.loc[in_list & relation & lt_ic50, "InChIKey"].tolist()
)

In [15]:
# InChIKeys collected so far (pIC50 hits); the list may still hold duplicates.
len(active_inchikeys)

984

##### pKd >= 7.0

In [16]:
# Keep pKd rows whose (upper-cased) UniProt ID is in nega_list, whose relation
# is one of "=", ">", ">=", ">>", and whose value is at least 7.0 (inclusive).
# Note: despite its name, lt_pkd is a ">=" mask.
in_list = pkd_act_df["UniProt"].str.upper().isin(nega_list)
relation = pkd_act_df["Relation"].isin(["=", ">", ">=", ">>"])
lt_pkd = pkd_act_df["Activity_value"].ge(7.0)
active_inchikeys.extend(
    pkd_act_df.loc[in_list & relation & lt_pkd, "InChIKey"].tolist()
)

In [17]:
# Running total after adding the pKd hits; duplicates are still included.
len(active_inchikeys)

1119

##### pKi >= 7.0

In [18]:
# Keep pKi rows whose (upper-cased) UniProt ID is in nega_list, whose relation
# is one of "=", ">", ">=", ">>", and whose value is at least 7.0 (inclusive).
# Note: despite its name, lt_pki is a ">=" mask.
in_list = pki_act_df["UniProt"].str.upper().isin(nega_list)
relation = pki_act_df["Relation"].isin(["=", ">", ">=", ">>"])
lt_pki = pki_act_df["Activity_value"].ge(7.0)
active_inchikeys.extend(
    pki_act_df.loc[in_list & relation & lt_pki, "InChIKey"].tolist()
)

In [19]:
# Running total after adding the pKi hits; duplicates are still included.
len(active_inchikeys)

1224

#### Remove duplicate InChIKeys

In [20]:
# Collapse duplicate InChIKeys and report the count before and after.
# The result is kept as a sorted list rather than a bare set so that
#   * iteration order (and therefore every file derived from it) is the same
#     on every kernel run, and
#   * the filter cells above, which call .extend(), can still be re-run.
# key=str keeps the sort well-defined even if a non-string value (e.g. NaN)
# slipped into the list.
ori_l = len(active_inchikeys)
active_inchikeys = sorted(set(active_inchikeys), key=str)
print(ori_l, "->", len(active_inchikeys))

1224 -> 687


### Convert InChIKeys to SMILES

In [21]:
# Translate every selected InChIKey to its SMILES string via k2s.
# Keys that are absent from k2s are skipped, but they are collected in
# missing_inchikeys and reported instead of being dropped silently.
active_smiles = []
missing_inchikeys = []
for k in active_inchikeys:
    if k in k2s:
        active_smiles.append(k2s[k])
    else:
        missing_inchikeys.append(k)
print(len(active_smiles))
if missing_inchikeys:
    print(len(missing_inchikeys), "InChIKeys have no SMILES in k2s and were skipped")

662


### Convert SMILES to graphs

In [22]:
from chemreader.writers import GraphWriter
from chemreader.readers import Smiles

In [23]:
# Wrap each SMILES string in a chemreader Smiles object, passing sanitize=True.
smiles = [Smiles(smi_str, sanitize=True) for smi_str in active_smiles]

In [24]:
# Hand the Smiles objects to GraphWriter and write them to ../../data/JAK/graphs.
# NOTE(review): presumably `prefix` becomes the leading part of each output
# file name — confirm against the chemreader documentation.
writer = GraphWriter(smiles)
writer.write("../../data/JAK/graphs", prefix="JAK_negative_integrated")

In [25]:
# Save the selected SMILES strings, one per line.
# A generator expression is used so that no loop variable is bound at notebook
# scope: naming it `smiles` would overwrite the list of Smiles objects that the
# GraphWriter cell depends on.
with open("../../data/Hansaim/JAKs_negative_integrated_SMILES.txt", "w") as f:
    f.writelines(smi + "\n" for smi in active_smiles)

In [26]:
# Read the threshold-4 SMILES file as a list of lines (line endings are kept).
# NOTE(review): this is a different file from the "integrated" one written by
# this notebook — confirm that comparing against it is intentional.
with open("../../data/Hansaim/JAKs_negative_threshold_4_SMILES.txt", "r") as f:
    smiles = list(f)

In [27]:
# Number of lines read versus number of distinct lines.
print(f"{len(smiles)} -> {len(set(smiles))}")

934 -> 607
