In [18]:
import pandas as pd
from pathlib import Path
import rdkit.Chem as Chem
from tqdm import tqdm
import requests

from torch_geometric.data import HeteroData
import itertools

In [19]:
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")

In [20]:
RAW_DATA = Path() / "data" / "raw"

#### Read Kindodata data

In [21]:
df = pd.read_csv(RAW_DATA / "data_clean.csv", index_col="ident")
df.columns

Index(['activities.standard_type', 'activities.standard_units',
       'activities.standard_value', 'assays.chembl_id',
       'assays.confidence_score', 'component_sequences.sequence',
       'compound_structures.canonical_smiles',
       'compound_structures.standard_inchi', 'docking.posit_probability',
       'docking.runtime', 'docking.score', 'docking.similar_kinase_chain',
       'docking.similar_kinase_pdb', 'docking.similar_ligand_pdb',
       'docs.authors', 'docs.chembl_id', 'docs.klifs_kinase_id',
       'docs.klifs_structure_id', 'docs.year', 'molecule_dictionary.chembl_id',
       'molecule_dictionary.max_phase', 'structure_ID',
       'target_dictionary.chembl_id', 'target_dictionary.kinase',
       'target_dictionary.species', 'target_dictionary.uniprot_id'],
      dtype='object')

#### Assign ligand PDB filepaths

In [22]:
ligand_pdb_files = {
    int(fp.stem.split("_")[0]): fp for fp in (RAW_DATA / "pdbs" / "ligand").iterdir()
}
next(iter(ligand_pdb_files.items())), len(ligand_pdb_files)

((23120667, PosixPath('data/raw/pdbs/ligand/23120667_ligand.pdb')), 155718)

In [23]:
df["ligand_pdb_file"] = [ligand_pdb_files[ident] if ident in ligand_pdb_files else None for ident in df.index]

# sanity check
assert df["ligand_pdb_file"].notna().all()
df["ligand_pdb_file"].head()

ident
16291323    data/raw/pdbs/ligand/16291323_ligand.pdb
16306943    data/raw/pdbs/ligand/16306943_ligand.pdb
16264754    data/raw/pdbs/ligand/16264754_ligand.pdb
16340050    data/raw/pdbs/ligand/16340050_ligand.pdb
16340956    data/raw/pdbs/ligand/16340956_ligand.pdb
Name: ligand_pdb_file, dtype: object

#### Get pocket info from klifs

In [24]:
# NOTE some rows are missing a structure ID
df["structure_ID"].notna().all()

False

In [25]:
# drop those
df = df[df["structure_ID"].notna()]

In [26]:
# for some reason pandas reads structure ID as a float
df["structure_ID"] = df["structure_ID"].astype(int)

In [27]:
# get pocket mol2 files
pbar = tqdm(df.iterrows(), total=len(df))
for ident, row in pbar:
    structure_id = row["structure_ID"]
    fp = RAW_DATA / "mol2" / "pocket" / f"{structure_id}_pocket.mol2"
    if fp.exists():
        continue
    resp = requests.get(
        "https://klifs.net/api/structure_get_pocket", params={"structure_ID": structure_id}
    )
    if resp.ok:
        fp.write_bytes(resp.content)

100%|██████████| 110274/110274 [00:08<00:00, 13249.10it/s]


In [28]:
pocket_mol2_files = {
    int(fp.stem.split("_")[0]): fp for fp in (RAW_DATA / "mol2" / "pocket").iterdir()
}
df["pocket_mol2_file"] = [pocket_mol2_files[row["structure_ID"]] for _,row in df.iterrows()]

In [29]:
df["pocket_mol2_file"].head()

ident
16291323    data/raw/mol2/pocket/12757_pocket.mol2
16306943    data/raw/mol2/pocket/14314_pocket.mol2
16264754    data/raw/mol2/pocket/10689_pocket.mol2
16340050    data/raw/mol2/pocket/10689_pocket.mol2
16340956    data/raw/mol2/pocket/10689_pocket.mol2
Name: pocket_mol2_file, dtype: object

#### Create PyTorch Geometric data
this might take a while...

In [30]:
data_list = []
for ident, row in tqdm(df.iterrows(), total=len(df)):
    data = HeteroData()
    ligand = Chem.MolFromPDBFile(str(row["ligand_pdb_file"]))
    assert ligand is not None
    pocket = Chem.rdmolfiles.MolFromMol2File(str(row["pocket_mol2_file"]))
    assert pocket is not None
    # TODO
    ...

  1%|          | 849/110274 [00:51<1:50:56, 16.44it/s]


AssertionError: 