In [5]:
import kinodata
from kinodata.data import KinodataDocked, Filtered
from kinodata.data.data_module import create_dataset
from kinodata.data.grouped_split import KinodataKFoldSplit
from kinodata.transform import TransformToComplexGraph, FilterDockingRMSD
from kinodata.types import *


import json
from pathlib import Path
from typing import Any

import torch

import kinodata.configuration as cfg
from kinodata.model import ComplexTransformer, DTIModel, RegressionModel
from kinodata.model.complex_transformer import make_model as make_complex_transformer
from kinodata.model.dti import make_model as make_dti_baseline
from kinodata.data.data_module import make_kinodata_module
from kinodata.transform import TransformToComplexGraph

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import tqdm

!wandb disabled

W&B disabled.


### Grab some example complex data object

In [6]:
dataset = KinodataDocked()

In [7]:
demo_data = dataset[123]

### Get pocket mol2 file for data object and read it

In [8]:
# for access to mol2 file via ident 
df = dataset.df

Reading data frame from /Users/joschka/projects/kinodata-3D-affinity-prediction/data/raw/kinodata_docked_v2.sdf.gz...
Deduping data frame (current size: 121913)...
119713 complexes remain after deduplication.
Checking for missing pocket mol2 files...


100%|██████████| 3244/3244 [00:00<00:00, 18190.90it/s]


Adding pocket sequences...
(119713, 25)


100%|██████████| 119713/119713 [00:00<00:00, 2107362.00it/s]


Exiting with 3552 cached sequences.
(119713, 26)


In [9]:
from kinodata.data.io.read_klifs_mol2 import read_klifs_mol2
pocket_file = df[df["ident"] == demo_data.ident.item()]["pocket_mol2_file"].values[0]
pocket_df = read_klifs_mol2(pocket_file, with_bonds=False)
pocket_df

Unnamed: 0,atom.id,atom.name,atom.x,atom.y,atom.z,atom.type,residue.subst_id,residue.subst_name,atom.charge,atom.status_bit
0,1,N,5.9749,17.553101,54.301102,N.3,1,ASP828,0.0,BACKBONE
1,2,H,6.2335,18.182800,53.554901,H,1,ASP828,0.0,BACKBONE
2,3,CA,5.7827,16.144400,54.006401,C.3,1,ASP828,0.0,BACKBONE
3,4,HA,6.3715,15.523400,54.681599,H,1,ASP828,0.0,BACKBONE
4,5,C,6.2202,15.921500,52.568699,C.2,1,ASP828,0.0,BACKBONE
...,...,...,...,...,...,...,...,...,...,...
1242,1243,CB,4.3750,20.562901,22.260099,C.3,79,SER986,0.0,
1243,1244,HB2,4.8093,20.774599,21.283100,H,79,SER986,0.0,
1244,1245,HB3,3.3008,20.410601,22.154900,H,79,SER986,0.0,
1245,1246,OG,4.9719,19.396900,22.806101,O.3,79,SER986,0.0,


#### must remove hydrogens to match complex representation

In [10]:
non_hydrogen = pocket_df["atom.type"] != "H"

#### try matching

In [11]:
pocket_df[non_hydrogen][["atom.x", "atom.y", "atom.z"]]

Unnamed: 0,atom.x,atom.y,atom.z
0,5.9749,17.553101,54.301102
2,5.7827,16.144400,54.006401
4,6.2202,15.921500,52.568699
5,6.3756,16.881701,51.810699
6,4.3004,15.748500,54.201900
...,...,...,...
1238,4.6357,21.720501,23.194500
1240,3.8439,22.908701,22.780500
1241,2.6525,22.967800,23.047199
1242,4.3750,20.562901,22.260099


In [14]:
demo_data["pocket"].pos.shape

torch.Size([603, 3])

In [15]:
demo_data["pocket"].pos

tensor([[ 5.9749, 17.5531, 54.3011],
        [ 5.7827, 16.1444, 54.0064],
        [ 6.2202, 15.9215, 52.5687],
        ...,
        [ 2.6525, 22.9678, 23.0472],
        [ 4.3750, 20.5629, 22.2601],
        [ 4.9719, 19.3969, 22.8061]])

In [17]:
# match?
pocket_df[non_hydrogen][["atom.x", "atom.y", "atom.z"]].values == demo_data["pocket"].pos.numpy()

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       ...,
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])