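# KinSim tutorial

This notebook shows how to use `kinsim_structure`: it loads the KLIFS metadata, picks one example structure (PDB ID 3w32), loads its pocket molecule and chain, and then computes the fingerprint as well as each individual feature set (pharmacophore/size, exposure, side chain orientation, spatial).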

## Imports

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from pathlib import Path

import pandas as pd

from kinsim_structure.auxiliary import KlifsMoleculeLoader, PdbChainLoader
from kinsim_structure.encoding import Fingerprint, PhysicoChemicalFeatures, SpatialFeatures
from kinsim_structure.encoding import PharmacophoreSizeFeatures, SideChainOrientationFeature, ExposureFeature

In [4]:
# Let pandas display up to 100 rows before truncating a table
pd.set_option('display.max_rows', 100)

## IO paths

In [5]:
# Anchor all locations at the current user's home directory so the notebook
# is not tied to one account name; adjust the sub-directories to your layout.
path_to_data = Path.home() / 'Documents' / 'data' / 'kinsim' / '20190724_full'
path_to_kinsim = Path.home() / 'Documents' / 'projects' / 'kinsim_structure'
path_to_results = path_to_kinsim / 'results'

# Post-processed KLIFS metadata table (CSV)
metadata_path = path_to_data / 'postprocessed' / 'klifs_metadata_postprocessed.csv'

## Load metadata

In [6]:
# Read the metadata CSV, using its first column as the DataFrame index
klifs_metadata = pd.read_csv(metadata_path, index_col=0)

In [7]:
klifs_metadata.head()

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,metadata_index,kinase,family,groups,pdb_id,chain,alternate_model,species,...,ac_helix,rmsd1,rmsd2,qualityscore,pocket,resolution,missing_residues,missing_atoms,full_ifp,code
0,0,0,2886,AAK1,NAK,Other,4wsq,B,A,Human,...,in,0.777,2.125,8.6,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.95,0,14,0000000000000010000001000000000000000000000000...,HUMAN/AAK1/4wsq_chainB_altA
1,1,1,10043,AAK1,NAK,Other,5l4q,A,A,Human,...,in,0.78,2.137,9.7,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.97,0,3,0000000000000010000000000000000000000000000000...,HUMAN/AAK1/5l4q_chainA_altA
2,2,2,7046,AAK1,NAK,Other,5te0,A,-,Human,...,in,0.776,2.12,8.8,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.9,0,12,1000101000000010000001000000000000000000000000...,HUMAN/AAK1/5te0_chainA
3,3,3,843,ABL1,Abl,TK,2f4j,A,-,Human,...,in,0.779,2.128,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.91,0,0,0000000000000010000001000000000000000000000000...,HUMAN/ABL1/2f4j_chainA
4,4,4,815,ABL1,Abl,TK,2g1t,A,-,Human,...,out,0.825,2.154,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.8,0,0,,HUMAN/ABL1/2g1t_chainA


## Load example molecule

In [8]:
# Index label of the first metadata row whose PDB ID is 3w32 (the example structure)
ix = klifs_metadata.loc[klifs_metadata['pdb_id'] == '3w32'].index[0]
ix

1490

In [9]:
# Label-based lookup (.loc) of the metadata row selected above
klifs_metadata_entry = klifs_metadata.loc[ix]
klifs_metadata_entry

Unnamed: 0.1                                                              1503
Unnamed: 0.1.1                                                            1510
metadata_index                                                             604
kinase                                                                    EGFR
family                                                                    EGFR
groups                                                                      TK
pdb_id                                                                    3w32
chain                                                                        A
alternate_model                                                              -
species                                                                  Human
ligand_orthosteric_name      4-({3-chloro-4-[3-(trifluoromethyl)phenoxy]phe...
ligand_orthosteric_pdb_id                                                  W32
ligand_allosteric_name                              

### By metadata entry

In [10]:
# Print the class documentation into the cell output (plain Python, unlike the
# IPython-only `?` suffix, so it is kept in the saved notebook)
help(KlifsMoleculeLoader)

In [11]:
# Option 1: build the loader from the metadata entry of the example structure
klifs_molecule_loader = KlifsMoleculeLoader(klifs_metadata_entry=klifs_metadata_entry)

In [12]:
# Molecule obtained via the metadata entry
molecule1 = klifs_molecule_loader.molecule

### By mol2 file

In [13]:
# Derive the pocket file location from `path_to_data` instead of repeating the
# absolute data directory; converted to `str` so the loader receives the same
# argument type as before.
mol2_path = str(
    path_to_data / 'raw' / 'KLIFS_download' / 'HUMAN' / 'EGFR' / '3w32_chainA' / 'pocket.mol2'
)

In [14]:
# Option 2: build the loader directly from the path to a mol2 file
# (note: this rebinds `klifs_molecule_loader`)
klifs_molecule_loader = KlifsMoleculeLoader(mol2_path=mol2_path)

In [15]:
# Molecule obtained via the mol2 file path
molecule2 = klifs_molecule_loader.molecule

## Load example chain

In [16]:
# Build the chain loader from the same metadata entry as the molecule
pdb_chain_loader = PdbChainLoader(klifs_metadata_entry=klifs_metadata_entry)

In [17]:
# Loaded chain object (its type is inspected in the next cell)
chain = pdb_chain_loader.chain

In [18]:
type(chain)

Bio.PDB.Chain.Chain

### Compare both loading methods

In [19]:
# Check that both loading routes give identical data.
# Built-in `all()` over a DataFrame iterates its column labels (always truthy),
# so it could never report a difference; `DataFrame.equals` compares the values.
# NOTE(review): assumes `.df` is a pandas DataFrame - confirm.
molecule1.df.equals(molecule2.df)

True

In [20]:
# Continue with the molecule loaded via the metadata entry
molecule = molecule1

## Fingerprint

### From molecule and chain

In [21]:
# Instantiate a Fingerprint and populate it from the example molecule and its chain
fp = Fingerprint()
fp.from_molecule(molecule, chain)

In [22]:
fp.molecule_code

'HUMAN/EGFR_3w32_chainA'

In [23]:
fp.fingerprint_type1.shape

(85, 12)

In [24]:
fp.fingerprint_type1

Unnamed: 0,size,hbd,hba,charge,aromatic,aliphatic,sco,exposure,distance_to_centroid,distance_to_hinge_region,distance_to_dfg_region,distance_to_front_pocket
1,2,1,0,1,0,0,64.96,0.37931,17.65,13.26,20.08,14.66
2,1,0,0,0,0,1,20.24,0.217391,15.9,12.39,17.93,12.35
3,2,0,0,0,0,1,27.12,0.83871,13.55,10.46,16.78,9.1
4,1,0,0,0,0,0,,0.25,13.47,11.84,15.67,8.76
5,1,1,1,0,0,0,15.62,0.136364,13.3,12.99,13.85,9.48
6,1,0,0,0,0,0,,0.730769,12.24,13.5,12.32,8.97
7,1,0,0,0,0,1,17.88,0.647059,13.45,15.79,11.74,11.54
8,3,0,0,0,1,0,87.36,0.55,11.2,13.71,8.13,10.96
9,1,0,0,0,0,0,,0.666667,11.3,12.22,8.96,10.42
10,1,1,1,1,0,1,20.76,0.285714,11.71,10.92,11.21,9.67


### From metadata entry

In [25]:
klifs_metadata_entry.pdb_id

'3w32'

In [26]:
# Populate the same Fingerprint object again, this time from the metadata entry alone
fp.from_metadata_entry(klifs_metadata_entry)

In [27]:
fp.fingerprint_type2

{'physchem':     size  hbd  hba  charge  aromatic  aliphatic     sco  exposure
 1      2    1    0       1         0          0   64.96  0.379310
 2      1    0    0       0         0          1   20.24  0.217391
 3      2    0    0       0         0          1   27.12  0.838710
 4      1    0    0       0         0          0     NaN  0.250000
 5      1    1    1       0         0          0   15.62  0.136364
 6      1    0    0       0         0          0     NaN  0.730769
 7      1    0    0       0         0          1   17.88  0.647059
 8      3    0    0       0         1          0   87.36  0.550000
 9      1    0    0       0         0          0     NaN  0.666667
 10     1    1    1       1         0          1   20.76  0.285714
 11     1    0    0       0         0          1   17.34  0.500000
 12     3    1    1       0         1          0   95.80  0.441176
 13     2    1    0       1         0          0  101.63  0.404762
 14     1    0    0       0         0          1  

## Pharmacophore and size features

In [28]:
# Pharmacophore and size features are computed from the molecule only (no chain argument)
ps = PharmacophoreSizeFeatures()
ps.from_molecule(molecule)

In [29]:
ps.features

Unnamed: 0_level_0,size,hbd,hba,charge,aromatic,aliphatic
klifs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2,1,0,1,0,0
2,1,0,0,0,0,1
3,2,0,0,0,0,1
4,1,0,0,0,0,0
5,1,1,1,0,0,0
6,1,0,0,0,0,0
7,1,0,0,0,0,1
8,3,0,0,0,1,0
9,1,0,0,0,0,0
10,1,1,1,1,0,1


## Exposure

In [30]:
# Exposure is computed from the molecule together with its chain
ex = ExposureFeature()
ex.from_molecule(molecule, chain)

In [31]:
ex.features

Unnamed: 0_level_0,exposure
klifs_id,Unnamed: 1_level_1
1,0.37931
2,0.217391
3,0.83871
4,0.25
5,0.136364
6,0.730769
7,0.647059
8,0.55
9,0.666667
10,0.285714


In [32]:
# Recompute on the same object with verbose=True: `ex.features` (shown in the next cell)
# then also carries the intermediate columns (res_id, ca_*/cb_* counts, angles, exposures)
ex.from_molecule(molecule, chain, verbose=True)

In [33]:
ex.features

Unnamed: 0_level_0,res_id,ca_up,ca_down,ca_angle_Ca-Cb_Ca-pCb,ca_exposure,cb_up,cb_down,cb_angle_Ca-Cb_Ca-pCb,cb_exposure,exposure
klifs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,716,8.0,21.0,0.505348,0.275862,11.0,18.0,0.0,0.37931,0.37931
2,717,7.0,16.0,0.579169,0.304348,5.0,18.0,0.0,0.217391,0.217391
3,718,17.0,14.0,0.846535,0.548387,26.0,5.0,0.0,0.83871,0.83871
4,719,4.0,20.0,0.796797,0.166667,6.0,18.0,0.0,0.25,0.25
5,720,8.0,14.0,0.50981,0.363636,3.0,19.0,0.0,0.136364,0.136364
6,721,12.0,14.0,0.811757,0.461538,19.0,7.0,0.0,0.730769,0.730769
7,722,12.0,22.0,0.937182,0.352941,22.0,12.0,0.0,0.647059,0.647059
8,723,27.0,13.0,0.699716,0.675,22.0,18.0,0.0,0.55,0.55
9,724,13.0,20.0,1.14491,0.393939,22.0,11.0,0.0,0.666667,0.666667
10,725,7.0,21.0,0.372451,0.25,8.0,20.0,0.0,0.285714,0.285714


## Side chain orientation

In [34]:
# Side chain orientation is computed from the molecule together with its chain;
# with the default arguments some residues have no value (NaN in the next output)
sco = SideChainOrientationFeature()
sco.from_molecule(molecule, chain)

In [35]:
sco.features

Unnamed: 0_level_0,sco
klifs_id,Unnamed: 1_level_1
1,64.96
2,20.24
3,27.12
4,
5,15.62
6,
7,17.88
8,87.36
9,
10,20.76


In [36]:
# Recompute with fill_missing=True: residues that had no sco value before
# are reported as 0.0 in the next output
sco.from_molecule(molecule, chain, fill_missing=True)

In [37]:
sco.features

Unnamed: 0_level_0,sco
klifs_id,Unnamed: 1_level_1
1,64.96
2,20.24
3,27.12
4,0.0
5,15.62
6,0.0
7,17.88
8,87.36
9,0.0
10,20.76


In [38]:
# Recompute with verbose=True as well: the feature table shown next additionally lists
# residue id/name and the ca, cb and com vectors per residue
sco.from_molecule(molecule, chain, fill_missing=True, verbose=True)

In [39]:
sco.features

Unnamed: 0,klifs_id,residue_id,residue_name,ca,cb,com,sco
0,1,716,LYS,"<Vector 15.43, 46.39, 14.62>","<Vector 15.02, 46.24, 16.09>","<Vector 15.00, 47.16, 15.75>",64.96
1,2,717,VAL,"<Vector 18.68, 44.44, 14.36>","<Vector 20.11, 45.01, 14.45>","<Vector 19.25, 44.57, 14.73>",20.24
2,3,718,LEU,"<Vector 18.80, 41.07, 16.07>","<Vector 17.94, 39.85, 15.72>","<Vector 18.06, 40.57, 16.07>",27.12
3,4,719,GLY,"<Vector 22.39, 39.99, 15.28>",,"<Vector 22.16, 39.70, 14.38>",0.0
4,5,720,SER,"<Vector 25.09, 38.81, 12.86>","<Vector 26.26, 39.74, 12.49>","<Vector 25.46, 38.86, 13.08>",15.62
5,6,721,GLY,"<Vector 26.60, 35.40, 12.11>",,"<Vector 26.95, 35.86, 11.29>",0.0
6,7,722,ALA,"<Vector 28.98, 33.64, 9.72>","<Vector 29.47, 32.21, 9.87>","<Vector 28.76, 33.61, 9.24>",17.88
7,8,723,PHE,"<Vector 26.36, 33.45, 6.96>","<Vector 25.48, 32.21, 6.77>","<Vector 26.49, 31.54, 6.84>",87.36
8,9,724,GLY,"<Vector 24.59, 36.79, 7.47>",,"<Vector 24.56, 36.70, 8.43>",0.0
9,10,725,THR,"<Vector 22.60, 39.31, 9.54>","<Vector 23.05, 40.76, 9.32>","<Vector 22.46, 39.97, 9.33>",20.76


## Spatial features

In [67]:
# Spatial (distance) features are computed from the molecule only (no chain argument)
space = SpatialFeatures()
space.from_molecule(molecule)

In [68]:
space.features

Unnamed: 0_level_0,distance_to_centroid,distance_to_hinge_region,distance_to_dfg_region,distance_to_front_pocket
klifs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,17.65,13.26,20.08,14.66
2,15.9,12.39,17.93,12.35
3,13.55,10.46,16.78,9.1
4,13.47,11.84,15.67,8.76
5,13.3,12.99,13.85,9.48
6,12.24,13.5,12.32,8.97
7,13.45,15.79,11.74,11.54
8,11.2,13.71,8.13,10.96
9,11.3,12.22,8.96,10.42
10,11.71,10.92,11.21,9.67


In [69]:
space.reference_points

Unnamed: 0,centroid,hinge_region,dfg_region,front_pocket
x,1.164075,2.0429,7.6965,-0.618733
y,20.801587,21.301233,20.543633,15.5668
z,36.200677,41.536,32.303333,39.087833


In [70]:
# Returns, per region (hinge_region, dfg_region, front_pocket), a table of x/y/z coordinates
space.get_anchor_atoms(molecule)

{'hinge_region':          x        y        z
 16  7.7156  20.2023  43.4006
 47  0.1417  22.8441  46.1515
 80 -1.7286  20.8573  35.0559, 'dfg_region':           x        y        z
 19  12.1579  15.7003  35.9584
 24   9.5766  26.8005  27.5058
 81   1.3550  19.1301  33.4458, 'front_pocket':          x        y        z
 6   5.0947   9.2101  36.3726
 48 -3.4100  21.6235  45.5136
 75 -3.5409  15.8668  35.3773}

In [78]:
# Write the reference points for the example structure to the results folder.
# Reuse the metadata entry selected earlier (label-based, via `ix`) rather than a
# hard-coded positional index, which only matches while row order and index agree.
space.save_cgo_refpoints(
    klifs_metadata_entry,
    path_to_results / 'reference_points'
)