# Exposure

This notebook contains the trial-and-error steps of my `Exposure` class development.

## Imports

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from pathlib import Path
import sys

from Bio.PDB import HSExposureCA, HSExposureCB, Selection, MMCIFParser
import pandas as pd

sys.path.append('../..')
from kissim.auxiliary import KlifsMoleculeLoader, PdbChainLoader, get_klifs_residues_mol2topdb
from kissim.encoding import ExposureFeature

_ColormakerRegistry()

In [4]:
# Raise the pandas display limit so that up to 1000 rows are shown
pd.set_option('display.max_rows', 1000)

## IO paths

In [5]:
# Two directory levels above the notebook's working directory
# (presumably the repository root - the same location is appended to sys.path)
path_to_kinsim = Path('.') / '..' / '..'

# Data set location, anchored at the current user's home directory rather than a
# hardcoded '/home/<user>' prefix, so the notebook is not tied to a single account
path_to_data = Path.home() / 'Documents' / 'data' / 'kinsim' / '20190724_full'

# Results folder inside the project directory
path_to_results = path_to_kinsim / 'results'

# Preprocessed KLIFS metadata table
metadata_path = path_to_data / 'preprocessed' / 'klifs_metadata_preprocessed.csv'

## Load metadata

In [6]:
# Read the preprocessed KLIFS metadata; the first CSV column becomes the index
klifs_metadata = pd.read_csv(metadata_path, index_col=0)
# Preview the first rows
klifs_metadata.head()

Unnamed: 0,Unnamed: 0.1,metadata_index,kinase,family,groups,pdb_id,chain,alternate_model,species,ligand_orthosteric_name,...,ac_helix,rmsd1,rmsd2,qualityscore,pocket,resolution,missing_residues,missing_atoms,full_ifp,code
0,0,2886,AAK1,NAK,Other,4wsq,B,A,Human,K-252A,...,in,0.777,2.125,8.6,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.95,0,14,0000000000000010000001000000000000000000000000...,HUMAN/AAK1/4wsq_chainB_altA
1,1,10043,AAK1,NAK,Other,5l4q,A,A,Human,"~{N}-[5-(4-cyanophenyl)-1~{H}-pyrrolo[2,3-b]py...",...,in,0.78,2.137,9.7,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.97,0,3,0000000000000010000000000000000000000000000000...,HUMAN/AAK1/5l4q_chainA_altA
2,2,7046,AAK1,NAK,Other,5te0,A,-,Human,methyl (3Z)-3-{[(4-{methyl[(4-methylpiperazin-...,...,in,0.776,2.12,8.8,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.9,0,12,1000101000000010000001000000000000000000000000...,HUMAN/AAK1/5te0_chainA
3,3,843,ABL1,Abl,TK,2f4j,A,-,Human,CYCLOPROPANECARBOXYLIC ACID {4-[4-(4-METHYL-PI...,...,in,0.779,2.128,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.91,0,0,0000000000000010000001000000000000000000000000...,HUMAN/ABL1/2f4j_chainA
4,4,815,ABL1,Abl,TK,2g1t,A,-,Human,-,...,out,0.825,2.154,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.8,0,0,,HUMAN/ABL1/2g1t_chainA


## Test `HSExposureCB` and `HSExposureCA`

Using 6c83 as an example, which has GLY residues (by definition without CB atoms, e.g. **residue 136**) and residues with missing CB atoms (e.g. **residue 141**), I will test the behavior of both Biopython methods.

In [7]:
# Metadata row(s) of the example structure 6c83
klifs_metadata.loc[klifs_metadata['pdb_id'] == '6c83']

Unnamed: 0,Unnamed: 0.1,metadata_index,kinase,family,groups,pdb_id,chain,alternate_model,species,ligand_orthosteric_name,...,ac_helix,rmsd1,rmsd2,qualityscore,pocket,resolution,missing_residues,missing_atoms,full_ifp,code
251,254,9595,AurA,Aur,Other,6c83,B,-,Human,PHOSPHOMETHYLPHOSPHONIC ACID ADENYLATE ESTER,...,in,0.903,2.143,6.8,RPLGKGKFGNVYLLALKVLQLRREVEIQSHLRPNILRLYGYYLILE...,2.55,3,34,0000000000000010000000000000000000000000000000...,HUMAN/AurA/6c83_chainB


In [8]:
# Positional lookup of the row at position 250.
# NOTE(review): in the saved output this row is 6cpf (chain A), not the 6c83 entry
# shown in the previous cell (index label 251) - confirm which structure is intended.
klifs_metadata.iloc[250]

Unnamed: 0.1                                                               256
metadata_index                                                            7381
kinase                                                                    AurA
family                                                                     Aur
groups                                                                   Other
pdb_id                                                                    6cpf
chain                                                                        A
alternate_model                                                              -
species                                                                  Human
ligand_orthosteric_name           PHOSPHOMETHYLPHOSPHONIC ACID ADENYLATE ESTER
ligand_orthosteric_pdb_id                                                  ACP
ligand_allosteric_name                                                       -
ligand_allosteric_pdb_id                            

In [9]:
# Load the mmCIF file of the example structure
pdb_id = '6c83'
parser = MMCIFParser()
structure = parser.get_structure(
    structure_id=pdb_id,
    # Reuse `path_to_data` from the "IO paths" section instead of repeating the
    # absolute path; cast to str so the parser always receives a plain file name
    filename=str(path_to_data / 'raw' / 'PDB_download' / f'{pdb_id}.cif')
)
# First model of the structure, then chain B of that model
model = structure[0]
chain = model['B']
# Flatten the chain into a list of its residues ('R' = residue level)
residues = Selection.unfold_entities(entity_list=chain, target_level='R')



In [10]:
# Number of residues unfolded from chain B - compared with the plain PDB file: correct!
len(residues)

248

In [11]:
# First entry of the residue list
residue = residues[0]

### GLY residues

In [12]:
# All glycine residues of the chain, in chain order
list(filter(lambda res: res.get_resname() == 'GLY', residues))

[<Residue GLY het=  resseq=136 icode= >,
 <Residue GLY het=  resseq=140 icode= >,
 <Residue GLY het=  resseq=142 icode= >,
 <Residue GLY het=  resseq=145 icode= >,
 <Residue GLY het=  resseq=173 icode= >,
 <Residue GLY het=  resseq=198 icode= >,
 <Residue GLY het=  resseq=216 icode= >,
 <Residue GLY het=  resseq=265 icode= >,
 <Residue GLY het=  resseq=268 icode= >,
 <Residue GLY het=  resseq=291 icode= >,
 <Residue GLY het=  resseq=303 icode= >,
 <Residue GLY het=  resseq=316 icode= >,
 <Residue GLY het=  resseq=325 icode= >,
 <Residue GLY het=  resseq=355 icode= >]

In [13]:
# CA coordinate vector of every glycine residue, in chain order
residues_GLY_CA_vector = [
    res['CA'].get_vector()
    for res in residues
    if res.get_resname() == 'GLY'
]
residues_GLY_CA_vector

[<Vector -8.03, 23.35, 20.43>,
 <Vector -0.11, 16.57, 15.27>,
 <Vector 3.05, 10.92, 16.51>,
 <Vector 3.16, 12.10, 21.98>,
 <Vector 15.76, 6.78, 28.10>,
 <Vector 6.89, 26.27, 30.96>,
 <Vector 3.68, 24.08, 10.32>,
 <Vector 6.34, 31.36, 12.07>,
 <Vector 7.36, 32.21, 7.29>,
 <Vector 15.27, 9.39, 10.93>,
 <Vector 30.69, 7.44, 18.29>,
 <Vector 25.05, 21.36, 4.14>,
 <Vector 14.55, 17.27, -2.54>,
 <Vector 27.14, 28.95, -6.01>]

### `HSExposureCB`

In [14]:
# Half sphere exposure based on the CA-CB vector, computed for the chain with a
# sphere radius of 13
exposures_cb = HSExposureCB(chain, radius=13)

In [15]:
# HSExposureCB stores the (CA, pseudo-CB) vector pairs it builds for GLY residues
# in `ca_cb_list`
exposures_cb.ca_cb_list

[(<Vector -8.03, 23.35, 20.43>, <Vector -7.41, 24.55, 19.89>),
 (<Vector -0.11, 16.57, 15.27>, <Vector 1.29, 16.93, 15.20>),
 (<Vector 3.05, 10.92, 16.51>, <Vector 4.32, 11.61, 16.65>),
 (<Vector 3.16, 12.10, 21.98>, <Vector 3.96, 12.97, 22.82>),
 (<Vector 15.76, 6.78, 28.10>, <Vector 16.99, 6.06, 27.86>),
 (<Vector 6.89, 26.27, 30.96>, <Vector 5.91, 26.96, 31.77>),
 (<Vector 3.68, 24.08, 10.32>, <Vector 2.98, 23.48, 9.19>),
 (<Vector 6.34, 31.36, 12.07>, <Vector 7.27, 32.35, 12.55>),
 (<Vector 7.36, 32.21, 7.29>, <Vector 7.28, 32.64, 5.91>),
 (<Vector 15.27, 9.39, 10.93>, <Vector 14.68, 9.17, 12.23>),
 (<Vector 30.69, 7.44, 18.29>, <Vector 30.64, 6.05, 18.71>),
 (<Vector 25.05, 21.36, 4.14>, <Vector 26.43, 20.92, 4.08>),
 (<Vector 14.55, 17.27, -2.54>, <Vector 13.85, 17.00, -3.78>),
 (<Vector 27.14, 28.95, -6.01>, <Vector 26.40, 30.20, -6.03>)]

In [16]:
# Element-wise check: the CA vector of each GLY residue must match the CA vector
# stored as first element of the corresponding pair in HSExposureCB's `ca_cb_list`
for gly_ca, (stored_ca, _pseudo_cb) in zip(residues_GLY_CA_vector, exposures_cb.ca_cb_list):
    print(gly_ca.get_array() == stored_ca.get_array())

[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]
[ True  True  True]


In [17]:
# Number of distinct residue sequence numbers among the HSExposureCB keys;
# a key has the form (chain id, (hetero flag, sequence number, insertion code))
len({key[1][1] for key in exposures_cb.property_keys})

242

In [18]:
# Residue numbers known to HSExposureCB that do not occur in the residue list
{key[1][1] for key in exposures_cb.property_keys}.difference(
    res.id[1] for res in residues
)

set()

In [19]:
# Residue numbers in the residue list for which HSExposureCB has no entry
{res.id[1] for res in residues}.difference(
    key[1][1] for key in exposures_cb.property_keys
)

{131, 141, 143, 144, 253, 501}

In [20]:
# Residue 141 is missing its CB atom: the lookup is expected to fail, so the
# KeyError is caught and reported instead of aborting the notebook
key_missing_cb = ('B', (' ', 141, ' '))
try:
    exposures_cb.property_dict[key_missing_cb]
except KeyError as lookup_error:
    print(f'KeyError: {lookup_error}')

KeyError: ('B', (' ', 141, ' '))


In [21]:
# GLY 136 has no CB atom either, but HSExposureCB still holds an entry for it
exposures_cb.property_dict[('B', (' ', 136, ' '))]

(16, 7, 0.0)

Great, GLY residues get a CB-based exposure value (via their pseudo-CB) - but residue 141, whose CB atom is missing, does not!

### `HSExposureCA`

In [22]:
# CA-based half sphere exposure for the same chain, again with a sphere radius of 13
exposures_ca = HSExposureCA(chain, radius=13)

In [23]:
# Residue numbers known to HSExposureCA that do not occur in the residue list
{key[1][1] for key in exposures_ca.property_keys}.difference(
    res.id[1] for res in residues
)

set()

In [24]:
# Residue numbers in the residue list for which HSExposureCA has no entry
{res.id[1] for res in residues}.difference(
    key[1][1] for key in exposures_ca.property_keys
)

{127, 275, 291, 388, 501}

In [25]:
# Number of entries in the HSExposureCA property dictionary
len(exposures_ca.property_dict)

243

In [26]:
# Number of vector pairs stored in `ca_cb_list` of the CA-based result
len(exposures_ca.ca_cb_list)

256

In [27]:
# First ten vector pairs of the CA-based result
exposures_ca.ca_cb_list[:10]

[(<Vector 0.91, 26.11, 35.25>, <Vector 1.08, 26.61, 34.41>),
 (<Vector -1.70, 23.35, 35.18>, <Vector -1.78, 22.93, 36.09>),
 (<Vector -4.10, 21.71, 32.75>, <Vector -3.99, 21.00, 32.05>),
 (<Vector -7.06, 23.80, 33.94>, <Vector -7.94, 23.58, 34.36>),
 (<Vector -5.34, 27.04, 32.89>, <Vector -4.98, 27.82, 33.39>),
 (<Vector -5.51, 26.18, 29.16>, <Vector -4.61, 25.99, 28.77>),
 (<Vector -8.34, 25.89, 26.64>, <Vector -9.16, 26.46, 26.62>),
 (<Vector -7.54, 23.10, 24.18>, <Vector -7.18, 22.25, 24.55>),
 (<Vector -8.03, 23.35, 20.43>, <Vector -8.38, 24.25, 20.18>),
 (<Vector -8.03, 23.35, 20.43>, <Vector -7.41, 24.55, 19.89>)]

In [28]:
# Residue 141 (missing CB atom) looked up in the CA-based result
exposures_ca.property_dict[('B', (' ', 141, ' '))]

(6, 11, None)

In [29]:
# GLY 136 looked up in the CA-based result
exposures_ca.property_dict[('B', (' ', 136, ' '))]

(10, 13, 0.8124413253496995)

In [30]:
# Number of vector pairs stored in `ca_cb_list` of the CB-based result, for comparison
len(exposures_cb.ca_cb_list)

14

### Remove unneeded residues from chain

In [31]:
# Chain object that the following cells unfold and modify
chain

<Chain id=B>

In [32]:
# Snapshot of the chain's residues ('R' = residue level) before any residue is removed
residues_pdb = Selection.unfold_entities(chain, 'R')
len(residues_pdb)

248

In [33]:
# Residue list before removing anything from the chain
residues_pdb

[<Residue GLN het=  resseq=127 icode= >,
 <Residue TRP het=  resseq=128 icode= >,
 <Residue ALA het=  resseq=129 icode= >,
 <Residue LEU het=  resseq=130 icode= >,
 <Residue GLU het=  resseq=131 icode= >,
 <Residue ASP het=  resseq=132 icode= >,
 <Residue PHE het=  resseq=133 icode= >,
 <Residue GLU het=  resseq=134 icode= >,
 <Residue ILE het=  resseq=135 icode= >,
 <Residue GLY het=  resseq=136 icode= >,
 <Residue ARG het=  resseq=137 icode= >,
 <Residue PRO het=  resseq=138 icode= >,
 <Residue LEU het=  resseq=139 icode= >,
 <Residue GLY het=  resseq=140 icode= >,
 <Residue LYS het=  resseq=141 icode= >,
 <Residue GLY het=  resseq=142 icode= >,
 <Residue LYS het=  resseq=143 icode= >,
 <Residue PHE het=  resseq=144 icode= >,
 <Residue GLY het=  resseq=145 icode= >,
 <Residue ASN het=  resseq=146 icode= >,
 <Residue VAL het=  resseq=147 icode= >,
 <Residue TYR het=  resseq=148 icode= >,
 <Residue LEU het=  resseq=149 icode= >,
 <Residue ALA het=  resseq=150 icode= >,
 <Residue ARG he

In [34]:
# Remove residue 127 from the chain (in-place modification of `chain`).
# The membership check keeps this cell re-runnable: detaching a residue that has
# already been removed would otherwise fail with a KeyError.
residue_id_to_remove = (' ', 127, ' ')
if residue_id_to_remove in chain:
    chain.detach_child(residue_id_to_remove)

In [35]:
# The list created before the removal is unchanged: it still starts with residue 127
residues_pdb

[<Residue GLN het=  resseq=127 icode= >,
 <Residue TRP het=  resseq=128 icode= >,
 <Residue ALA het=  resseq=129 icode= >,
 <Residue LEU het=  resseq=130 icode= >,
 <Residue GLU het=  resseq=131 icode= >,
 <Residue ASP het=  resseq=132 icode= >,
 <Residue PHE het=  resseq=133 icode= >,
 <Residue GLU het=  resseq=134 icode= >,
 <Residue ILE het=  resseq=135 icode= >,
 <Residue GLY het=  resseq=136 icode= >,
 <Residue ARG het=  resseq=137 icode= >,
 <Residue PRO het=  resseq=138 icode= >,
 <Residue LEU het=  resseq=139 icode= >,
 <Residue GLY het=  resseq=140 icode= >,
 <Residue LYS het=  resseq=141 icode= >,
 <Residue GLY het=  resseq=142 icode= >,
 <Residue LYS het=  resseq=143 icode= >,
 <Residue PHE het=  resseq=144 icode= >,
 <Residue GLY het=  resseq=145 icode= >,
 <Residue ASN het=  resseq=146 icode= >,
 <Residue VAL het=  resseq=147 icode= >,
 <Residue TYR het=  resseq=148 icode= >,
 <Residue LEU het=  resseq=149 icode= >,
 <Residue ALA het=  resseq=150 icode= >,
 <Residue ARG he

In [36]:
# Unfold the chain again: the fresh residue list no longer contains residue 127
residues = Selection.unfold_entities(chain, 'R')
residues

[<Residue TRP het=  resseq=128 icode= >,
 <Residue ALA het=  resseq=129 icode= >,
 <Residue LEU het=  resseq=130 icode= >,
 <Residue GLU het=  resseq=131 icode= >,
 <Residue ASP het=  resseq=132 icode= >,
 <Residue PHE het=  resseq=133 icode= >,
 <Residue GLU het=  resseq=134 icode= >,
 <Residue ILE het=  resseq=135 icode= >,
 <Residue GLY het=  resseq=136 icode= >,
 <Residue ARG het=  resseq=137 icode= >,
 <Residue PRO het=  resseq=138 icode= >,
 <Residue LEU het=  resseq=139 icode= >,
 <Residue GLY het=  resseq=140 icode= >,
 <Residue LYS het=  resseq=141 icode= >,
 <Residue GLY het=  resseq=142 icode= >,
 <Residue LYS het=  resseq=143 icode= >,
 <Residue PHE het=  resseq=144 icode= >,
 <Residue GLY het=  resseq=145 icode= >,
 <Residue ASN het=  resseq=146 icode= >,
 <Residue VAL het=  resseq=147 icode= >,
 <Residue TYR het=  resseq=148 icode= >,
 <Residue LEU het=  resseq=149 icode= >,
 <Residue ALA het=  resseq=150 icode= >,
 <Residue ARG het=  resseq=151 icode= >,
 <Residue GLU he

## Get exposure from `kissim.encoding.ExposureFeature`

### Load molecule

In [37]:
# Metadata row at position 250.
# NOTE(review): per the saved outputs this is 6cpf chain A, not 6c83 - confirm intent.
klifs_metadata_entry = klifs_metadata.iloc[250]
# Load the KLIFS molecule that belongs to this metadata entry
ml = KlifsMoleculeLoader(klifs_metadata_entry=klifs_metadata_entry)
molecule = ml.molecule
# Preview the molecule's atom table
molecule.df.head()

Unnamed: 0,atom_id,atom_name,res_id,res_name,subst_name,x,y,z,charge,klifs_id
0,1,N,137,ARG,ARG137,5.125,15.9379,53.4452,0.0,1
1,2,CA,137,ARG,ARG137,4.9029,14.6385,52.8172,0.0,1
2,3,C,137,ARG,ARG137,4.9292,14.7761,51.2956,0.0,1
3,4,O,137,ARG,ARG137,4.668,15.8556,50.7577,0.0,1
4,5,CB,137,ARG,ARG137,3.5705,14.045,53.262,0.0,1


In [38]:
# Identifier string of the loaded molecule
molecule.code

'HUMAN/AurA_6cpf_chainA'

### Load PDB chain

In [39]:
# Load the PDB chain that belongs to the same metadata entry.
# NOTE: this rebinds `chain`, which referred to chain B of 6c83 in the cells above.
cl = PdbChainLoader(klifs_metadata_entry=klifs_metadata_entry)
chain = cl.chain
chain

<Chain id=A>

### Call `ExposureFeature` class

In [40]:
# Create an empty ExposureFeature and fill it from the KLIFS molecule and the PDB chain
exposure_feature = ExposureFeature()
exposure_feature.from_molecule(molecule, chain)

In [41]:
# Resulting table: one `exposure` value per `klifs_id`
exposure_feature.features

Unnamed: 0_level_0,exposure
klifs_id,Unnamed: 1_level_1
1,0.095238
2,0.25
3,0.75
4,0.727273
5,0.095238
6,0.857143
7,0.380952
8,0.7
9,0.741935
10,0.290323
