# Dataset fingerprints

## Imports

In [1]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

In [2]:
# Mode 2: reload all imported modules before each cell is executed, so that
# edits to local packages are picked up without restarting the kernel.
%autoreload 2

In [3]:
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import pandas as pd

from kinsim_structure.encoding import Fingerprint
from kinsim_structure.similarity import calculate_similarity

## IO paths

In [4]:
# NOTE: absolute, machine-specific locations; adapt them before running elsewhere.
# Root of the dataset snapshot.
path_to_data = Path('/', 'home', 'dominique', 'Documents', 'data', 'kinsim', '20190724_full')
# Project checkout and its results folder (the input files below are read from here).
path_to_kinsim = Path('/', 'home', 'dominique', 'Documents', 'projects', 'kinsim_structure')
path_to_results = path_to_kinsim.joinpath('results')

# Post-processed metadata table inside the dataset snapshot.
metadata_path = path_to_data.joinpath('postprocessed', 'klifs_metadata_postprocessed.csv')

## Load metadata for failed fingerprint generation

In [5]:
# Read the pickled entries for which fingerprint generation failed.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
errors_file = path_to_results / 'fingerprints_error_entries.p'
with errors_file.open('rb') as handle:
    errors = pickle.load(handle)

# Wrap the unpickled object in a DataFrame for tabular display.
errors = pd.DataFrame(errors)

In [6]:
# Number of failed entries (rows of the errors table).
errors.shape[0]

3

In [7]:
# Display the metadata rows of the failed entries.
errors

Unnamed: 0.1,Unnamed: 0,index,kinase,family,groups,pdb_id,chain,alternate_model,species,ligand_orthosteric_name,...,dfg,ac_helix,rmsd1,rmsd2,qualityscore,pocket,resolution,missing_residues,missing_atoms,full_ifp
1000,1005,1259,CHK1,CAMKL,CAMK,4hyh,A,B,Human,"2-(6-methoxy-1-oxo-1,3-dihydro-2H-isoindol-2-y...",...,in,in,0.779,2.097,8.0,QTLGEGAYGEVQLVAVKIVNIKKEICINKMLNENVVKFYGHYLFLE...,1.7,0,0,0000000000000010000000000000000000000000000000...
1002,1007,1317,CHK1,CAMKL,CAMK,4jik,A,-,Human,2-(4-chlorophenyl)-8-[(3S)-piperidin-3-ylamino...,...,in,in,0.779,2.028,8.7,QTLGEGA_GEVQLVAVKIVNIKKEICINKMLNENVVKFYGHYLFLE...,1.9,1,9,0000000000000010000000000000100000010000000000...
2479,2490,747,LOK,STE20,STE,2j7t,A,A,Human,"(3Z)-N-(3-CHLOROPHENYL)-3-({3,5-DIMETHYL-4-[(4...",...,in,in,0.776,2.105,8.0,GELGDGAFGKVYKAAAKVIDYIVEIEILATCDPYIVKLLGAWIMIE...,2.0,0,26,0000000000000010000000000000000000000000000000...


## Load fingerprints

In [8]:
# Read the fingerprint table from the results folder.
# The CSV was written with its index, which appears as an 'Unnamed: 0' column.
fingerprints_file = path_to_results / 'fingerprints_parallelized.csv'
fingerprints = pd.read_csv(fingerprints_file)

In [9]:
# (number of rows, number of columns) of the loaded fingerprint table.
fingerprints.shape

(332690, 15)

In [10]:
# Cross-check of the row count reported by fingerprints.shape:
# (3917 - number of failed entries) * 85.
# NOTE(review): 3917 and 85 are unexplained literals - presumably the total number
# of metadata entries and the number of rows per fingerprint; confirm and name them.
85 * (3917 - len(errors))

332690

In [11]:
# Preview the first five rows of the fingerprint table.
fingerprints.head()

Unnamed: 0.1,Unnamed: 0,size,hbd,hba,charge,aromatic,aliphatic,sco,exposure,distance_to_centroid,distance_to_hinge_region,distance_to_dfg_region,distance_to_front_pocket,metadata_index,molecule_code
0,1,2.0,0.0,2.0,-1.0,0.0,0.0,94.16,0.296296,17.19,12.8,19.08,14.13,0,HUMAN/AAK1_4wsq_altA_chainB
1,2,1.0,0.0,0.0,0.0,0.0,1.0,21.29,0.344828,15.09,11.64,16.48,11.69,0,HUMAN/AAK1_4wsq_altA_chainB
2,3,2.0,0.0,0.0,0.0,0.0,1.0,27.98,0.75,12.41,9.24,15.13,8.24,0,HUMAN/AAK1_4wsq_altA_chainB
3,4,1.0,0.0,0.0,0.0,0.0,1.0,22.51,0.451613,11.8,10.34,13.67,7.22,0,HUMAN/AAK1_4wsq_altA_chainB
4,5,2.0,0.0,2.0,-1.0,0.0,0.0,25.87,0.142857,12.8,12.41,12.79,9.24,0,HUMAN/AAK1_4wsq_altA_chainB


## Example fingerprint

In [12]:
# Create a Fingerprint object with no arguments; its `features` attribute is
# assigned manually in a later cell.
fp = Fingerprint()

In [13]:
# Keep only the rows belonging to one example structure.
is_example_structure = fingerprints['molecule_code'] == 'HUMAN/AAK1_4wsq_altA_chainB'
fingerprint = fingerprints[is_example_structure]

In [14]:
# Select the columns at positions 1 to 12 (inclusive). With the column order shown by
# fingerprints.head() these are 'size' through 'distance_to_front_pocket', which leaves
# out the saved index column, 'metadata_index' and 'molecule_code'.
feature_positions = list(range(1, 13))
f1 = fingerprint.iloc[:, feature_positions]

In [15]:
# Second test input: every value of f1 scaled by the constant factor 1.53.
f2 = f1.mul(1.53)

In [16]:
# Show the feature values of f1 as a single one-dimensional array.
f1.values.flatten()

array([ 2.  ,  0.  ,  2.  , ..., 16.37,  9.29, 15.4 ])

In [17]:
# Similarity between f1 and its scaled copy f2.
# NOTE(review): the "<class 'numpy.ndarray'>" line in the output is not printed by this
# cell - presumably a leftover debug print inside calculate_similarity; confirm.
calculate_similarity(f1, f2)

<class 'numpy.ndarray'>


0.1835090200761565

In [18]:
# Attach the example structure's rows (all columns, not only the f1 subset) to fp.
fp.features = fingerprint

In [19]:
# Display the result of normalizing fp's features. The displayed table holds rescaled
# feature values (compare with the raw values in fingerprints.head()).
# NOTE(review): whether fp.features itself is modified is not visible from here.
fp.normalize_physchem_distances()

Unnamed: 0.1,Unnamed: 0,size,hbd,hba,charge,aromatic,aliphatic,sco,exposure,distance_to_centroid,distance_to_hinge_region,distance_to_dfg_region,distance_to_front_pocket,metadata_index,molecule_code
0,1,0.5,0.000000,1.0,0.0,0.0,0.0,0.523111,0.296296,0.491143,0.365714,0.545143,0.403714,0,HUMAN/AAK1_4wsq_altA_chainB
1,2,0.0,0.000000,0.0,0.5,0.0,1.0,0.118278,0.344828,0.431143,0.332571,0.470857,0.334000,0,HUMAN/AAK1_4wsq_altA_chainB
2,3,0.5,0.000000,0.0,0.5,0.0,1.0,0.155444,0.750000,0.354571,0.264000,0.432286,0.235429,0,HUMAN/AAK1_4wsq_altA_chainB
3,4,0.0,0.000000,0.0,0.5,0.0,1.0,0.125056,0.451613,0.337143,0.295429,0.390571,0.206286,0,HUMAN/AAK1_4wsq_altA_chainB
4,5,0.5,0.000000,1.0,0.0,0.0,0.0,0.143722,0.142857,0.365714,0.354571,0.365429,0.264000,0,HUMAN/AAK1_4wsq_altA_chainB
5,6,0.0,0.000000,0.0,0.5,0.0,0.0,,0.434783,0.330286,0.367143,0.321429,0.241714,0,HUMAN/AAK1_4wsq_altA_chainB
6,7,0.0,0.000000,0.0,0.5,0.0,0.0,,0.692308,0.333429,0.405714,0.273429,0.296571,0,HUMAN/AAK1_4wsq_altA_chainB
7,8,1.0,0.000000,0.0,0.5,1.0,0.0,0.496000,0.500000,0.387143,0.437714,0.282571,0.369143,0,HUMAN/AAK1_4wsq_altA_chainB
8,9,0.0,0.000000,0.0,0.5,0.0,1.0,0.128056,0.740741,0.316857,0.341714,0.229143,0.306857,0,HUMAN/AAK1_4wsq_altA_chainB
9,10,0.5,0.000000,0.0,0.5,0.0,1.0,0.092111,0.419355,0.325714,0.300857,0.290571,0.282000,0,HUMAN/AAK1_4wsq_altA_chainB


In [20]:
# Get the USR fingerprint from fp; the result is indexed with the keys
# 'moments' and 'physchem' in the following cells.
fp_usr = fp.get_usr_fingerprint()

In [21]:
# 'moments' part of the USR fingerprint.
m1 = fp_usr['moments']

In [22]:
# Copy of m1 shifted by a constant 1 - presumably a second input for a comparison test.
m2 = m1 + 1 

In [23]:
# 'physchem' part of the USR fingerprint.
p1 = fp_usr['physchem']

In [24]:
# Copy of p1 shifted by a constant 2 - presumably a second input for a comparison test.
p2 = p1 + 2