# Min-max cutoffs for continuous (=spatial) features

We define the min-max cutoffs for the min-max normalization of continuous features such as the distances and moments features: use the floor/ceiling of the minimum/maximum values observed across all fingerprints, per subpocket center for the distances and per moment for the moments.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from kissim.encoding import FingerprintGenerator



In [2]:
# `_dh` is IPython's directory history; its last entry is used as the notebook's location.
HERE = Path(_dh[-1])  # noqa: F821
# Results folder, resolved relative to the notebook location.
RESULTS = HERE.joinpath("../../results/")

## Load fingerprints

In [3]:
# Load the fingerprints from the JSON file in the results folder.
fingerprints_path = RESULTS / "fingerprints_clean.json"
fingerprint_generator = FingerprintGenerator.from_json(fingerprints_path)
print(f"Number of fingerprints: {len(fingerprint_generator.data)}")
# NBVAL_CHECK_OUTPUT

Number of fingerprints: 5222


## Distances features

In [4]:
# Distances of all fingerprints in one table (rows: structure/residue pairs,
# columns: one distance per subpocket center, as seen in the displayed output).
features_d = fingerprint_generator.distances_exploded()
features_d

Unnamed: 0_level_0,Unnamed: 1_level_0,hinge_region,dfg_region,front_pocket,center
structure_klifs_id,residue_ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3833,1,12.797916,19.079002,14.099449,17.262196
3833,2,11.639592,16.480698,11.631843,15.175067
3833,3,9.241100,15.125705,8.280284,12.448807
3833,4,10.344558,13.665124,7.234079,11.854320
3833,5,12.414777,12.788165,9.115113,12.904052
...,...,...,...,...,...
7219,81,8.892165,7.109349,6.723064,3.793342
7219,82,11.366709,6.088001,10.422721,6.788561
7219,83,13.247789,5.919116,11.564625,9.319808
7219,84,16.129894,9.708284,13.088131,11.473628


In [5]:
# Extreme percentiles are reported next to min/max to show how far the
# absolute extremes lie from the bulk of the distribution.
tail_percentiles = [0.001, 0.01, 0.99, 0.999]
features_d_stats = features_d.describe(percentiles=tail_percentiles)
features_d_stats

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
count,436060.0,431769.0,436162.0,436886.0
mean,12.873753,13.574359,13.034014,12.005602
std,4.565766,4.860071,4.283397,3.482221
min,2.373261,0.873535,1.355564,0.890727
0.1%,3.70272,3.586912,4.528594,2.315035
1%,4.110608,4.646228,5.453957,3.05613
50%,12.564068,13.298356,12.452536,12.017167
99%,23.064425,26.701407,23.43188,21.186535
99.9%,25.08404,29.263488,25.884926,23.517875
max,30.791023,33.760345,33.15451,28.318304


In [6]:
# Only the observed minimum and maximum per subpocket are needed for the cutoffs.
features_d_stats.loc[["min", "max"]]

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
min,2.373261,0.873535,1.355564,0.890727
max,30.791023,33.760345,33.15451,28.318304


In [7]:
# Round outwards (floor the minimum, ceil the maximum) so that every observed
# distance lies within the resulting [min, max] interval.
distance_min = np.floor(features_d_stats.loc["min"])
distance_max = np.ceil(features_d_stats.loc["max"])
# Rows: "min"/"max"; columns: subpockets.
distance_cutoff = pd.concat([distance_min, distance_max], axis=1).T
distance_cutoff

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
min,2.0,0.0,1.0,0.0
max,31.0,34.0,34.0,29.0


Format as dictionary as needed for `kissim`: 
https://github.com/volkamerlab/kissim/blob/master/kissim/definitions.py

In [8]:
# One (min, max) tuple per subpocket, keyed by subpocket name.
{
    subpocket: tuple(bounds)
    for subpocket, bounds in distance_cutoff.items()
}

{'hinge_region': (2.0, 31.0),
 'dfg_region': (0.0, 34.0),
 'front_pocket': (1.0, 34.0),
 'center': (0.0, 29.0)}

## Moments features

In [9]:
# Reshape so that each moment becomes one column: `stack` moves the columns
# into the row index, `unstack(level=1)` turns index level 1 into columns, and
# the remaining index is dropped.
# NOTE(review): presumably the pooled rows are structure/subpocket
# combinations; confirm against `moments_exploded`.
features_m = (
    fingerprint_generator.moments_exploded()
    .stack()
    .unstack(level=1)
    .reset_index(drop=True)
)

In [10]:
# Extreme percentiles are reported next to min/max to show how far the
# absolute extremes lie from the bulk of the distribution.
tail_percentiles = [0.001, 0.01, 0.99, 0.999]
features_m_stats = features_m.describe(percentiles=tail_percentiles)
features_m_stats

moment,1,2,3
count,20800.0,20800.0,20800.0
mean,12.870258,4.283153,2.780543
std,0.617352,0.557114,1.071235
min,11.206953,2.838044,-2.786872
0.1%,11.557383,3.036482,-2.113185
1%,11.679099,3.259589,-1.696522
50%,12.922318,4.422952,2.884637
99%,14.196597,5.283267,4.708257
99.9%,14.835826,5.577127,5.655943
max,16.848968,6.377253,6.178007


In [11]:
# Round outwards (floor the minimum, ceil the maximum) so that every observed
# moment value lies within the resulting [min, max] interval.
moment_min = np.floor(features_m_stats.loc["min"])
moment_max = np.ceil(features_m_stats.loc["max"])
# Rows: "min"/"max"; columns: moments.
moment_cutoff = pd.concat([moment_min, moment_max], axis=1).T
moment_cutoff

moment,1,2,3
min,11.0,2.0,-3.0
max,17.0,7.0,7.0


Format as dictionary as needed for `kissim`: 
https://github.com/volkamerlab/kissim/blob/master/kissim/definitions.py

In [12]:
# One (min, max) tuple per moment; the columns of `moment_cutoff` are moments,
# not subpockets.
{
    moment: tuple(bounds)
    for moment, bounds in moment_cutoff.items()
}

{1: (11.0, 17.0), 2: (2.0, 7.0), 3: (-3.0, 7.0)}