# Min-max cutoffs for continuous (=spatial) features

Define the min-max cutoffs used for the min-max normalization of the continuous features, i.e. the distances and moments features: use the floor/ceiling of the minimum/maximum observed values, determined per subpocket center for the distances and per moment for the moments.

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from kissim.encoding import FingerprintGenerator



In [2]:
# `_dh` is not defined in this notebook; it is supplied by the IPython kernel
# (presumably its directory history — TODO confirm). Its last entry is taken as
# the directory this notebook lives in.
HERE = Path(_dh[-1])
# Results directory, resolved relative to the notebook directory (two levels up).
RESULTS = HERE / "../../results/"

## Load fingerprints

In [3]:
# Read the fingerprints stored in `fingerprints_clean.json` under RESULTS and
# report how many were loaded (the printed count is checked by nbval).
fingerprints_path = RESULTS / "fingerprints_clean.json"
fingerprint_generator = FingerprintGenerator.from_json(fingerprints_path)
print(f"Number of fingerprints: {len(fingerprint_generator.data)}")
# NBVAL_CHECK_OUTPUT

Number of fingerprints: 4916


## Distances features

In [4]:
# Distances features of all fingerprints as one table. Per the displayed output,
# rows are indexed by (structure_klifs_id, residue_ix) and there is one column
# per subpocket (hinge_region, dfg_region, front_pocket, center).
features_d = fingerprint_generator.distances_exploded()
features_d

Unnamed: 0_level_0,Unnamed: 1_level_0,hinge_region,dfg_region,front_pocket,center
structure_klifs_id,residue_ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3835,1,12.798095,19.079805,14.099948,17.262426
3835,2,11.640075,16.481350,11.632327,15.175342
3835,3,9.240445,15.125156,8.279718,12.447825
3835,4,10.344351,13.665039,7.234169,11.853809
3835,5,12.414604,12.788149,9.114764,12.903294
...,...,...,...,...,...
7219,81,8.892165,7.109349,6.723064,3.793342
7219,82,11.366709,6.088001,10.422721,6.788561
7219,83,13.247789,5.919116,11.564625,9.319808
7219,84,16.129894,9.708284,13.088131,11.473628


In [5]:
# Summary statistics per subpocket column; besides the default median, report
# the extreme tails (0.1%, 1%, 99%, 99.9%) to see how far min/max lie from the bulk.
tail_percentiles_d = [0.001, 0.01, 0.99, 0.999]
features_d_stats = features_d.describe(percentiles=tail_percentiles_d)
features_d_stats

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
count,410664.0,406451.0,410681.0,411290.0
mean,12.871981,13.576839,13.032085,12.003979
std,4.564724,4.861037,4.278035,3.48125
min,2.68981,0.873535,1.355564,1.001008
0.1%,3.702853,3.576234,4.513442,2.312039
1%,4.112844,4.659211,5.452539,3.050403
50%,12.554697,13.297532,12.45708,12.020114
99%,23.06277,26.71624,23.409007,21.176027
99.9%,25.056192,29.279455,25.563321,23.517292
max,30.791023,33.760345,32.931515,28.318304


In [6]:
# Show only the observed minimum and maximum per subpocket column.
features_d_stats.loc[["min", "max"]]

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
min,2.68981,0.873535,1.355564,1.001008
max,30.791023,33.760345,32.931515,28.318304


In [7]:
# Round outwards so the cutoffs enclose every observed distance:
# floor of the per-column minimum, ceiling of the per-column maximum.
distance_lower = np.floor(features_d_stats.loc["min"])
distance_upper = np.ceil(features_d_stats.loc["max"])
# Put both bounds side by side, then flip so rows are min/max and columns are subpockets.
distance_cutoff = pd.concat([distance_lower, distance_upper], axis=1).T
distance_cutoff

Unnamed: 0,hinge_region,dfg_region,front_pocket,center
min,2.0,0.0,1.0,1.0
max,31.0,34.0,33.0,29.0


Format as dictionary as needed for `kissim`: 
https://github.com/volkamerlab/kissim/blob/master/kissim/definitions.py

In [8]:
# One (min, max) tuple per subpocket column, keyed by the column name.
{column: tuple(distance_cutoff[column]) for column in distance_cutoff.columns}

{'hinge_region': (2.0, 31.0),
 'dfg_region': (0.0, 34.0),
 'front_pocket': (1.0, 33.0),
 'center': (1.0, 29.0)}

## Moments features

In [9]:
# Moments features of all fingerprints, reshaped in one chain: stack the columns
# into the row index, spread index level 1 back out as columns (the stats table
# below shows the resulting columns are the moments 1, 2, 3), and drop the
# remaining row index since only the value distribution per moment is needed.
features_m = (
    fingerprint_generator.moments_exploded()
    .stack()
    .unstack(level=1)
    .reset_index(drop=True)
)

In [10]:
# Summary statistics per moment column, including the extreme tails
# (0.1%, 1%, 99%, 99.9%) alongside the default median.
tail_percentiles_m = [0.001, 0.01, 0.99, 0.999]
features_m_stats = features_m.describe(percentiles=tail_percentiles_m)
features_m_stats

moment,1,2,3
count,19585.0,19585.0,19585.0
mean,12.869594,4.281653,2.777692
std,0.618134,0.557168,1.06693
min,11.311548,2.838044,-2.546785
0.1%,11.56372,3.035773,-2.101982
1%,11.681274,3.260454,-1.694259
50%,12.920924,4.419078,2.882199
99%,14.185183,5.282933,4.704321
99.9%,14.834405,5.55495,5.637431
max,16.848968,6.377253,6.178007


In [11]:
# Round outwards so the cutoffs enclose every observed moment value:
# floor of the per-moment minimum, ceiling of the per-moment maximum.
moment_lower = np.floor(features_m_stats.loc["min"])
moment_upper = np.ceil(features_m_stats.loc["max"])
# Put both bounds side by side, then flip so rows are min/max and columns are moments.
moment_cutoff = pd.concat([moment_lower, moment_upper], axis=1).T
moment_cutoff

moment,1,2,3
min,11.0,2.0,-3.0
max,17.0,7.0,7.0


Format as dictionary as needed for `kissim`: 
https://github.com/volkamerlab/kissim/blob/master/kissim/definitions.py

In [12]:
# One (min, max) tuple per moment column, keyed by the moment number.
{moment: tuple(bounds) for moment, bounds in moment_cutoff.items()}

{1: (11.0, 17.0), 2: (2.0, 7.0), 3: (-3.0, 7.0)}