# Subsetting fingerprints

In [1]:
import copy
from pathlib import Path

import numpy as np

from kissim.encoding import FingerprintGenerator
from kissim.comparison import FingerprintDistanceGenerator



In [2]:
# Notebook location, taken from the last entry of `_dh` (a name provided by
# the IPython kernel, hence the flake8 F821 suppression).
HERE = Path(_dh[-1])  # noqa: F821
# Folder holding the input fingerprints, addressed relative to the notebook.
RESULTS = HERE.joinpath("../../results/dfg_out")

## Residue subsets

In [3]:
# Residue positions to keep, per named subset. Positions are 1-based: they are
# shifted by -1 before being used as array indices in
# `subset_fingerprint_generator`.
selected_residue_ixs = {
    "dfg_all": [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28,
        31, 35, 36, 37, 38, 41,
        43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
        59, 60, 61, 64, 66, 67, 68, 69, 70, 72,
        74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85,
    ],
    "dfg_in": [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        15, 16, 17, 19, 20, 21, 24, 25, 27, 28,
        36, 37, 38,
        43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55,
        59, 70, 72,
        74, 75, 76, 77, 80, 81, 82, 83, 84, 85,
    ],
    "dfg_out": [
        3, 4, 5, 6, 7, 8, 9, 11, 12, 13,
        15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28,
        31, 35, 36, 37, 38,
        43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55,
        60, 61, 64, 66, 67, 68, 69, 70,
        74, 75, 77, 79, 80, 81, 82, 83, 84, 85,
    ],
}

In [4]:
# Number of selected residues in each subset.
{subset_name: len(residue_ixs) for subset_name, residue_ixs in selected_residue_ixs.items()}

{'dfg_all': 65, 'dfg_in': 51, 'dfg_out': 56}

## Load fingerprints

In [5]:
# Read the fingerprints stored as JSON in the results folder.
fingerprints = FingerprintGenerator.from_json(RESULTS / "fingerprints.json")
# Sanity check: how many fingerprints were loaded ...
print(len(fingerprints.data))
# ... and which distinct fingerprint lengths occur among them.
print({len(fp.values_array()) for fp in fingerprints.data.values()})

484
{1032}


## `kissim` pipeline with subset fingerprints

In [6]:
def subset_fingerprint_generator(fingerprints, selected_residue_ixs):
    """
    Return a copy of a fingerprint generator restricted to selected residues.

    For every fingerprint in ``fingerprints.data``, all features under the
    top-level key "physicochemical" and all features under a second-level key
    "distances" are reduced to the selected residue positions. Features under
    any other second-level key are kept at full length (converted to
    ``np.array``). ``residue_ids`` and ``residue_ixs`` of each fingerprint are
    reduced in the same way.

    The input is NOT modified: the returned generator and the fingerprints it
    holds are shallow copies that carry the new values. This makes the function
    safe to call repeatedly on the same input, e.g. once per residue subset.

    Parameters
    ----------
    fingerprints : kissim.encoding.FingerprintGenerator
        Object whose ``data`` attribute maps an ID to a fingerprint exposing
        ``values_dict``, ``residue_ids`` and ``residue_ixs``.
    selected_residue_ixs : list of int
        1-based residue positions to keep (position 1 is the first entry of
        each per-residue feature list).

    Returns
    -------
    Same type as ``fingerprints``
        Shallow copy of ``fingerprints`` whose ``data`` holds the subset
        fingerprints. Attributes other than ``data`` are shared with the input.

    Raises
    ------
    ValueError
        If any position is smaller than 1. After the shift to 0-based indexing
        such a position would become a negative index and silently select a
        residue from the end.
    IndexError
        Raised by NumPy if a position exceeds the number of residues.
    """

    positions = list(selected_residue_ixs)
    invalid_positions = [i for i in positions if i < 1]
    if invalid_positions:
        raise ValueError(
            f"Residue positions are 1-based and must be >= 1, got: {invalid_positions}"
        )
    # Shift from 1-based residue positions to 0-based array indices.
    zero_based_ixs = [i - 1 for i in positions]

    def _select(values):
        """Pick the selected residues from a per-residue sequence, as a list."""
        return np.array(values)[zero_based_ixs].tolist()

    # Shallow-copy the container and give it its own `data` dict, so that the
    # caller's generator keeps its full-length fingerprints.
    fingerprints_subset = copy.copy(fingerprints)
    fingerprints_subset.data = {}

    for id_, fp in fingerprints.data.items():
        values_subset = {}
        for group_name, group in fp.values_dict.items():
            if group_name == "physicochemical":
                # Two-level group: feature name -> per-residue values.
                values_subset[group_name] = {
                    feature_name: _select(values) for feature_name, values in group.items()
                }
                continue

            # Three-level group: subgroup name -> feature name -> values.
            values_subset[group_name] = {}
            for subgroup_name, subgroup in group.items():
                if subgroup_name == "distances":
                    values_subset[group_name][subgroup_name] = {
                        feature_name: _select(values)
                        for feature_name, values in subgroup.items()
                    }
                else:
                    # Not subset by residue; kept in full.
                    # NOTE(review): stored as np.array while the subset features
                    # are plain lists - confirm downstream code accepts both.
                    values_subset[group_name][subgroup_name] = {
                        feature_name: np.array(values)
                        for feature_name, values in subgroup.items()
                    }

        # Copy the fingerprint before overwriting its attributes; the original
        # fingerprint object stays untouched.
        fp_subset = copy.copy(fp)
        fp_subset.values_dict = values_subset
        fp_subset.residue_ids = _select(fp.residue_ids)
        fp_subset.residue_ixs = _select(fp.residue_ixs)

        fingerprints_subset.data[id_] = fp_subset

    return fingerprints_subset

In [7]:
# Restrict the fingerprints to the residues listed under "dfg_in".
fps_subset = subset_fingerprint_generator(fingerprints, selected_residue_ixs["dfg_in"])
# Sanity check: number of fingerprints ...
print(len(fps_subset.data))
# ... and the distinct fingerprint lengths after subsetting.
print({len(fp.values_array()) for fp in fps_subset.data.values()})

484
{624}
