# Extract pocket residue CA atom coordinates

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell execution and code edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from opencadd.databases.klifs import setup_local

from kissim.io import PocketDataFrame
from kissim.encoding import FingerprintGenerator



In [3]:
# `_dh` is IPython's directory history; its last entry is the kernel's current
# working directory, used here as the notebook location.
HERE = Path(_dh[-1])
# Data and results folders, addressed relative to the notebook location.
DATA = HERE / "../../data/"
RESULTS = HERE / "../../results/"

In [4]:
# Set up the local KLIFS session from the primary dataset folder; if that folder
# cannot be found, fall back to the alternative dataset below.
try:
    LOCAL = setup_local(DATA / "external/20210114_KLIFS_HUMAN")
except FileNotFoundError:
    # Use this KLIFS dataset for CI
    LOCAL = setup_local(DATA / "external/20201223_KLIFS_HUMAN_ABL2")

## Load structure KLIFS IDs of interest

Let's load the structure KLIFS IDs of interest from a single-column text file (no header) into a list.

In [5]:
# Read the structure KLIFS IDs from a header-less, single-column text file.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the resulting one-column DataFrame is the supported equivalent.
structure_klifs_ids = (
    pd.read_csv(DATA / "processed/structure_klifs_ids.txt", header=None)
    .squeeze("columns")
    .to_list()
)
print(f"Number of structures: {len(structure_klifs_ids)}")

Number of structures: 4944


## Extract pocket residue CA atom coordinates

In [6]:
def pocket_residue_ca_atom_coordinates(structure_klifs_ids, klifs_session):
    """
    Collect the CA atom coordinates of the pocket residues for a set of structures.

    Parameters
    ----------
    structure_klifs_ids : iterable
        Structure KLIFS IDs to process.
    klifs_session : object
        KLIFS session that is passed on to
        `PocketDataFrame.from_structure_klifs_id` (in this notebook: the local
        session created with `setup_local`).

    Returns
    -------
    pandas.DataFrame
        Columns "atom.x", "atom.y", "atom.z", indexed by
        ("structure.klifs_id", "residue.ix"). Structures for which the pocket is
        `None` are skipped. If no structure yields a pocket (including empty
        input), an empty DataFrame with the same index levels and columns is
        returned.
    """

    index_names = ["structure.klifs_id", "residue.ix"]
    coordinate_columns = ["atom.x", "atom.y", "atom.z"]

    coordinates_per_structure = []

    for structure_klifs_id in structure_klifs_ids:
        pocket = PocketDataFrame.from_structure_klifs_id(structure_klifs_id, klifs_session)
        if pocket is None:
            # No pocket available for this structure; nothing to extract.
            continue
        # Drop residue rows with missing values, then attach the CA atoms via a
        # left merge so that residues without a CA atom keep NaN coordinates.
        ca_atoms = pocket.residues.dropna().merge(pocket.ca_atoms, how="left", on=["residue.id"])
        ca_atoms["structure.klifs_id"] = structure_klifs_id
        coordinates_per_structure.append(ca_atoms.set_index(index_names)[coordinate_columns])

    if not coordinates_per_structure:
        # `pd.concat` raises a ValueError on an empty list; return an empty but
        # correctly labelled frame instead.
        return pd.DataFrame(
            columns=coordinate_columns,
            index=pd.MultiIndex.from_arrays([[], []], names=index_names),
            dtype=float,
        )

    return pd.concat(coordinates_per_structure)

In [7]:
# Slow step: the recorded run below took about 14.5 minutes (wall time).
%time coordinates = pocket_residue_ca_atom_coordinates(structure_klifs_ids, LOCAL)

10437: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
6655: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
1987: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
7362: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
5489: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
5458: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
10737: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
1940: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
3544: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
13041: Local complex.pdb or pocket.pdb file missing: /home/dominique/Documents/GitHub/kissim_app/notebooks/dataset/../../data/external

CPU times: user 14min 29s, sys: 804 ms, total: 14min 30s
Wall time: 14min 30s


## Save coordinates

In [8]:
# Persist the coordinates as a gzip-compressed CSV file; the index is written
# along with the data (pandas default), so it can be restored when loading.
coordinates.to_csv(DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz", compression="gzip")

## Load coordinates

In [9]:
# Read the saved file back in, using the first two columns as the
# (structure.klifs_id, residue.ix) index, and display the result.
coordinates = pd.read_csv(DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz", header=0, index_col=[0, 1])
coordinates

Unnamed: 0_level_0,Unnamed: 1_level_0,atom.x,atom.y,atom.z
structure.klifs_id,residue.ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3835,1,6.056,14.974,51.863
3835,2,6.268,13.247,48.488
3835,3,2.954,13.926,46.613
3835,4,3.490,11.680,43.584
3835,5,6.190,10.086,41.438
...,...,...,...,...
7219,81,2.003,19.768,33.851
7219,82,3.749,21.637,31.096
7219,83,6.670,19.352,30.231
7219,84,5.273,17.350,27.325


In [10]:
# Count the distinct structure KLIFS IDs on the first index level.
print(f"Number of structures: {len(coordinates.index.unique(level='structure.klifs_id'))}")

Number of structures: 4926
