# Extract pocket residue CA atom coordinates

We extract the coordinates for all pockets' residue CA atoms to be used in other notebooks.

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from opencadd.databases.klifs import setup_local
from kissim.io import PocketDataFrame

from src.paths import PATH_DATA, PATH_RESULTS, PATH_DATA_KLIFS_DOWNLOAD, PATH_DATA_KLIFS_DOWNLOAD_TEST



In [3]:
# `_dh` is IPython's directory history; its last entry is presumably the
# directory this notebook is executed from - TODO confirm.
HERE = Path(_dh[-1])  # noqa: F821
# Data directory as configured in `src.paths`.
DATA = PATH_DATA
# Results go to the `all` subfolder of the configured results directory.
RESULTS = PATH_RESULTS / "all"

In [4]:
# Set up a local KLIFS session from the downloaded dataset; if that location
# cannot be found (FileNotFoundError), fall back to the test dataset.
try:
    LOCAL = setup_local(PATH_DATA_KLIFS_DOWNLOAD)
except FileNotFoundError:
    # Use this KLIFS dataset for CI
    LOCAL = setup_local(PATH_DATA_KLIFS_DOWNLOAD_TEST)

# NOTE(review): `_database` is a private attribute of the session object;
# presumably it holds one entry per local structure - confirm against opencadd.
print(f"Number of structures: {len(LOCAL._database)}")

Local structure is not part of SerializableSwaggerClient(https://dev.klifs.net/api_v2) (yet? any more?): 7ree-A-A
Expected type to be dict for value [400, 'KLIFS error: An unknown ligand ID was provided'] to unmarshal to a <class 'abc.Error'>.Was <class 'list'> instead.
Local structure is not part of SerializableSwaggerClient(https://dev.klifs.net/api_v2) (yet? any more?): 7ree-B-A
Expected type to be dict for value [400, 'KLIFS error: An unknown ligand ID was provided'] to unmarshal to a <class 'abc.Error'>.Was <class 'list'> instead.
Local structure is not part of SerializableSwaggerClient(https://dev.klifs.net/api_v2) (yet? any more?): 7n3u---A
Expected type to be dict for value [400, 'KLIFS error: An unknown ligand ID was provided'] to unmarshal to a <class 'abc.Error'>.Was <class 'list'> instead.
Local structure is not part of SerializableSwaggerClient(https://dev.klifs.net/api_v2) (yet? any more?): 6w9e-B-A
Expected type to be dict for value [400, 'KLIFS error: An unknown ligand 

Number of structures: 11795


## Load structure KLIFS IDs of interest

Let's load the structure KLIFS IDs of interest from a text file (one ID per line).

In [5]:
# Read the single-column, header-less ID file and flatten it to a plain list.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# `.squeeze("columns")` is the documented replacement and turns the one-column
# DataFrame into a Series (also when the file contains only a single row).
structure_klifs_ids = (
    pd.read_csv(DATA / "processed/structure_klifs_ids_all.txt", header=None)
    .squeeze("columns")
    .to_list()
)
print(f"Number of structures: {len(structure_klifs_ids)}")

Number of structures: 4685


## Extract pocket residue CA atom coordinates

In [6]:
def pocket_residue_ca_atom_coordinates(structure_klifs_ids, klifs_session):
    """
    Collect the CA atom coordinates of the pocket residues for a set of structures.

    Parameters
    ----------
    structure_klifs_ids : iterable
        Structure KLIFS IDs to process.
    klifs_session
        Session object passed on to `PocketDataFrame.from_structure_klifs_id`
        (in this notebook: the local KLIFS session).

    Returns
    -------
    pandas.DataFrame
        Columns "atom.x", "atom.y", "atom.z", indexed by
        ("structure.klifs_id", "residue.ix").
        Structures for which no pocket could be created (`None`) are skipped.
        If no structure yields a pocket (including empty input), an empty
        DataFrame with the same columns and index names is returned.
    """

    index_names = ["structure.klifs_id", "residue.ix"]
    coordinate_columns = ["atom.x", "atom.y", "atom.z"]

    coordinates = []

    for structure_klifs_id in structure_klifs_ids:
        pocket = PocketDataFrame.from_structure_klifs_id(structure_klifs_id, klifs_session)
        if pocket is None:
            # No pocket available for this structure; skip it instead of failing.
            continue
        # Keep only residue rows without missing values; the left merge keeps
        # every such residue even if it has no matching CA atom (NaN coordinates).
        ca_atoms = pocket.residues.dropna().merge(
            pocket.ca_atoms, how="left", on=["residue.id"]
        )
        ca_atoms["structure.klifs_id"] = structure_klifs_id
        coordinates.append(ca_atoms.set_index(index_names)[coordinate_columns])

    if not coordinates:
        # `pd.concat` raises ValueError on an empty list; return an empty frame
        # with the expected layout so downstream code can handle it uniformly.
        return pd.DataFrame(
            columns=coordinate_columns,
            index=pd.MultiIndex.from_arrays([[], []], names=index_names),
            dtype=float,
        )

    return pd.concat(coordinates)

In [7]:
# Long-running step: about 15 minutes wall time in the recorded run.
%time coordinates = pocket_residue_ca_atom_coordinates(structure_klifs_ids, LOCAL)  # noqa: E501

13042: Local complex.pdb or pocket.pdb file missing: /home/dominique/Documents/GitHub/kissim_app/src/../data/external/structures/20210902_KLIFS_HUMAN/HUMAN/GPRK5/6pjx_altA_chainA/complex.pdb


CPU times: user 15min 30s, sys: 1.01 s, total: 15min 31s
Wall time: 15min 34s


## Save coordinates

In [8]:
# Persist the coordinates (including the two index levels) as gzip-compressed CSV.
coordinates.to_csv(  # noqa: F821
    path_or_buf=DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz",
    compression="gzip",
)

## Load coordinates

In [9]:
# Read the saved coordinates back; the first two CSV columns form the
# row MultiIndex and the first row holds the column names.
coordinates = pd.read_csv(
    filepath_or_buffer=DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz",
    index_col=[0, 1],
    header=0,
)
coordinates

Unnamed: 0_level_0,Unnamed: 1_level_0,atom.x,atom.y,atom.z
structure.klifs_id,residue.ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,8.332,18.829,52.312
1,2,9.247,16.975,49.126
1,3,6.166,15.784,47.231
1,4,7.642,14.754,43.861
1,5,10.860,13.971,41.981
...,...,...,...,...
13853,81,1.884,18.534,33.726
13853,82,2.756,20.901,30.782
13853,83,5.668,18.490,29.982
13853,84,4.271,17.749,26.447


In [10]:
# Count the distinct structure KLIFS IDs present in the first index level.
n_structures = len(coordinates.index.unique(level="structure.klifs_id"))
print(f"Number of structures: {n_structures}")

Number of structures: 4684
