In [1]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically before cells run (mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
import logging
from pathlib import Path

import pandas as pd
from opencadd.databases.klifs import setup_remote, setup_local

from kissim.encoding import Fingerprint, FingerprintGenerator



In [3]:
# Send INFO-level (and above) records from the "opencadd" logger to a stream
# handler so that they appear in the notebook output.
logger_opencadd = logging.getLogger("opencadd")
logger_opencadd.setLevel(logging.INFO)
formatter = logging.Formatter(logging.BASIC_FORMAT)
s_handler = logging.StreamHandler()
s_handler.setFormatter(formatter)
# Loggers live for the whole kernel session, so re-running this cell would
# otherwise stack one more handler per run and print every record repeatedly.
# The handler is tagged by name; any handler carrying the same tag from an
# earlier run is removed before the fresh one is attached.
s_handler.set_name("opencadd-notebook-stream")
for _stale_handler in [
    h for h in logger_opencadd.handlers if h.get_name() == s_handler.get_name()
]:
    logger_opencadd.removeHandler(_stale_handler)
logger_opencadd.addHandler(s_handler)

## Extract new structures

In [4]:
# Two snapshots of KLIFS structure IDs hosted in the opencadd repository; the
# file names carry the dates 20210115 and 20210701.
klifs_ids_urls = (
    "https://github.com/volkamerlab/opencadd/raw/master/opencadd/data/klifs_ids.20210115.csv.gz",
    "https://github.com/volkamerlab/opencadd/raw/master/opencadd/data/klifs_ids.20210701.csv.gz",
)
# First URL -> klifs_ids1, second URL -> klifs_ids2.
klifs_ids1, klifs_ids2 = (pd.read_csv(url) for url in klifs_ids_urls)

In [5]:
# Structure KLIFS IDs that occur in the second snapshot but not in the first.
# The result is sorted explicitly: iteration order of a set is arbitrary, and
# sorting guarantees that diff[0] / diff[-1] are the smallest / largest new ID.
diff = sorted(
    set(klifs_ids2["structure.klifs_id"].to_list())
    - set(klifs_ids1["structure.klifs_id"].to_list())
)

In [6]:
# Report how many structure IDs are contained in `diff`.
print(f"Number of new structures: {len(diff)}")

Number of new structures: 531


In [7]:
# Distinct integers form one gap-free run exactly when their count equals
# max - min + 1. min()/max() are used (instead of the first/last element) so
# the check gives the right answer regardless of how `diff` is ordered.
print("KLIFS ID numbering consecutive?")
len(diff) == (max(diff) - min(diff) + 1)

KLIFS ID numbering consecutive?


True

In [8]:
# min() yields the smallest new ID independent of how `diff` is ordered.
print(f"Structures with IDs from {min(diff)} are new.")

Structures with IDs from 13232 are new.


## Problematic structures

In [9]:
# Structure KLIFS IDs under investigation, split into two groups ("io" and
# "encoding") and then combined into one list, "io" IDs first.
structure_klifs_ids_encoding = [13556]
structure_klifs_ids_io = [1243, 13623, 13625, 13626, 13624]
structure_klifs_ids = [*structure_klifs_ids_io, *structure_klifs_ids_encoding]

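Why is 1243 a problem now? Compare its entry in the two KLIFS ID snapshots below: the kinase annotation differs (PDGFRb, kinase KLIFS ID 453, in the first vs. FMS, kinase KLIFS ID 449, in the second).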

In [10]:
# Row(s) of the first snapshot whose structure KLIFS ID is 1243.
klifs_ids1.loc[klifs_ids1["structure.klifs_id"].eq(1243)]

Unnamed: 0,structure.klifs_id,structure.pdb_id,structure.alternate_model,structure.chain,kinase.klifs_name,kinase.klifs_id,ligand.expo_id
1219,1243,2ogv,-,A,PDGFRb,453,-


In [11]:
# Row(s) of the second snapshot whose structure KLIFS ID is 1243.
klifs_ids2.loc[klifs_ids2["structure.klifs_id"].eq(1243)]

Unnamed: 0,structure.klifs_id,structure.pdb_id,structure.alternate_model,structure.chain,kinase.klifs_name,kinase.klifs_id,ligand.expo_id
1219,1243,2ogv,-,A,FMS,449,-


## Local session

In [12]:
# Folder handed to opencadd's `setup_local` to create the local KLIFS session.
# NOTE(review): this is a machine-specific absolute path; it has to be adapted
# before the notebook can run elsewhere.
klifs_download_path = (
    "/home/dominique/Documents/GitHub/kissim_app/data/external/structures/20210630_KLIFS_HUMAN/"
)
local = setup_local(klifs_download_path)

INFO:opencadd.databases.klifs.api:Set up local session...
INFO:opencadd.databases.klifs.local:Load overview.csv...
INFO:opencadd.databases.klifs.local:Load KLIFS_export.csv...
INFO:opencadd.databases.klifs.local:Merge both csv files...
INFO:opencadd.databases.klifs.local:Add paths to coordinate folders to structures...
INFO:opencadd.databases.klifs.local:Add KLIFS IDs to structures (uses remote since not available locally!)...
INFO:opencadd.databases.klifs.api:Local session is ready!


In [13]:
# Generate fingerprints for the structure KLIFS IDs collected above, reading
# the structures through the local KLIFS session.
f_local = FingerprintGenerator.from_structure_klifs_ids(
    structure_klifs_ids, klifs_session=local
)

The following structure could not be loaded into kissim: 1243: Length of values (1284) does not match length of index (1313)
1243: Empty fingerprint (data unaccessible).
The following structure could not be loaded into kissim: 13623: Length of values (991) does not match length of index (1461)
13623: Empty fingerprint (data unaccessible).
The following structure could not be loaded into kissim: 13625: Length of values (991) does not match length of index (1461)
13625: Empty fingerprint (data unaccessible).
The following structure could not be loaded into kissim: 13626: Length of values (562) does not match length of index (1367)
13626: Empty fingerprint (data unaccessible).
The following structure could not be loaded into kissim: 13624: Length of values (991) does not match length of index (1461)
13624: Empty fingerprint (data unaccessible).
INFO:opencadd.structure.pocket.base:Pocket None (set pocket residues): The following input residues PDB IDs were assigned to the value None becaus

In [14]:
# Inspect which fingerprints were generated with the local session.
f_local.data

{}

## Remote session

In [15]:
# Create the remote KLIFS session via opencadd's `setup_remote`.
remote = setup_remote()

INFO:opencadd.databases.klifs.api:Set up remote session...
INFO:opencadd.databases.klifs.api:Remote session is ready!


In [16]:
# Generate fingerprints for the same structure KLIFS IDs, this time reading
# the structures through the remote KLIFS session.
f_remote = FingerprintGenerator.from_structure_klifs_ids(
    structure_klifs_ids, klifs_session=remote
)

INFO:opencadd.structure.pocket.base:Pocket None (set pocket residues): The following input residues PDB IDs were assigned to the value None because they cannot be cast to an integer (residue PDB ID, residue index): [('_', 6), ('_', 7), ('_', 8), ('_', 9), ('_', 83), ('_', 84), ('_', 85)]
INFO:opencadd.structure.pocket.base:Pocket 13556 (set pocket residues): The following input residues PDB IDs were assigned to the value None because they cannot be cast to an integer (residue PDB ID, residue index): [(None, 6), (None, 7), (None, 8), (None, 9), (None, 83), (None, 84), (None, 85)]
INFO:opencadd.structure.pocket.core:Pocket 13556: Missing pocket CA atoms. The pocket center is calculated based on 79 CA atoms (total number of pocket residues is 85).
Fingerprint generation throw error for 13556:  '2 residues were found, but must be 1 or 0.'


In [17]:
# Inspect which fingerprints were generated with the remote session.
f_remote.data

{1243: <kissim.encoding.fingerprint.Fingerprint at 0x7fb1fab3e040>,
 13623: <kissim.encoding.fingerprint.Fingerprint at 0x7fb1fb5ee580>,
 13625: <kissim.encoding.fingerprint.Fingerprint at 0x7fb1fb851c70>,
 13626: <kissim.encoding.fingerprint.Fingerprint at 0x7fb1fb615700>,
 13624: <kissim.encoding.fingerprint.Fingerprint at 0x7fb1fb851b20>}