# Test dataset

This notebook prepares the test dataset that is used to run the notebooks on the CI.

In [1]:
# Reload imported modules automatically before executing code
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import csv
import shutil

import pandas as pd
from opencadd.databases.klifs import setup_remote, setup_local

from src.paths import PATH_DATA_KLIFS_DOWNLOAD, PATH_DATA



Folder does not exist: /home/dominique/Documents/GitHub/kissim_app/src/../results
Use test folder instead: /home/dominique/Documents/GitHub/kissim_app/src/../test/results


In [3]:
# Base directory for all files written by this notebook; `_dh` is IPython's
# directory history, its last entry is the kernel's current working directory
HERE = Path(_dh[-1])  # noqa: F821

## Prepare KLIFS test dataset

Use the KLIFS reference structures, a few randomly sampled DFG-out structures, and one DFG-in structure for each of a few additional kinases.

In [4]:
# Load the table of KLIFS reference structures (semicolon-separated file)
reference_structures_path = PATH_DATA / "external/structures/klifs_reference_structures.csv"
klifs_reference_structures = pd.read_csv(reference_structures_path, sep=";")

In [5]:
# Set up a KLIFS session that works on the local KLIFS download folder
session = setup_local(PATH_DATA_KLIFS_DOWNLOAD)

In [6]:
# Get KLIFS reference structures
# Prepare the lookup keys: a blank alternate model is encoded as "-" and PDB IDs are
# lowercased (presumably the conventions of the local KLIFS session - confirm)
klifs_reference_structures.loc[
    klifs_reference_structures["structure.alternate_model"] == " ",
    "structure.alternate_model",
] = "-"
klifs_reference_structures["structure.pdb_id"] = klifs_reference_structures[
    "structure.pdb_id"
].str.lower()

# Look them up in local KLIFS download (contains filepaths)
# Blank alternate models have been replaced by "-" above, so the alternate model is
# always passed explicitly; a `None` fallback for " " would never be reached
structures_selected = pd.concat(
    [
        session.structures.by_structure_pdb_id(
            row["structure.pdb_id"],
            row["structure.alternate_model"],
            row["structure.chain"],
        )
        for _, row in klifs_reference_structures.iterrows()
    ]
)

In [7]:
# Add some randomly sampled DFG-out structures
# A fixed seed makes the selection (and thus the test dataset) reproducible on re-runs
structures = session.structures.all_structures()
dfg_out_structures = structures[structures["structure.dfg"] == "out"]
structures_selected = pd.concat(
    [
        structures_selected,
        dfg_out_structures.sample(5, random_state=42),
        # ABL1 is currently not added explicitly:
        # structures[structures["kinase.klifs_name"] == "ABL1"].sample(1)
    ]
)

In [8]:
# Add more kinases: one DFG-in structure per kinase
structures = session.structures.by_kinase_name(
    ["ErbB2", "p110a", "MET", "SLK", "KDR", "BRAF", "p38a", "LOK", "GAK"]  # , "ABL1"
)
structures = structures[structures["structure.dfg"] == "in"]
# Keep the first row of each kinase unchanged. `groupby(...).first()` would instead
# take the first non-null value of every column, which can combine values that
# belong to different structures into one row.
# The stable sort keeps the rows ordered by kinase name, as a groupby would.
structures = (
    structures.sort_values("kinase.klifs_name", kind="mergesort")
    .drop_duplicates(subset="kinase.klifs_name", keep="first")
    .reset_index(drop=True)
)
structures_selected = pd.concat([structures_selected, structures])

#### Write test `KLIFS_export.csv`

In [9]:
# Reduce the full KLIFS export table to the selected structures
klifs_export = pd.read_csv(PATH_DATA_KLIFS_DOWNLOAD / "KLIFS_export.csv")
print(klifs_export.shape)
export_keys = ["PDB", "CHAIN", "ALTERNATE_MODEL"]
structure_keys = ["structure.pdb_id", "structure.chain", "structure.alternate_model"]
klifs_export_merged = klifs_export.merge(
    structures_selected, how="inner", left_on=export_keys, right_on=structure_keys
)
# Keep only the columns of the original export table
klifs_export_selected = klifs_export_merged[klifs_export.columns]
klifs_export_selected.shape

(11806, 13)


(38, 13)

In [10]:
# Write the reduced export table to the test folder, quoting every field
klifs_export_test_path = HERE / "data/external/klifs_test/KLIFS_export.csv"
klifs_export_selected.to_csv(klifs_export_test_path, index=None, quoting=csv.QUOTE_ALL)

#### Write test `overview.csv`

In [11]:
# Encode a missing alternate model as " " instead of "-" (modifies the table in place)
# NOTE(review): presumably required to match the `alt` column of overview.csv - confirm
has_no_alternate_model = structures_selected["structure.alternate_model"] == "-"
structures_selected.loc[has_no_alternate_model, "structure.alternate_model"] = " "

In [12]:
# Reduce the full overview table to the selected structures
overview = pd.read_csv(PATH_DATA_KLIFS_DOWNLOAD / "overview.csv")
print(overview.shape)
overview_keys = ["pdb", "chain", "alt"]
structure_keys = ["structure.pdb_id", "structure.chain", "structure.alternate_model"]
overview_merged = overview.merge(
    structures_selected, how="inner", left_on=overview_keys, right_on=structure_keys
)
# Keep only the columns of the original overview table
overview_selected = overview_merged[overview.columns]
overview_selected.shape

(11806, 27)


(38, 27)

In [13]:
# Write the reduced overview table to the test folder
overview_test_path = HERE / "data/external/klifs_test/overview.csv"
overview_selected.to_csv(overview_test_path, index=None)

#### Copy test PDB files

In [14]:
# Copy the folder of each selected structure from the KLIFS download to the test folder
for folder in structures_selected["structure.filepath"]:
    destination = HERE / "data/external/klifs_test" / folder
    # `shutil.copytree` raises if the destination already exists; skip folders that
    # were copied before (notebook re-run or structure selected more than once)
    if destination.exists():
        continue
    shutil.copytree(src=PATH_DATA_KLIFS_DOWNLOAD / folder, dst=destination)

#### Check KLIFS test folder

In [15]:
# Check that a local KLIFS session can be set up from the test folder
setup_local(HERE / "data/external/klifs_test")

<opencadd.databases.klifs.session.Session at 0x7f90438a76a0>

#### Write test structure KLIFS IDs files

In [16]:
# Write the KLIFS IDs of all selected structures (no index, no header)
structures_selected["structure.klifs_id"].to_csv(
    HERE / "data/processed/structure_klifs_ids_all.txt", index=False, header=False
)

In [17]:
# Write one file of structure KLIFS IDs per DFG conformation (no index, no header)
for dfg in ["in", "out"]:
    has_dfg = structures_selected["structure.dfg"] == dfg
    klifs_ids = structures_selected.loc[has_dfg, "structure.klifs_id"]
    klifs_ids.to_csv(
        HERE / f"data/processed/structure_klifs_ids_dfg_{dfg}.txt", index=None, header=None
    )

## Generate `kissim` fingerprints and distances

### All

In [18]:
# Generate fingerprints for the structure subset `all` from the `klifs_test` folder
# (script arguments: structure subset, KLIFS download folder, normalization, residue subset)
!bash src/kissim_encode_compare.sh all klifs_test normalized full

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: all
KLIFS download folder: klifs_test
Normalized? normalized
Residue subset? full
INFO:kissim.encoding.fingerprint_generator:GENERATE FINGERPRINTS
INFO:kissim.encoding.fingerprint_generator:Number of input structures: 38
INFO:kissim.encoding.fingerprint_generator:Fingerprint generation started at: 2021-11-22 13:12:41.905818
INFO:kissim.utils:Number of cores used: 8.
INFO:kissim.encoding.fingerprint_generator:3180: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:2696: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:1894: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:4554: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:2228: Generate fingerprint..

In [19]:
# Generate weighted fingerprint distances from the feature distances of the subset `all`
!bash src/kissim_weights.sh all

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: all
INFO:kissim.api.weights:Read feature distances from /home/dominique/Documents/GitHub/kissim_app/test/results/all/feature_distances.csv.bz2...
INFO:kissim.comparison.fingerprint_distance_generator:GENERATE FINGERPRINT DISTANCES
INFO:kissim.comparison.fingerprint_distance_generator:Fingerprint distance generation started at: 2021-11-22 13:13:26.357916
INFO:kissim.comparison.fingerprint_distance_generator:Feature weights: [0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.    0.    0.    0.
 0.    0.    0.   ]
Calculate pairwise fingerprint distance: 100%|█| 703/703 [00:00<00:00, 29447.97i
Calculate pairwise fingerprint coverage: 100%|█| 703/703 [00:00<00:00, 31958.59i
INFO:kissim.comparison.fingerprint_distance_generator

### DFG-in

In [20]:
# Generate fingerprints for the structure subset `dfg_in` from the `klifs_test` folder
# (script arguments: structure subset, KLIFS download folder, normalization, residue subset)
!bash src/kissim_encode_compare.sh dfg_in klifs_test normalized full

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: dfg_in
KLIFS download folder: klifs_test
Normalized? normalized
Residue subset? full
INFO:kissim.encoding.fingerprint_generator:GENERATE FINGERPRINTS
INFO:kissim.encoding.fingerprint_generator:Number of input structures: 32
INFO:kissim.encoding.fingerprint_generator:Fingerprint generation started at: 2021-11-22 13:13:41.568981
INFO:kissim.utils:Number of cores used: 8.
INFO:kissim.encoding.fingerprint_generator:2696: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:1912: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:3180: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:2267: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:5616: Generate fingerprin

In [21]:
# Generate weighted fingerprint distances from the feature distances of the subset `dfg_in`
!bash src/kissim_weights.sh dfg_in

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: dfg_in
INFO:kissim.api.weights:Read feature distances from /home/dominique/Documents/GitHub/kissim_app/test/results/dfg_in/feature_distances.csv.bz2...
INFO:kissim.comparison.fingerprint_distance_generator:GENERATE FINGERPRINT DISTANCES
INFO:kissim.comparison.fingerprint_distance_generator:Fingerprint distance generation started at: 2021-11-22 13:14:21.348687
INFO:kissim.comparison.fingerprint_distance_generator:Feature weights: [0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.    0.    0.    0.
 0.    0.    0.   ]
Calculate pairwise fingerprint distance: 100%|█| 496/496 [00:00<00:00, 29119.77i
Calculate pairwise fingerprint coverage: 100%|█| 496/496 [00:00<00:00, 27277.16i
INFO:kissim.comparison.fingerprint_distance_gen

### DFG-out

In [22]:
# Generate fingerprints for the structure subset `dfg_out` from the `klifs_test` folder
# (script arguments: structure subset, KLIFS download folder, normalization, residue subset)
!bash src/kissim_encode_compare.sh dfg_out klifs_test normalized full

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: dfg_out
KLIFS download folder: klifs_test
Normalized? normalized
Residue subset? full
INFO:kissim.encoding.fingerprint_generator:GENERATE FINGERPRINTS
INFO:kissim.encoding.fingerprint_generator:Number of input structures: 5
INFO:kissim.encoding.fingerprint_generator:Fingerprint generation started at: 2021-11-22 13:14:36.926368
INFO:kissim.utils:Number of cores used: 8.
INFO:kissim.encoding.fingerprint_generator:11824: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:7569: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:3599: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:248: Generate fingerprint...
INFO:kissim.encoding.fingerprint_generator:10006: Generate fingerpri

In [23]:
# Generate weighted fingerprint distances from the feature distances of the subset `dfg_out`
!bash src/kissim_weights.sh dfg_out

# packages in environment at /home/dominique/.local/miniconda/envs/kissim-app-dev:
#
# Name                    Version                   Build  Channel
kissim                    0.4.1+9.g378e5b5           dev_0    <develop>
Job settings
------------
Structure subset: dfg_out
INFO:kissim.api.weights:Read feature distances from /home/dominique/Documents/GitHub/kissim_app/test/results/dfg_out/feature_distances.csv.bz2...
INFO:kissim.comparison.fingerprint_distance_generator:GENERATE FINGERPRINT DISTANCES
INFO:kissim.comparison.fingerprint_distance_generator:Fingerprint distance generation started at: 2021-11-22 13:14:49.299712
INFO:kissim.comparison.fingerprint_distance_generator:Feature weights: [0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.    0.    0.    0.
 0.    0.    0.   ]
Calculate pairwise fingerprint distance: 100%|█| 10/10 [00:00<00:00, 23523.86it/
Calculate pairwise fingerprint coverage: 100%|█| 10/10 [00:00<00:00, 25842.91it/
INFO:kissim.comparison.fingerprint_distance_g