# `kissim`-based kinome tree

We generate `kissim`-based kinome matrices and trees based on four different parameters:

- `kissim` runs: With/without charged-THR bug, different KLIFS datasets
- Feature weighting schemes
- DFG conformations
  - Kinase matrix based on all structures/fingerprints (**DFG-in and DFG-out**)
  - Kinase matrix based on structures/fingerprints in **DFG-in** conformation only
  - Kinase matrix based on structures/fingerprints in **DFG-out** conformation only
- Clustering methods

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from Bio import Phylo
from opencadd.databases.klifs import setup_remote
from IPython.display import display, Markdown

from kissim.comparison import FingerprintDistanceGenerator
from kissim.comparison import matrix, tree



In [3]:
# Set the default matplotlib font size to 6 for all figures created in this notebook
matplotlib.rc("font", size=6)

In [4]:
# Set path to this notebook
# NOTE(review): `_dh` is not defined in plain Python (hence the noqa); presumably it is
# IPython's directory history whose last entry is the notebook's working directory - confirm.
HERE = Path(_dh[-1])  # noqa: F821
# Root folder of the archived results; each run is read from/written to `RESULTS / run_id`
RESULTS = HERE / "../../results_archive"

In [5]:
# Set bit coverage between fingerprint pairs
# (passed as `coverage_min` to `matrix.kinase_distance_matrix` and part of the output file names)
COVERAGE_MIN = 0.8
# Weighting scheme identifiers; "15" is read from `fingerprint_distances.csv`, every other
# scheme from `fingerprint_distances_<scheme>.csv`
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
# DFG subsets: "in"/"out" restrict structure pairs to that conformation, "all" applies no filter
DFG_CONFORMATIONS = ["all", "in", "out"]
# Values passed as `clustering_method` to `tree.from_distance_matrix`
CLUSTERING_METHODS = ["ward", "average", "weighted"]
# Figure height per DFG subset, used as `figsize=(5, FIG_HEIGHT[dfg])`
FIG_HEIGHT = {"all": 25, "in": 25, "out": 10}

## Kinase-color mapping

In [6]:
# Fetch all human kinases from KLIFS and keep only their name and kinase group
klifs_session = setup_remote()
kinases = klifs_session.kinases.all_kinases(species="Human")
kinases = klifs_session.kinases.by_kinase_klifs_id(kinases["kinase.klifs_id"].to_list())
# Explicit copy: a column is added below, which must not be written into a column subset
# of the original frame (avoids pandas' chained-assignment warning)
kinases = kinases[["kinase.klifs_name", "kinase.group"]].copy()
kinase_groups = kinases["kinase.group"].sort_values().unique()
colors = [
    "red",
    "orange",
    "yellowgreen",
    "limegreen",
    "turquoise",
    "cornflowerblue",
    "grey",
    "mediumblue",
    "darkorchid",
    "violet",
]
# zip() stops at the shorter input: groups without a color would be dropped silently and
# only show up later as a bare KeyError, so fail early with a clear message instead
if len(kinase_groups) > len(colors):
    raise ValueError(
        f"Found {len(kinase_groups)} kinase groups but only {len(colors)} colors are defined."
    )
# Groups are sorted alphabetically, i.e. the i-th group gets the i-th color
groups_to_color = dict(zip(kinase_groups, colors))
kinases["color"] = kinases["kinase.group"].apply(lambda x: groups_to_color[x])
# Kinase name -> color; used as `label_colors` when drawing trees
label_colors = kinases.set_index("kinase.klifs_name")["color"].to_dict()
LABEL_COLORS = label_colors

## Helper functions

In [7]:
def load_fingerprint_distances(path_results, weighting):
    """
    Read the fingerprint distances of one weighting scheme (all conformations).

    Parameters
    ----------
    path_results : pathlib.Path
        Folder that contains the fingerprint distances CSV file(s).
    weighting : str
        Weighting scheme identifier. Scheme "15" is read from `fingerprint_distances.csv`,
        every other scheme from `fingerprint_distances_<weighting>.csv`.

    Returns
    -------
    object
        The `data` attribute of the loaded `FingerprintDistanceGenerator`
        (presumably a pandas.DataFrame - TODO confirm).
    """

    filename = (
        "fingerprint_distances.csv"
        if weighting == "15"
        else f"fingerprint_distances_{weighting}.csv"
    )
    generator = FingerprintDistanceGenerator.from_csv(path_results / filename)
    return generator.data

In [8]:
def add_dfg_column(fingerprint_distances_df):
    """
    Annotate each structure pair with the DFG conformation that both structures share.

    The DFG conformation of every structure is fetched from a remote KLIFS session. The
    column "dfg" is added to the input DataFrame itself (the input is modified in place).
    The number of distinct structures is printed.

    Parameters
    ----------
    fingerprint_distances_df : pandas.DataFrame
        Fingerprint distances; must contain the columns "structure.1" and "structure.2"
        holding structure KLIFS IDs.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame (same object) with the additional column "dfg", which holds the
        DFG conformation if both structures of a pair have the same one, else None.
    """

    def _shared_dfg(id_a, id_b, dfg_by_id):
        """
        Return the DFG conformation of a structure pair if both structures share it.

        Parameters
        ----------
        id_a : int
            Structure KLIFS ID.
        id_b : int
            Structure KLIFS ID.
        dfg_by_id : dict
            DFG conformations (values) for structure KLIFS IDs (keys).

        Returns
        -------
        str or None
            Shared DFG conformation, or None if the two conformations differ.
        """
        dfg_a = dfg_by_id[id_a]
        dfg_b = dfg_by_id[id_b]
        return dfg_a if dfg_a == dfg_b else None

    # Collect the distinct structure KLIFS IDs from both pair columns
    all_ids = pd.concat(
        [fingerprint_distances_df[column] for column in ("structure.1", "structure.2")]
    )
    unique_ids = all_ids.unique().tolist()
    print(f"Number of structures: {len(unique_ids)}")

    # Look up the DFG conformation of each structure in KLIFS
    session = setup_remote()
    structures = session.structures.by_structure_klifs_id(unique_ids)
    dfg_by_id = structures.set_index("structure.klifs_id")["structure.dfg"].to_dict()

    def _row_to_dfg(row):
        """Map one row (structure pair) to its shared DFG conformation."""
        return _shared_dfg(row["structure.1"], row["structure.2"], dfg_by_id)

    # Row-wise annotation of all structure pairs
    fingerprint_distances_df["dfg"] = fingerprint_distances_df.apply(_row_to_dfg, axis=1)
    return fingerprint_distances_df

In [9]:
def generate_trees(
    run_id,
    path_results=RESULTS,
    weighting_schemes=WEIGHTING_SCHEMES,
    dfg_conformations=DFG_CONFORMATIONS,
    clustering_methods=CLUSTERING_METHODS,
    coverage_min=COVERAGE_MIN,
):
    """
    Generate kinase matrices and trees for one `kissim` run.

    For each combination of weighting scheme and DFG conformation, one kinase distance matrix
    is written to `<path_results>/<run_id>/matrices/` and one tree per clustering method is
    written to `<path_results>/<run_id>/trees/`.

    Parameters
    ----------
    run_id : str
        Name of the run folder inside `path_results`.
    path_results : pathlib.Path
        Root folder of all runs.
    weighting_schemes : list of str
        Weighting schemes whose fingerprint distances are loaded.
    dfg_conformations : list of str
        "in"/"out" keep only structure pairs annotated with that DFG conformation; any other
        value (e.g. "all") applies no filter.
    clustering_methods : list of str
        Passed one by one as `clustering_method` to `tree.from_distance_matrix`.
    coverage_min : float
        Passed as `coverage_min` to `matrix.kinase_distance_matrix`; also part of the output
        file names.
    """

    def _generate_tree(distances, weighting, dfg):
        """Write the kinase matrix and all trees for one weighting/DFG combination."""
        if dfg in ["in", "out"]:
            distances = distances[distances["dfg"] == dfg]

        # The kinase matrix does not depend on the clustering method, so it is computed and
        # saved once here instead of once per clustering method
        kinase_distance_matrix = matrix.kinase_distance_matrix(
            distances,
            by="minimum",
            fill_diagonal=True,
            coverage_min=coverage_min,
        )
        kinase_distance_matrix.to_csv(
            path / f"matrices/kinase_matrix_{weighting}_{dfg}_{coverage_min}.csv"
        )

        for cmethod in clustering_methods:
            tree.from_distance_matrix(
                # Defensive copy: guards against a possible in-place modification of the
                # matrix leaking into the next clustering method
                kinase_distance_matrix.copy(),
                path / f"trees/tree_{weighting}_{dfg}_{cmethod}_{coverage_min}.tree",
                clustering_method=cmethod,
            )

    # `path` is read by the nested helper above (closure)
    path = path_results / run_id
    (path / "matrices").mkdir(parents=True, exist_ok=True)
    (path / "trees").mkdir(parents=True, exist_ok=True)

    for weighting in weighting_schemes:
        # Load and annotate once per weighting; the DFG subsets are filtered from this frame
        fingerprint_distances_df = load_fingerprint_distances(path, weighting)
        fingerprint_distances_df = add_dfg_column(fingerprint_distances_df)
        for dfg in dfg_conformations:
            _generate_tree(fingerprint_distances_df, weighting, dfg)

In [10]:
def visualize_trees(
    run_id,
    path_results=RESULTS,
    weighting_schemes=WEIGHTING_SCHEMES,
    dfg_conformations=DFG_CONFORMATIONS,
    clustering_methods=CLUSTERING_METHODS,
    fig_height=FIG_HEIGHT,
    label_colors=LABEL_COLORS,
    coverage_min=COVERAGE_MIN,
):
    """
    Draw all trees of a run, one figure per weighting/DFG/clustering method combination.

    Trees are read in Newick format from
    `<path_results>/<run_id>/trees/tree_<weighting>_<dfg>_<cmethod>_<coverage_min>.tree`.
    A Markdown heading is displayed for each weighting, DFG conformation and clustering
    method.

    Parameters
    ----------
    run_id : str
        Name of the run folder inside `path_results`.
    path_results : pathlib.Path
        Root folder of all runs.
    weighting_schemes : list of str
        Weighting schemes to show.
    dfg_conformations : list of str
        DFG conformations to show.
    clustering_methods : list of str
        Clustering methods to show.
    fig_height : dict
        Figure height (values) per DFG conformation (keys).
    label_colors : dict
        Passed as `label_colors` to `Phylo.draw`.
    coverage_min : float
        Coverage cutoff as it appears in the tree file names.
    """
    for weighting in weighting_schemes:
        display(Markdown(f"### Weighing {weighting}"))
        for dfg in dfg_conformations:
            display(Markdown(f"#### DFG-{dfg}"))
            for cmethod in clustering_methods:
                display(Markdown(f"##### {cmethod}"))

                # Read the tree of this parameter combination
                tree_file = (
                    f"{run_id}/trees/tree_{weighting}_{dfg}_{cmethod}_{coverage_min}.tree"
                )
                newick_tree = Phylo.read(path_results / tree_file, "newick")
                # Flip branches so deeper clades are displayed at top
                newick_tree.ladderize()

                # One figure per tree; its height depends on the DFG conformation
                _, axes = plt.subplots(1, 1, figsize=(5, fig_height[dfg]))
                n_leafs = newick_tree.count_terminals()
                axes.set_title(
                    f"run {run_id} | weighting {weighting} | DFG-{dfg} "
                    f"| cmethod {cmethod} | {n_leafs} leafs"
                )
                Phylo.draw(newick_tree, axes=axes, label_colors=label_colors)

## Run 20210508

- Bug charged-threonine
- 20210114 KLIFS download
- DFG-all

**Note: Running the `generate_trees` cell below (section "Customized output") will take about 10 minutes.**

### Default output

- Weighting 15
- DFG-all
- Clustering: ward
- Coverage minimum: 0.0

In [11]:
# Read the tree file that is stored directly in the run folder (not in its "trees" subfolder)
run_id = "20210508"
path_newick = RESULTS / f"{run_id}/fingerprint_distances_to_kinase_matrix.tree"
kissim_tree = Phylo.read(path_newick, "newick")
kissim_tree.ladderize()  # Flip branches so deeper clades are displayed at top

# Draw the tree; leaf labels are colored via the kinase-name-to-color mapping LABEL_COLORS
fig, ax = plt.subplots(1, 1, figsize=(5, FIG_HEIGHT["all"]))
ax.set_title(f"run {run_id} | default | {kissim_tree.count_terminals()} leafs")
Phylo.draw(kissim_tree, axes=ax, label_colors=LABEL_COLORS)

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210508/fingerprint_distances_to_kinase_matrix.tree'

### Customized output

In [12]:
%%time
generate_trees("20210508")

FileExistsError: [Errno 17] File exists: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive'

In [13]:
visualize_trees("20210508")

### Weighing 15

#### DFG-all

##### ward

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210508/trees/tree_15_all_ward_0.8.tree'

## Run 20210708

- Bug charged-threonine FIXED
- 20210114 KLIFS download
- DFG-all

**Note: Running the next cell will take about 10 minutes.**

In [14]:
%%time
generate_trees("20210708")

FileExistsError: [Errno 17] File exists: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive'

In [15]:
visualize_trees("20210708")

### Weighing 15

#### DFG-all

##### ward

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210708/trees/tree_15_all_ward_0.8.tree'

## Run 20210701

- Bug charged-threonine FIXED
- 20210630 KLIFS download 
- DFG-all

**Note: Running the next cell will take about 10 minutes.**

In [16]:
%%time
generate_trees("20210701")

FileExistsError: [Errno 17] File exists: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive'

In [17]:
visualize_trees("20210701")

### Weighing 15

#### DFG-all

##### ward

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210701/trees/tree_15_all_ward_0.8.tree'

## Run 20210712 

- Bug charged-threonine FIXED
- 20210630 KLIFS download
- DFG-all/in/out

In [18]:
%%time

run_id = "20210712"
coverage_min = COVERAGE_MIN

# In this run, each DFG conformation has its own results folder: dfg_all, dfg_in, dfg_out
for dfg in DFG_CONFORMATIONS:
    path = RESULTS / run_id / f"dfg_{dfg}"
    (path / "matrices").mkdir(parents=True, exist_ok=True)
    (path / "trees").mkdir(parents=True, exist_ok=True)

    for weighting in WEIGHTING_SCHEMES:
        # The input file depends on the weighting scheme (not on the clustering method):
        # scheme "15" is stored without suffix, all other schemes with their identifier
        if weighting == "15":
            fingerprint_distances_path = path / "fingerprint_distances.csv"
        else:
            fingerprint_distances_path = path / f"fingerprint_distances_{weighting}.csv"

        fingerprint_distances = FingerprintDistanceGenerator.from_csv(
            fingerprint_distances_path
        )

        # Distances and kinase matrix are independent of the clustering method, so they are
        # loaded/computed/saved once per weighting scheme
        kinase_distance_matrix = matrix.kinase_distance_matrix(
            fingerprint_distances.data,
            by="minimum",
            fill_diagonal=True,
            coverage_min=coverage_min,
        )
        kinase_distance_matrix.to_csv(
            path / f"matrices/kinase_matrix_{weighting}_{dfg}_{coverage_min}.csv"
        )

        for clustering_method in CLUSTERING_METHODS:
            tree.from_distance_matrix(
                # Defensive copy: guards against a possible in-place modification of the
                # matrix leaking into the next clustering method
                kinase_distance_matrix.copy(),
                path / f"trees/tree_{weighting}_{dfg}_{clustering_method}_{coverage_min}.tree",
                clustering_method=clustering_method,
            )

FileExistsError: [Errno 17] File exists: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive'

In [19]:
def visualize_trees(
    run_id,
    path_results=RESULTS,
    weighting_schemes=WEIGHTING_SCHEMES,
    dfg_conformations=DFG_CONFORMATIONS,
    clustering_methods=CLUSTERING_METHODS,
    fig_height=FIG_HEIGHT,
    label_colors=LABEL_COLORS,
    coverage_min=COVERAGE_MIN,
):
    """
    Draw all trees of a run whose results are split into one folder per DFG conformation.

    Trees are read in Newick format from
    `<path_results>/<run_id>/dfg_<dfg>/trees/tree_<weighting>_<dfg>_<cmethod>_<coverage_min>.tree`.
    A Markdown heading is displayed for each weighting, DFG conformation and clustering
    method.

    Parameters
    ----------
    run_id : str
        Name of the run folder inside `path_results`.
    path_results : pathlib.Path
        Root folder of all runs.
    weighting_schemes : list of str
        Weighting schemes to show.
    dfg_conformations : list of str
        DFG conformations to show.
    clustering_methods : list of str
        Clustering methods to show.
    fig_height : dict
        Figure height (values) per DFG conformation (keys).
    label_colors : dict
        Passed as `label_colors` to `Phylo.draw`.
    coverage_min : float
        Coverage cutoff as it appears in the tree file names.
    """
    for weighting in weighting_schemes:
        display(Markdown(f"### Weighing {weighting}"))
        for dfg in dfg_conformations:
            display(Markdown(f"#### DFG-{dfg}"))
            for cmethod in clustering_methods:
                display(Markdown(f"##### {cmethod}"))

                # Locate and read the tree inside the DFG-specific subfolder
                trees_dir = path_results / f"{run_id}/dfg_{dfg}/trees"
                newick_file = (
                    trees_dir / f"tree_{weighting}_{dfg}_{cmethod}_{coverage_min}.tree"
                )
                newick_tree = Phylo.read(newick_file, "newick")
                # Flip branches so deeper clades are displayed at top
                newick_tree.ladderize()

                # One figure per tree; its height depends on the DFG conformation
                _, axes = plt.subplots(1, 1, figsize=(5, fig_height[dfg]))
                n_leafs = newick_tree.count_terminals()
                axes.set_title(
                    f"run {run_id} | weighting {weighting} | DFG-{dfg} "
                    f"| cmethod {cmethod} | {n_leafs} leafs"
                )
                Phylo.draw(newick_tree, axes=axes, label_colors=label_colors)

In [20]:
visualize_trees("20210712")

### Weighing 15

#### DFG-all

##### ward

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_all/trees/tree_15_all_ward_0.8.tree'

## Run 201909xx

In [21]:
run_id = "201909xx"
path = RESULTS / run_id
(path / "trees").mkdir(parents=True, exist_ok=True)

# The CSV values are subtracted from 1 before they are used as distance matrix
# NOTE(review): presumably the file holds similarity scores in [0, 1] - confirm
kinase_matrix_2019 = 1 - pd.read_csv(
    path / "data/best_scores_type2_normballester.csv", index_col=0
)

# Write one tree per clustering method to `<run folder>/trees/tree_<clustering_method>.tree`
for clustering_method in CLUSTERING_METHODS:
    tree.from_distance_matrix(
        kinase_matrix_2019,
        path / f"trees/tree_{clustering_method}.tree",
        clustering_method=clustering_method,
    )

FileExistsError: [Errno 17] File exists: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive'

In [22]:
def visualize_trees(
    run_id,
    path_results=RESULTS,
    clustering_methods=CLUSTERING_METHODS,
    fig_height=FIG_HEIGHT,
    label_colors=LABEL_COLORS,
):
    """
    Draw the trees of a run that has exactly one tree per clustering method.

    Trees are read in Newick format from `<path_results>/<run_id>/trees/tree_<cmethod>.tree`.
    The figure title always states "weighting 101 | DFG-in" and the figure height is taken
    from the "in" entry of `fig_height`.

    Parameters
    ----------
    run_id : str
        Name of the run folder inside `path_results`.
    path_results : pathlib.Path
        Root folder of all runs.
    clustering_methods : list of str
        Clustering methods to show.
    fig_height : dict
        Figure heights; only the value for key "in" is used.
    label_colors : dict
        Passed as `label_colors` to `Phylo.draw`.
    """

    for cmethod in clustering_methods:
        display(Markdown(f"### {cmethod}"))

        newick_file = path_results / f"{run_id}/trees/tree_{cmethod}.tree"
        newick_tree = Phylo.read(newick_file, "newick")
        # Flip branches so deeper clades are displayed at top
        newick_tree.ladderize()

        _, axes = plt.subplots(1, 1, figsize=(5, fig_height["in"]))
        n_leafs = newick_tree.count_terminals()
        axes.set_title(
            f"run {run_id} | weighting 101 | DFG-in "
            f"| cmethod {cmethod} | {n_leafs} leafs"
        )
        Phylo.draw(newick_tree, axes=axes, label_colors=label_colors)

In [23]:
visualize_trees("201909xx")

### ward

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/201909xx/trees/tree_ward.tree'

## Appendix: EGFR's top X kinases

In [24]:
def _top_kinases(kinase_matrix, target):
    """
    Rank all kinases by their matrix value to a target kinase and add their kinase group.

    Parameters
    ----------
    kinase_matrix : pandas.DataFrame
        Kinase matrix with kinase names as index; must contain the column `target`.
    target : str
        Name of the kinase (matrix column) to rank against.

    Returns
    -------
    pandas.DataFrame
        Columns "kinase.klifs_name", "distance" (sorted in ascending order) and
        "kinase.group". The groups are fetched from a remote KLIFS session (human kinases)
        and merged on the shared column using pandas' default merge settings.
    """

    # Sorted column of the target as table with the columns "kinase.klifs_name" and "distance"
    ranked = (
        kinase_matrix[target]
        .sort_values()
        .rename("distance")
        .rename_axis("kinase.klifs_name")
        .reset_index()
    )

    # Fetch the kinase group of every ranked kinase from KLIFS
    session = setup_remote()
    kinase_names = ranked["kinase.klifs_name"].tolist()
    groups = session.kinases.by_kinase_name(kinase_names, species="Human")
    groups = groups[["kinase.klifs_name", "kinase.group"]]

    return ranked.merge(groups)


def top_kinases(run_id, weighting_schemes=WEIGHTING_SCHEMES):
    """
    Display the top-ranked kinases for each kinase of interest and weighting scheme.

    The DFG-in kinase matrix with coverage cutoff 0.8 is read from
    `RESULTS/<run_id>/matrices/kinase_matrix_<weighting>_in_0.8.csv`.

    The kinases of interest and the number of rows to show for each of them are read from
    the notebook-level lists `kinase_list` and `top_n_list` (paired by position), which must
    be defined before this function is called.

    Parameters
    ----------
    run_id : str
        Run folder relative to `RESULTS` (may include a subfolder, e.g. "20210712/dfg_in").
    weighting_schemes : list of str
        Weighting schemes to show.
    """

    # Iterate over the function argument (not the global constant), so that callers can
    # restrict the weighting schemes
    for weighting in weighting_schemes:
        display(Markdown(f"#### Weighting {weighting}"))
        path = RESULTS / f"{run_id}/matrices/kinase_matrix_{weighting}_in_0.8.csv"
        kinase_matrix = pd.read_csv(path, index_col=0)
        for kinase, top_n in zip(kinase_list, top_n_list):
            display(Markdown(kinase))
            display(_top_kinases(kinase_matrix, kinase)[:top_n])

In [25]:
# Kinases of interest and the number of top-ranked rows to show for each of them
# (paired by position; both lists are read as globals by `top_kinases`)
kinase_list = ["EGFR", "SLK", "LOK"]
top_n_list = [40, 10, 10]

### 20210508

In [26]:
top_kinases("20210508")

#### Weighting 15

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210508/matrices/kinase_matrix_15_in_0.8.csv'

### 20210708

In [27]:
top_kinases("20210708")

#### Weighting 15

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210708/matrices/kinase_matrix_15_in_0.8.csv'

### 20210701

In [28]:
top_kinases("20210701")

#### Weighting 15

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210701/matrices/kinase_matrix_15_in_0.8.csv'

### 20210712

In [29]:
top_kinases("20210712/dfg_in")

#### Weighting 15

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_in/matrices/kinase_matrix_15_in_0.8.csv'

### 201909xx

In [30]:
# Load the 201909xx matrix as stored in the CSV file, i.e. without the `1 - x` conversion;
# the conversion is applied in the following cells when calling `_top_kinases`
kinase_matrix_2019 = pd.read_csv(
    RESULTS / "201909xx/data/best_scores_type2_normballester.csv", index_col=0
)

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/201909xx/data/best_scores_type2_normballester.csv'

In [31]:
_top_kinases(1 - kinase_matrix_2019, "EGFR")[:20]

NameError: name 'kinase_matrix_2019' is not defined

In [32]:
_top_kinases(1 - kinase_matrix_2019, "SLK")[:10]

NameError: name 'kinase_matrix_2019' is not defined

In [33]:
_top_kinases(1 - kinase_matrix_2019, "LOK")[:10]

NameError: name 'kinase_matrix_2019' is not defined