# Performance summary over different `kissim` setups

Summarize performance of different `kissim` setups:

- Profiling vs. `kissim` AUCs: How well does `kissim` reflect profiling data?
- Phylogenetic `kissim` tree: How well do selected on- and off-targets cluster together?
- Top `kissim` ranks: How high do selected on- and off-targets rank in `kissim`?

DFG-in conformations only!

In [1]:
from pathlib import Path

import pandas as pd
import seaborn as sns
from Bio import Phylo

In [2]:
cm = sns.light_palette("blue", as_cmap=True)

In [3]:
HERE = Path(_dh[-1])  # noqa: F821
RESULTS = HERE / "../../results_archive"

## `kissim` setups

In [4]:
RUN_IDS = [
    "20210712",
    "20210804-1",
    "20210804-2",
    "20210804-3",
    "20210804-4",
    "20210804-5",
    "20210810-1",
    "20210810-2",
    "20210812-1",
    "20210812-2",
]
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

In [5]:
ON_OFF_PAIRS = [
    ["EGFR", ["SLK", "LOK", "GAK"]],  # Erlotinib
    ["SLK", ["LOK"]],  # Erlotinib
    ["DRAK2", ["CaMKK2"]],
    ["ABL2", ["AurA"]],  # VX-680/MK-0457
    ["ABL1", ["GAK"]],  # Dasatinib
    ["GAK", ["DAPK3"]],  # Inrebic
    ["AurC", ["KIT"]],  # Inlyta
    ["KIT", ["AMPKa2", "FMS"]],  # JNJ-28312141
    ["ABL1", ["BMPR1B"]],  # PD-173955
]

## Profiling vs. `kissim` AUCs

In [6]:
auc_dfs = {}
mean_df = []
median_df = []
std_df = []
for run_id in RUN_IDS:
    path = RESULTS / f"{run_id}/dfg_in"
    auc_df = pd.read_csv(path / "auc.csv")
    auc_dfs[run_id] = auc_df[["15", "100", "110", "101", "111"]]

    mean = auc_dfs[run_id].describe().loc["mean", :]
    mean.name = run_id
    mean_df.append(mean)

    median = auc_dfs[run_id].describe().loc["50%", :]
    median.name = run_id
    median_df.append(median)

    std = auc_dfs[run_id].describe().loc["std", :]
    std.name = run_id
    std_df.append(std)
mean_df = pd.concat(mean_df, axis=1)
median_df = pd.concat(median_df, axis=1)
std_df = pd.concat(std_df, axis=1)

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_in/auc.csv'

### Mean

In [7]:
mean_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [8]:
mean_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [9]:
mean_df.style.highlight_max(axis=0, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [10]:
mean_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

### Median

In [11]:
median_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [12]:
median_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [13]:
median_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

### Standard deviation

In [14]:
std_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [15]:
std_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [16]:
std_df.style.highlight_max(axis=0, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [17]:
std_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

## Top `kissim` ranks

In [18]:
def get_ranks(distance_matrix, rank_from, rank_to):
    """
    Get ranks for all kinases w.r.t. to a query kinase.
    """

    ranks = pd.concat(
        [
            distance_matrix[rank_from].sort_values(),
            distance_matrix[rank_from].sort_values().rank(),
        ],
        axis=1,
    )
    ranks.columns = ["distance", "rank"]
    if rank_to is not None:
        ranks = ranks.loc[rank_to, :]
    pair_names = [f"{rank_from}-{rank_to}" for i in rank_to]
    return ranks, pair_names

In [19]:
%%time

results_list = []

for run_id in RUN_IDS[1:]:
    for weighting in WEIGHTING_SCHEMES:
        results = []
        columns = []
        results.extend([run_id, weighting])

        kinase_matrix_path = (
            RESULTS / f"{run_id}/dfg_in/fingerprint_distances_to_kinase_matrix.csv"
        )
        kinase_matrix = pd.read_csv(kinase_matrix_path, index_col=0)

        for pair in ON_OFF_PAIRS:
            ranks, pair_names = get_ranks(kinase_matrix, pair[0], pair[1])
            results.extend(ranks["rank"].to_list())

        results_list.append(results)

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210804-1/dfg_in/fingerprint_distances_to_kinase_matrix.csv'

In [20]:
matrix_ranks_df = pd.DataFrame(
    results_list,
    columns=["run_id", "weighting"] + [f"{i[0]}-{j}" for i in ON_OFF_PAIRS for j in i[1]],
)
matrix_ranks_df = matrix_ranks_df.set_index(["run_id", "weighting"])
cm = sns.light_palette("blue", as_cmap=True, reverse=True)

In [21]:
matrix_ranks_df.style.applymap(lambda x: "background-color : yellow" if x < 25 else "")

Unnamed: 0_level_0,Unnamed: 1_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
run_id,weighting,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [22]:
matrix_ranks_df.style.background_gradient(cmap=cm, axis=None)

ValueError: zero-size array to reduction operation fmin which has no identity

<pandas.io.formats.style.Styler at 0x7fa297b49cd0>

## Phylogenetic `kissim` tree

In [23]:
import itertools
import numpy as np

In [24]:
def pairs_to_symmetric_matrix(pairs):
    """
    Create symmetric matrix with diagonal of 0.0 from pair combinations.
    """

    pairs1 = pd.DataFrame(pairs)
    pairs2 = pairs1[[1, 0, 2]]
    pairs2.columns = [0, 1, 2]
    pairs_all = pd.concat([pairs1, pairs2])
    matrix = pairs_all.pivot(index=0, columns=1, values=2)
    np.fill_diagonal(matrix.values, 0.0)
    symmetic_matrix = pd.DataFrame(matrix.values, columns=matrix.columns, index=matrix.index)
    return symmetic_matrix

In [25]:
def get_tree_distance_matrix(tree):
    """
    Get a matrix of all-against-all kinase distances in the kinase tree.
    """

    kinases = [clade.name for clade in tree.get_terminals()]
    kinase_pairs = itertools.combinations(kinases, 2)

    kinase_pairs_tree_distances = []

    for kinase1, kinase2 in kinase_pairs:
        kinase_pairs_tree_distances.append([kinase1, kinase2, tree.distance(kinase1, kinase2)])

    tree_distance_matrix = pairs_to_symmetric_matrix(kinase_pairs_tree_distances)

    return tree_distance_matrix

In [26]:
%%time

results_list = []

for run_id in RUN_IDS:
    print(run_id)
    for weighting in WEIGHTING_SCHEMES:
        for cmethod in CLUSTERING_METHODS:
            results = []
            results.extend([run_id, weighting, cmethod])

            tree_path = RESULTS / f"{run_id}/dfg_in/trees/tree_0.8_{weighting}_{cmethod}.tree"
            kissim_tree = Phylo.read(tree_path, "newick")
            tree_distance_matrix = get_tree_distance_matrix(kissim_tree)

            for pair in ON_OFF_PAIRS:
                ranks, pair_names = get_ranks(tree_distance_matrix, pair[0], pair[1])
                results.extend(ranks["rank"].to_list())

            results_list.append(results)

20210712


FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_in/trees/tree_0.8_15_ward.tree'

In [27]:
tree_ranks_df = pd.DataFrame(
    results_list,
    columns=["run_id", "weighting", "cmethod"]
    + [f"{i[0]}-{j}" for i in ON_OFF_PAIRS for j in i[1]],
)
tree_ranks_df = tree_ranks_df.set_index(["run_id", "weighting", "cmethod"])
cm = sns.light_palette("blue", as_cmap=True, reverse=True)

In [28]:
tree_ranks_df.style.applymap(lambda x: "background-color : yellow" if x < 25 else "")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
run_id,weighting,cmethod,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


In [29]:
tree_ranks_df.loc[:, "15", :].style.background_gradient(cmap=cm, axis=0)

KeyError: '15'