In [1]:
from pathlib import Path

from IPython.display import display, HTML
import pandas as pd
import seaborn as sns
from Bio import Phylo

In [2]:
# Sequential light-to-blue colormap, passed to `background_gradient` for the shaded tables below.
cm = sns.light_palette("blue", as_cmap=True)

In [3]:
# `_dh` is IPython's directory history; its last entry is taken as the notebook's directory.
HERE = Path(_dh[-1])  # noqa: F821
# Archived results live two directory levels above the notebook directory.
RESULTS = HERE / "../../results_archive"

## Run IDs

In [4]:
# Identifiers of the archived runs; each one is used as a sub-directory name under RESULTS.
RUN_IDS = [
    "20210712",
    "20210804-1",
    "20210804-2",
    "20210804-3",
    "20210804-4",
    "20210804-5",
]
# Weighting-scheme and clustering-method labels as they appear in the tree file names
# (`tree_0.8_{weighting}_{cmethod}.tree`).
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

## AUCs per run/weighting

In [5]:
# Per-run AUC tables, plus one list of per-run summary Series for each statistic.
# The keys are the row labels of `DataFrame.describe()` ("50%" is the median).
auc_dfs = {}
summaries = {"mean": [], "50%": [], "std": []}

for run_id in RUN_IDS:
    path = RESULTS / f"{run_id}/dfg_in"
    auc_df = pd.read_csv(path / "auc.csv")
    auc_dfs[run_id] = auc_df[["15", "100", "110", "101", "111"]]

    # Describe once per run and pick the rows of interest; naming each Series
    # after its run makes the run ID the column label after concatenation.
    description = auc_dfs[run_id].describe()
    for statistic, per_run in summaries.items():
        per_run.append(description.loc[statistic, :].rename(run_id))

# Rows: weighting columns of the AUC file; columns: run IDs.
mean_df = pd.concat(summaries["mean"], axis=1)
median_df = pd.concat(summaries["50%"], axis=1)
std_df = pd.concat(summaries["std"], axis=1)

### Mean

In [6]:
# Shade the mean AUCs (rows: weightings, columns: runs) with the blue colormap `cm`.
mean_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.729418,0.742915,0.7414,0.739638,0.728245,0.739012
100,0.737368,0.742814,0.740444,0.742814,0.737743,0.740444
110,0.732148,0.742441,0.742148,0.741624,0.735506,0.745352
101,0.716671,0.740182,0.739224,0.733893,0.692715,0.734812
111,0.707904,0.739671,0.737161,0.733173,0.691946,0.732063


In [7]:
# axis=None: highlight the single largest mean AUC of the whole table.
mean_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.729418,0.742915,0.7414,0.739638,0.728245,0.739012
100,0.737368,0.742814,0.740444,0.742814,0.737743,0.740444
110,0.732148,0.742441,0.742148,0.741624,0.735506,0.745352
101,0.716671,0.740182,0.739224,0.733893,0.692715,0.734812
111,0.707904,0.739671,0.737161,0.733173,0.691946,0.732063


In [8]:
# axis=0: highlight the largest mean AUC within each column, i.e. the top weighting per run.
mean_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.729418,0.742915,0.7414,0.739638,0.728245,0.739012
100,0.737368,0.742814,0.740444,0.742814,0.737743,0.740444
110,0.732148,0.742441,0.742148,0.741624,0.735506,0.745352
101,0.716671,0.740182,0.739224,0.733893,0.692715,0.734812
111,0.707904,0.739671,0.737161,0.733173,0.691946,0.732063


In [9]:
# axis=1: highlight the largest mean AUC within each row, i.e. the top run per weighting.
mean_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.729418,0.742915,0.7414,0.739638,0.728245,0.739012
100,0.737368,0.742814,0.740444,0.742814,0.737743,0.740444
110,0.732148,0.742441,0.742148,0.741624,0.735506,0.745352
101,0.716671,0.740182,0.739224,0.733893,0.692715,0.734812
111,0.707904,0.739671,0.737161,0.733173,0.691946,0.732063


### Median

In [10]:
# Shade the median AUCs (rows: weightings, columns: runs) with the blue colormap `cm`.
median_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.769479,0.787637,0.779823,0.787594,0.764674,0.784618
100,0.786876,0.795961,0.784854,0.795961,0.779133,0.784854
110,0.766542,0.793199,0.79016,0.790582,0.777778,0.784037
101,0.753931,0.789474,0.781529,0.776316,0.701081,0.778246
111,0.726319,0.784902,0.774071,0.776316,0.701876,0.775046


In [11]:
# axis=None: highlight the single largest median AUC of the whole table.
median_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.769479,0.787637,0.779823,0.787594,0.764674,0.784618
100,0.786876,0.795961,0.784854,0.795961,0.779133,0.784854
110,0.766542,0.793199,0.79016,0.790582,0.777778,0.784037
101,0.753931,0.789474,0.781529,0.776316,0.701081,0.778246
111,0.726319,0.784902,0.774071,0.776316,0.701876,0.775046


In [12]:
# axis=0: highlight the largest median AUC within each column, i.e. the top weighting per run.
median_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.769479,0.787637,0.779823,0.787594,0.764674,0.784618
100,0.786876,0.795961,0.784854,0.795961,0.779133,0.784854
110,0.766542,0.793199,0.79016,0.790582,0.777778,0.784037
101,0.753931,0.789474,0.781529,0.776316,0.701081,0.778246
111,0.726319,0.784902,0.774071,0.776316,0.701876,0.775046


In [13]:
# axis=1: highlight the largest median AUC within each row, i.e. the top run per weighting.
median_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.769479,0.787637,0.779823,0.787594,0.764674,0.784618
100,0.786876,0.795961,0.784854,0.795961,0.779133,0.784854
110,0.766542,0.793199,0.79016,0.790582,0.777778,0.784037
101,0.753931,0.789474,0.781529,0.776316,0.701081,0.778246
111,0.726319,0.784902,0.774071,0.776316,0.701876,0.775046


### Standard deviation

In [14]:
# Shade the AUC standard deviations (rows: weightings, columns: runs) with the blue colormap `cm`.
std_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.146028,0.146782,0.147242,0.144187,0.141115,0.145573
100,0.154774,0.151072,0.150506,0.151072,0.150279,0.150506
110,0.148565,0.149837,0.148428,0.148939,0.145607,0.148299
101,0.140226,0.144862,0.14485,0.142372,0.127371,0.143996
111,0.138169,0.143709,0.14358,0.141556,0.126978,0.14361


In [15]:
# axis=None: highlight the single largest AUC standard deviation of the whole table.
std_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.146028,0.146782,0.147242,0.144187,0.141115,0.145573
100,0.154774,0.151072,0.150506,0.151072,0.150279,0.150506
110,0.148565,0.149837,0.148428,0.148939,0.145607,0.148299
101,0.140226,0.144862,0.14485,0.142372,0.127371,0.143996
111,0.138169,0.143709,0.14358,0.141556,0.126978,0.14361


In [16]:
# axis=0: highlight the largest AUC standard deviation within each column (per run).
std_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.146028,0.146782,0.147242,0.144187,0.141115,0.145573
100,0.154774,0.151072,0.150506,0.151072,0.150279,0.150506
110,0.148565,0.149837,0.148428,0.148939,0.145607,0.148299
101,0.140226,0.144862,0.14485,0.142372,0.127371,0.143996
111,0.138169,0.143709,0.14358,0.141556,0.126978,0.14361


In [17]:
# axis=1: highlight the largest AUC standard deviation within each row (per weighting).
std_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210712,20210804-1,20210804-2,20210804-3,20210804-4,20210804-5
15,0.146028,0.146782,0.147242,0.144187,0.141115,0.145573
100,0.154774,0.151072,0.150506,0.151072,0.150279,0.150506
110,0.148565,0.149837,0.148428,0.148939,0.145607,0.148299
101,0.140226,0.144862,0.14485,0.142372,0.127371,0.143996
111,0.138169,0.143709,0.14358,0.141556,0.126978,0.14361


## Tree distances per run/weighting

In [18]:
def get_tree_distance_matrix(tree):
    """
    Get a matrix of all-against-all kinase distances in the kinase tree.

    Parameters
    ----------
    tree : object
        Tree providing `get_terminals()` (an iterable of clades with a `name` attribute)
        and `distance(name1, name2)`, e.g. a tree read with `Bio.Phylo.read`.

    Returns
    -------
    pandas.DataFrame
        Square matrix of pairwise tree distances. Index and columns are the names of the
        tree's terminal clades, in the order returned by `tree.get_terminals()`.
    """

    # Take the leaves from the tree that was passed in, not from a notebook-level global,
    # so that the labels and the distances always belong to the same tree.
    kinases = [clade.name for clade in tree.get_terminals()]

    tree_distance_matrix = [
        [tree.distance(kinase1, kinase2) for kinase2 in kinases] for kinase1 in kinases
    ]

    return pd.DataFrame(tree_distance_matrix, index=kinases, columns=kinases)

def get_ranks(tree_distance_matrix, rank_from, rank_to):
    """
    Rank all kinases by their tree distance to a query kinase.

    Parameters
    ----------
    tree_distance_matrix : pandas.DataFrame
        All-against-all distance matrix with kinase names as index and columns.
    rank_from : str
        Query kinase; its column of the matrix is sorted and ranked.
    rank_to : list of str or None
        Kinases to report. If None, the ranking of all kinases is returned.

    Returns
    -------
    pandas.DataFrame
        Columns "distance" and "rank", sorted by ascending distance when `rank_to` is
        None, otherwise restricted to (and ordered like) `rank_to`.
    """

    sorted_distances = tree_distance_matrix[rank_from].sort_values()

    rank_table = pd.concat([sorted_distances, sorted_distances.rank()], axis=1)
    rank_table.columns = ["distance", "rank"]

    if rank_to is None:
        return rank_table
    return rank_table.loc[rank_to, :]

In [19]:
# NOTE(review): this repeats the WEIGHTING_SCHEMES / CLUSTERING_METHODS definitions from the
# "Run IDs" section with identical values.
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

In [20]:
%%time

results_list = []

for run_id in RUN_IDS:
    for weighting in WEIGHTING_SCHEMES:
        for cmethod in CLUSTERING_METHODS:
            results = []
            results.extend([run_id, weighting, cmethod])
            print(results)
            
            tree_path = RESULTS / f"{run_id}/dfg_in/trees/tree_0.8_{weighting}_{cmethod}.tree"
            kissim_tree = Phylo.read(tree_path, "newick")
            tree_distance_matrix = get_tree_distance_matrix(kissim_tree)
            
            ranks = get_ranks(tree_distance_matrix, "EGFR", ["SLK", "LOK", "GAK"])
            results.extend(ranks["rank"].to_list())
            ranks = get_ranks(tree_distance_matrix, "DRAK2", ["CaMKK2"])
            results.extend(ranks["rank"].to_list())
            
            results_list.append(results)

['20210712', '15', 'ward']
['20210712', '15', 'average']
['20210712', '15', 'weighted']
['20210712', '110', 'ward']
['20210712', '110', 'average']
['20210712', '110', 'weighted']
['20210712', '101', 'ward']
['20210712', '101', 'average']
['20210712', '101', 'weighted']
['20210712', '100', 'ward']
['20210712', '100', 'average']
['20210712', '100', 'weighted']
['20210804-1', '15', 'ward']
['20210804-1', '15', 'average']
['20210804-1', '15', 'weighted']
['20210804-1', '110', 'ward']
['20210804-1', '110', 'average']
['20210804-1', '110', 'weighted']
['20210804-1', '101', 'ward']
['20210804-1', '101', 'average']
['20210804-1', '101', 'weighted']
['20210804-1', '100', 'ward']
['20210804-1', '100', 'average']
['20210804-1', '100', 'weighted']
['20210804-2', '15', 'ward']
['20210804-2', '15', 'average']
['20210804-2', '15', 'weighted']
['20210804-2', '110', 'ward']
['20210804-2', '110', 'average']
['20210804-2', '110', 'weighted']
['20210804-2', '101', 'ward']
['20210804-2', '101', 'average']


In [30]:
# One row per (run, weighting, clustering method). The four rank columns are listed in the
# order in which the ranks were appended to each row of `results_list`.
results_df = pd.DataFrame(
    results_list, 
    columns=["run_id", "weighting", "cmethod", "EGFR-SLK", "EGFR-LOK", "EGFR-GAK", "DRAK2-CaMKK2"]
)

In [26]:
# Save the rank table to the working directory without writing the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
results_df.to_csv("tree_ranks.csv", index=False)

In [31]:
# Index the rank table by run / weighting / clustering method.
# Guarded so that re-running this cell when the index is already set is a no-op
# instead of raising a KeyError for the columns that were already moved.
if list(results_df.index.names) != ["run_id", "weighting", "cmethod"]:
    results_df = results_df.set_index(["run_id", "weighting", "cmethod"])

In [46]:
# NOTE(review): `cm` is rebound to the reversed palette here but is not used in this cell;
# gradient tables re-run after this cell would pick up the reversed colormap.
cm = sns.light_palette("blue", as_cmap=True, reverse=True)
# Mark every rank below 10 in yellow.
results_df.style.applymap(lambda x: 'background-color : yellow' if x < 10 else '')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,DRAK2-CaMKK2
run_id,weighting,cmethod,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20210712,15,ward,241.5,241.5,58.5,2.0
20210712,15,average,115.0,115.0,210.5,2.0
20210712,15,weighted,131.0,131.0,230.0,12.0
20210712,110,ward,184.0,184.0,184.0,23.0
20210712,110,average,156.0,156.0,182.0,14.5
20210712,110,weighted,186.5,186.5,88.5,10.0
20210712,101,ward,225.0,225.0,225.0,2.0
20210712,101,average,68.5,68.5,212.0,2.0
20210712,101,weighted,89.5,89.5,211.0,2.0
20210712,100,ward,222.0,222.0,222.0,22.0
