# Summary of AUC distributions and tree distances

Based on the 20210810 runs.

In [1]:
from pathlib import Path

from IPython.display import display, HTML
import pandas as pd
import seaborn as sns
from Bio import Phylo

In [2]:
# Sequential light-to-blue colormap; passed as `cmap` to the background-gradient tables below.
cm = sns.light_palette("blue", as_cmap=True)

In [3]:
# `_dh` is provided by IPython (hence the F821 suppression); its last entry is
# used as the base directory of this notebook.
HERE = Path(_dh[-1])  # noqa: F821
# Archive with one subdirectory per run ID (plain string: no placeholders, so no f-prefix).
RESULTS = HERE / "../../results_archive"

## Run IDs

In [4]:
# Run identifiers; each one names a subdirectory of RESULTS.
RUN_IDS = [
    "20210810-1",
    "20210810-2",
]
# Weighting-scheme and clustering-method labels; both are part of the tree file
# names read in the tree-distance section.
# NOTE(review): the AUC table also contains a "111" column that is not listed here.
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

## AUCs per run/weighting

In [5]:
# Per-run AUC tables (restricted to the weighting columns of interest).
auc_dfs = {}
# Rows of `DataFrame.describe()` to collect, one Series per run:
# "mean" -> mean_df, "50%" (median) -> median_df, "std" -> std_df.
summaries = {"mean": [], "50%": [], "std": []}

for run_id in RUN_IDS:
    path = RESULTS / f"{run_id}/dfg_in"
    auc_df = pd.read_csv(path / "auc.csv")
    auc_dfs[run_id] = auc_df[["15", "100", "110", "101", "111"]]

    # Compute the summary statistics once per run instead of once per statistic.
    described = auc_dfs[run_id].describe()
    for row_label, collected in summaries.items():
        stat = described.loc[row_label, :]
        stat.name = run_id  # becomes the column label after concatenation
        collected.append(stat)

# One column per run, one row per weighting scheme.
mean_df = pd.concat(summaries["mean"], axis=1)
median_df = pd.concat(summaries["50%"], axis=1)
std_df = pd.concat(summaries["std"], axis=1)

### Mean

In [6]:
# Shade each cell by its value using the blue colormap (pandas scales the gradient per column by default).
mean_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210810-1,20210810-2
15,0.732158,0.734549
100,0.734574,0.733362
110,0.733351,0.735241
101,0.728847,0.732
111,0.72918,0.733298


In [7]:
# Highlight the single largest mean AUC across the whole table.
mean_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.732158,0.734549
100,0.734574,0.733362
110,0.733351,0.735241
101,0.728847,0.732
111,0.72918,0.733298


In [8]:
# Highlight the largest mean AUC within each column, i.e. per run.
mean_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.732158,0.734549
100,0.734574,0.733362
110,0.733351,0.735241
101,0.728847,0.732
111,0.72918,0.733298


In [9]:
# Highlight the largest mean AUC within each row, i.e. per weighting scheme.
mean_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.732158,0.734549
100,0.734574,0.733362
110,0.733351,0.735241
101,0.728847,0.732
111,0.72918,0.733298


### Median

In [10]:
# Shade each cell by its value using the blue colormap (pandas scales the gradient per column by default).
median_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210810-1,20210810-2
15,0.775239,0.776538
100,0.779885,0.776538
110,0.7696,0.776538
101,0.760425,0.775637
111,0.762348,0.776748


In [11]:
# Highlight the single largest median AUC across the whole table.
median_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.775239,0.776538
100,0.779885,0.776538
110,0.7696,0.776538
101,0.760425,0.775637
111,0.762348,0.776748


In [12]:
# Highlight the largest median AUC within each column, i.e. per run.
median_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.775239,0.776538
100,0.779885,0.776538
110,0.7696,0.776538
101,0.760425,0.775637
111,0.762348,0.776748


In [13]:
# Highlight the largest median AUC within each row, i.e. per weighting scheme.
median_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.775239,0.776538
100,0.779885,0.776538
110,0.7696,0.776538
101,0.760425,0.775637
111,0.762348,0.776748


### Standard deviation

In [14]:
# Shade each cell by its value using the blue colormap (pandas scales the gradient per column by default).
std_df.style.background_gradient(cmap=cm)

Unnamed: 0,20210810-1,20210810-2
15,0.14073,0.140837
100,0.145065,0.141067
110,0.143323,0.140535
101,0.136722,0.140287
111,0.136699,0.139429


In [15]:
# Highlight the single largest AUC standard deviation across the whole table.
std_df.style.highlight_max(axis=None, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.14073,0.140837
100,0.145065,0.141067
110,0.143323,0.140535
101,0.136722,0.140287
111,0.136699,0.139429


In [16]:
# Highlight the largest AUC standard deviation within each column, i.e. per run.
std_df.style.highlight_max(axis=0, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.14073,0.140837
100,0.145065,0.141067
110,0.143323,0.140535
101,0.136722,0.140287
111,0.136699,0.139429


In [17]:
# Highlight the largest AUC standard deviation within each row, i.e. per weighting scheme.
std_df.style.highlight_max(axis=1, color='yellow')

Unnamed: 0,20210810-1,20210810-2
15,0.14073,0.140837
100,0.145065,0.141067
110,0.143323,0.140535
101,0.136722,0.140287
111,0.136699,0.139429


## Tree distances per run/weighting

In [18]:
def get_tree_distance_matrix(tree):
    """
    Get a matrix of all-against-all kinase distances in the kinase tree.

    Parameters
    ----------
    tree : tree object
        Tree providing `get_terminals()` (leaves with a `name` attribute) and
        `distance(name1, name2)`, e.g. a tree returned by `Bio.Phylo.read`.

    Returns
    -------
    pandas.DataFrame
        Square matrix with the leaf names as both index and columns; cell
        (a, b) holds `tree.distance(a, b)`.
    """

    # Take the leaves from the `tree` argument rather than from a notebook
    # global, so the matrix always describes the tree that was passed in.
    kinases = [clade.name for clade in tree.get_terminals()]

    tree_distance_matrix = [
        [tree.distance(kinase1, kinase2) for kinase2 in kinases]
        for kinase1 in kinases
    ]

    return pd.DataFrame(tree_distance_matrix, index=kinases, columns=kinases)

def get_ranks(tree_distance_matrix, rank_from, rank_to):
    """
    Rank all kinases by their tree distance to a query kinase.

    Parameters
    ----------
    tree_distance_matrix : pandas.DataFrame
        All-against-all distance matrix; must contain a column `rank_from`.
    rank_from : str
        Query kinase whose distance column is ranked. The query itself is part
        of that column and is therefore ranked as well.
    rank_to : label, list of labels or None
        Kinases (row labels) to report. If None, all kinases are returned,
        sorted by ascending distance.

    Returns
    -------
    pandas.DataFrame
        Columns "distance" and "rank" (pandas default ranking: ascending, ties
        share their average rank), restricted to `rank_to` if given.
    """

    # Sort once; both the distance and the rank column derive from this Series.
    distances = tree_distance_matrix[rank_from].sort_values()

    ranks = pd.concat([distances, distances.rank()], axis=1)
    ranks.columns = ["distance", "rank"]

    if rank_to is None:
        return ranks
    return ranks.loc[rank_to, :]

In [19]:
# Same values as assigned in the "Run IDs" section; repeated here
# (presumably so this section can be run on its own - TODO confirm).
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

In [20]:
%%time

# One row per (run, weighting, clustering method): the identifying triple
# followed by the ranks of selected kinases relative to their query kinase.
results_list = []

for run_id in RUN_IDS:
    for weighting in WEIGHTING_SCHEMES:
        for cmethod in CLUSTERING_METHODS:
            # Start the row with its identifiers and echo them as a progress log.
            results = [run_id, weighting, cmethod]
            print(results)

            # Load the Newick tree for this combination and derive all pairwise distances.
            tree_path = RESULTS / f"{run_id}/dfg_in/trees/tree_0.8_{weighting}_{cmethod}.tree"
            kissim_tree = Phylo.read(tree_path, "newick")
            tree_distance_matrix = get_tree_distance_matrix(kissim_tree)

            # Query kinase -> kinases whose rank (by distance to the query) is recorded.
            for query, targets in (("EGFR", ["SLK", "LOK", "GAK"]), ("DRAK2", ["CaMKK2"])):
                ranks = get_ranks(tree_distance_matrix, query, targets)
                results += ranks["rank"].to_list()

            results_list.append(results)

['20210810-1', '15', 'ward']
['20210810-1', '15', 'average']
['20210810-1', '15', 'weighted']
['20210810-1', '110', 'ward']
['20210810-1', '110', 'average']
['20210810-1', '110', 'weighted']
['20210810-1', '101', 'ward']
['20210810-1', '101', 'average']
['20210810-1', '101', 'weighted']
['20210810-1', '100', 'ward']
['20210810-1', '100', 'average']
['20210810-1', '100', 'weighted']
['20210810-2', '15', 'ward']
['20210810-2', '15', 'average']
['20210810-2', '15', 'weighted']
['20210810-2', '110', 'ward']
['20210810-2', '110', 'average']
['20210810-2', '110', 'weighted']
['20210810-2', '101', 'ward']
['20210810-2', '101', 'average']
['20210810-2', '101', 'weighted']
['20210810-2', '100', 'ward']
['20210810-2', '100', 'average']
['20210810-2', '100', 'weighted']
CPU times: user 8min 22s, sys: 0 ns, total: 8min 22s
Wall time: 8min 22s


In [21]:
# Three identifier columns followed by one rank column per "<query>-<target>" kinase pair.
column_names = ["run_id", "weighting", "cmethod", "EGFR-SLK", "EGFR-LOK", "EGFR-GAK", "DRAK2-CaMKK2"]
results_df = pd.DataFrame(results_list, columns=column_names)

In [22]:
# Persist the rank table without the row index; `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
results_df.to_csv("tree_ranks_20210810.csv", index=False)

In [23]:
# Move the identifier columns into a MultiIndex so that only rank columns remain as data.
results_df = results_df.set_index(["run_id", "weighting", "cmethod"])

In [24]:
# NOTE(review): `cm` is rebound to a reversed palette here, but the styling call below does not use it.
cm = sns.light_palette("blue", as_cmap=True, reverse=True)
# Highlight in yellow every rank strictly below 10 (lower rank = smaller tree distance to the query kinase).
results_df.style.applymap(lambda x: 'background-color : yellow' if x < 10 else '')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,DRAK2-CaMKK2
run_id,weighting,cmethod,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20210810-1,15,ward,211.0,211.0,162.5,10.0
20210810-1,15,average,96.5,96.5,152.0,10.0
20210810-1,15,weighted,120.0,120.0,162.0,2.0
20210810-1,110,ward,149.0,149.0,149.0,18.5
20210810-1,110,average,84.0,84.0,134.5,7.5
20210810-1,110,weighted,57.0,57.0,71.5,19.0
20210810-1,101,ward,74.0,74.0,155.5,22.0
20210810-1,101,average,109.0,109.0,185.5,12.0
20210810-1,101,weighted,149.5,149.5,210.0,2.0
20210810-1,100,ward,136.0,136.0,136.0,20.5
