# Evaluate performace of `kissim` vs. profiling datasets

Summarize ligand-kinase pair performances based on multiple profiling datasets:

- Profiling datasets: Karaman and Davis
- `kissim` datasets: Different feature weighting schemes
- Saves AUC values per setting combination

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import logging
import math
import itertools

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, HTML

from src import data
from src.evaluation.ligand_vs_kinase_evaluator import LigandVsKinaseEvaluator



In [3]:
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [4]:
HERE = Path(_dh[-1])  # noqa: F821
RESULTS = HERE / "../../results_archive"

## Define parameters

In [5]:
LIGAND_PROFILING_METHODS = ["karaman-davis"]
KINASE_DISTANCES_METHOD = "kissim"
KINASE_ACTIVITY_CUTOFF = 100
KINASE_ACTIVITY_MAX = True
MIN_N_SHARED_KINASES = 10
MIN_N_SHARED_ACTIVE_KINASES = 3

In [6]:
RUN_IDS = [
    "20210712",
    "20210804-1",
    "20210804-2",
    "20210804-3",
    "20210804-4",
    "20210804-5",
    "20210810-1",
    "20210810-2",
    "20210812-1",
    "20210812-2",
]
DATA_SUBSETS = ["dfg_in"]
WEIGHTING_SCHEMES = ["15", "100", "010", "001", "110", "011", "101", "111"]

## Generate ROC/AUC data

In [7]:
auc_dict = {}

for ligand_profiling_method in LIGAND_PROFILING_METHODS:
    display(Markdown(f"### Ligand profiling method: {ligand_profiling_method}"))
    auc_dict[ligand_profiling_method] = {}

    for run_id in RUN_IDS:
        auc_dict[ligand_profiling_method][run_id] = {}

        for data_subset in DATA_SUBSETS:
            auc_dict[ligand_profiling_method][run_id][data_subset] = {}
            display(Markdown(f"#### {run_id} | {data_subset}"))

            for weighting in WEIGHTING_SCHEMES:
                auc_dict[ligand_profiling_method][run_id][data_subset][weighting] = {}

                # Set path to file
                display(Markdown(f"##### {weighting}"))
                if weighting == "15":
                    KINASE_KINASE_PATH = (
                        RESULTS / f"{run_id}/{data_subset}/fingerprint_distances.csv"
                    )
                else:
                    KINASE_KINASE_PATH = (
                        RESULTS / f"{run_id}/{data_subset}/fingerprint_distances_{weighting}.csv"
                    )

                # Kinase-kinase distances dataset
                kissim_df = data.distances.kissim(
                    structure_kinase_mapping_by="minimum",
                    kinmap_kinases=True,
                    distances_path=KINASE_KINASE_PATH,
                )
                # Kinase-ligand profiling dataset
                profiling_df = data.profiling.load(
                    ligand_profiling_method, pkidb_ligands=True, fda_approved=True
                )

                ligand_names = profiling_df.columns
                ligand_targets = data.targets.pkidb(ligand_names, fda_approved=False)
                ligand_kinase_pairs = (
                    ligand_targets[["ligand.input", "targets.kinmap"]]
                    .explode("targets.kinmap")
                    .to_numpy()
                    .tolist()
                )
                evaluator = LigandVsKinaseEvaluator(
                    ligand_kinase_pairs,
                    ligand_profiling_method,
                    KINASE_DISTANCES_METHOD,
                    KINASE_ACTIVITY_CUTOFF,
                    KINASE_ACTIVITY_MAX,
                    MIN_N_SHARED_KINASES,
                    MIN_N_SHARED_ACTIVE_KINASES,
                    kinase_kinase_path=KINASE_KINASE_PATH,
                )
                try:
                    auc_list = evaluator.plot_roc_curves()
                    plt.show()
                    auc_dict[ligand_profiling_method][run_id][data_subset][weighting] = auc_list
                except ValueError as e:
                    print(e)

### Ligand profiling method: karaman-davis

#### 20210712 | dfg_in

##### 15

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_in/fingerprint_distances.csv'

## Plot AUC distribution

In [8]:
for ligand_profiling_method, run_id_dict in auc_dict.items():
    display(Markdown(f"### Ligand profiling method: {ligand_profiling_method}"))
    for run_id, data_subset_dict in run_id_dict.items():
        for data_subset, weighting_dict in data_subset_dict.items():
            display(Markdown(f"#### {run_id} | {data_subset}"))
            weighting_dict = {
                key: values for key, values in weighting_dict.items() if values != {}
            }
            auc_df = pd.DataFrame(weighting_dict)

            # Save to file
            auc_df.to_csv(RESULTS / f"{run_id}/{data_subset}/auc.csv", index=None)
            # Plot
            auc_df.plot(kind="box", title=f"{data_subset} ({auc_df.shape[0]} kinase-ligand pairs)")
            plt.show()
            # Show descriptive stats
            display(HTML(auc_df.describe().to_html()))

### Ligand profiling method: karaman-davis

#### 20210712 | dfg_in

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/notebooks/007_kissim_setups/../../results_archive/20210712/dfg_in/auc.csv'

## Scatter AUC values between weighting schemes

In [9]:
def show_auc_scatter(
    auc_dict,
    run_id,
    ligand_profiling_method="karaman-davis",
    data_subset="dfg_in",
    feature_weights=["15", "100", "110", "101", "111"],
):

    auc_by_weighting_dict = auc_dict[ligand_profiling_method][run_id][data_subset]
    auc_df = pd.DataFrame(auc_by_weighting_dict)
    for profiling_name, auc in auc_dict.items():

        display(Markdown(f"### Ligand profiling method: {ligand_profiling_method}"))
        pairs = list(itertools.combinations(feature_weights, 2))
        n_cols = 5
        n_rows = math.ceil(len(pairs) / n_cols)
        _, axes = plt.subplots(figsize=(n_cols * 5, n_rows * 5), nrows=n_rows, ncols=n_cols)
        axes = axes.reshape(-1)

        for i, pair in enumerate(pairs):

            axes[i].plot([0, 1], [0, 1], "grey")
            axes[i].plot(auc_df[pair[0]], auc_df[pair[1]], ".")
            axes[i].set_box_aspect(1)
            axes[i].set_xlim([0, 1])
            axes[i].set_ylim([0, 1])
            axes[i].set_xlabel(f"AUCs from {pair[0]}")
            axes[i].set_ylabel(f"AUCs from {pair[1]}")

        plt.show()

In [10]:
show_auc_scatter(auc_dict, "20210812-1")

KeyError: '20210812-1'