In [1]:
import os
from pathlib import Path

# Resolve the project root only once per kernel. After the chdir below the
# working directory *is* the root, so recomputing `cwd().parent` on a re-run
# of this cell would climb one level too far.
try:
    PROJECT_ROOT
except NameError:
    PROJECT_ROOT = Path.cwd().parent.resolve()

# Relative paths used later in the notebook resolve against the project root.
os.chdir(PROJECT_ROOT)

In [None]:
import pyrepseq as prs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from paths import DATA_DIR
import seaborn as sns
from pyrepseq.metric.tcr_metric.tcrdist.tcrdist_metric import Tcrdist
from utils import ModelForAnalysis

# Styles in the list are applied from first to last, so the local
# "my.mplstyle" sheet overrides whatever it redefines from "ggplot".
plt.style.use(["ggplot", "my.mplstyle"])

In [3]:
# Models under comparison. `name` and `load_data()` are the only members used
# in this cell; the remaining constructor arguments are presumably the
# benchmark task, plot colour, plot marker and z-order — confirm against
# `utils.ModelForAnalysis`.
models = (
    ModelForAnalysis("SCEPTR (finetuned)", "ovr_predetermined_split_avg_dist", "#5f3dc4", "P", zorder=2),
    ModelForAnalysis("SCEPTR", "ovr_predetermined_split_nn", "#7048e8", "d", zorder=1.9),
    ModelForAnalysis("TCRdist", "ovr_predetermined_split_nn", "#f03e3e", "o", zorder=1.8),
    ModelForAnalysis("TCR-BERT", "ovr_predetermined_split_nn", "#74b816", "s"),
)

# One single-column frame per model, indexed by epitope and labelled with the
# model's name so the columns stay distinguishable after concatenation.
# `set_axis` raises if a loaded frame has more than one value column.
per_model_aurocs = [
    model.load_data().set_index("epitope").set_axis([model.name], axis="columns")
    for model in models
]

# Wide table: one row per epitope, one column per model, plus the across-model
# mean, ordered from the best- to the worst-performing epitope on average.
aurocs = pd.concat(per_model_aurocs, axis="columns")
aurocs["avg"] = aurocs.mean(axis="columns")
aurocs = aurocs.sort_values(by="avg", ascending=False).reset_index()

In [4]:
# Load the cleaned VDJdb table and keep only the rows whose epitope appears in
# the AUROC table, so both tables cover the same set of epitopes.
vdjdb_path = DATA_DIR / "preprocessed" / "benchmarking" / "vdjdb_cleaned.csv"
vdjdb = pd.read_csv(vdjdb_path)

is_benchmarked = vdjdb["Epitope"].isin(aurocs["epitope"])
# Explicit copy: new columns are assigned to `vdjdb_used` later on.
vdjdb_used = vdjdb.loc[is_benchmarked].copy()

In [5]:
# Build one string per chain by concatenating V gene, CDR3 and J gene (in that
# order, without separator), then join both chains into a full-receptor string.
chain_components = {
    'Alpha': ('TRAV', 'CDR3A', 'TRAJ'),
    'Beta': ('TRBV', 'CDR3B', 'TRBJ'),
}
for chain, (v_gene, cdr3, j_gene) in chain_components.items():
    vdjdb_used[chain] = vdjdb_used[v_gene] + vdjdb_used[cdr3] + vdjdb_used[j_gene]

vdjdb_used['Full sequence'] = vdjdb_used['Alpha'] + vdjdb_used['Beta']

## Feature entropy vs performance

In [6]:
# Per-epitope summary table: for every TCR feature an entropy estimate and its
# spread, plus each model's AUROC. Rows are aligned on the epitope label.
epitope_data = pd.DataFrame(index=aurocs['epitope'])
features = ['TRAJ', 'TRBJ', 'TRAV', 'TRBV', 'CDR3A', 'CDR3B']

# The grouping does not depend on the feature, so build it once and reuse it.
tcrs_by_epitope = vdjdb_used.groupby('Epitope')
for feature in features:
    # `stdrenyi2_entropy` is used as the error bar of `renyi2_entropy` in the
    # figure further down (presumably its standard error — confirm in pyrepseq).
    epitope_data[f'Entropy {feature}'] = tcrs_by_epitope.apply(lambda x: prs.renyi2_entropy(x, feature), include_groups=False)
    epitope_data[f'std Entropy {feature}'] = tcrs_by_epitope.apply(lambda x: prs.stdrenyi2_entropy(x, feature), include_groups=False)

# Copy over every AUROC column (all columns of `aurocs` except 'epitope').
aurocs_by_epitope = aurocs.set_index('epitope')
for column in aurocs_by_epitope.columns:
    epitope_data[f"AUROC {column}"] = aurocs_by_epitope[column]

# NOTE(review): rows are ordered by the plain SCEPTR AUROC, while the figures
# below plot 'AUROC SCEPTR (finetuned)' — confirm this sort key is intended.
epitope_data = epitope_data.sort_values('AUROC SCEPTR', ascending=False)

In [None]:
# 2x3 grid of panels (figure size given in cm, converted to inches). Features
# are laid out column by column: panel i sits in column i // 2, row i % 2, so
# with the `features` order defined earlier each column holds one pair of
# features. Panels in a column share their y-axis; all panels share the x-axis.
fig, ax = plt.subplots(nrows=2, ncols=3, sharex=True, sharey="col", figsize=(17/2.54, 15/2.54), layout='tight')

column_markers = ("o", "s", "x")  # marker identifies the column
row_colours = ("C0", "C1")        # colour identifies the row
auroc_column = 'AUROC SCEPTR (finetuned)'

for i, feature in enumerate(features):
    col, row = divmod(i, 2)
    panel = ax[row, col]

    panel.errorbar(
        epitope_data[auroc_column],
        epitope_data[f'Entropy {feature}'],
        yerr=epitope_data[f'std Entropy {feature}'],
        fmt=column_markers[col],
        c=row_colours[row],
        markersize=5,
    )

    # Only the outer panels carry axis labels.
    if row == 1:
        panel.set_xlabel(auroc_column)

    if col == 0:
        # Raw string: "\p" is not a valid Python escape sequence and would
        # otherwise trigger a SyntaxWarning on recent Python versions.
        panel.set_ylabel(r'$H_2(X|\pi)$ [bits]')

    panel.title.set_text(feature)

# Fixed y-ranges per column: the first two columns use 0-6, the last 0-15.
column_y_max = (6, 6, 15)
for col, y_max in enumerate(column_y_max):
    for panel in ax[:, col]:
        panel.set_ylim(0, y_max)

fig.tight_layout()
fig.savefig("feature_entropy_vs_performance.pdf", bbox_inches="tight")

## Integrated near-coincidence entropy vs performance

In [8]:
# TCRdist metric from pyrepseq, constructed with its default arguments; its
# `calc_pdist_vector` method is applied to each epitope's TCRs further down.
tcrdist = Tcrdist()

In [9]:
# Map each epitope's fine-tuned SCEPTR AUROC to a position in [0, 1] by rank
# (lowest rank -> 0, highest rank -> 1); used to pick a colour from `cmap`.
scores = epitope_data['AUROC SCEPTR (finetuned)']
ranked_scores = scores.rank()
lowest_rank = ranked_scores.min()
rank_span = ranked_scores.max() - lowest_rank
normalized_ranks = (ranked_scores - lowest_rank) / rank_span

cmap = sns.color_palette("viridis", as_cmap=True)

In [None]:
# Single-panel figure (size given in cm, converted to inches).
fig, ax = plt.subplots(figsize=(10/2.54, 8/2.54), layout='tight')

# One vector of pairwise TCRdist distances per epitope, stored as an
# object-valued column aligned on the epitope index.
epitope_data['tcrdist_scores'] = vdjdb_used.groupby('Epitope').apply(tcrdist.calc_pdist_vector, include_groups=False)

auroc_column = 'AUROC SCEPTR (finetuned)'
for epitope in epitope_data.index:
    # Empirical cumulative distribution of the pairwise distances.
    # NOTE(review): the fractions start at 0, which has no position on the
    # logarithmic y-axis set below — confirm whether (k + 1) / n was intended.
    distances = np.sort(epitope_data.loc[epitope, 'tcrdist_scores'])
    cumulative = np.arange(len(distances)) / len(distances)

    auroc = epitope_data.loc[epitope, auroc_column]
    ax.plot(
        distances,
        cumulative,
        label=f"[{epitope}] {auroc:.2f}",
        c=cmap(normalized_ranks[epitope]),  # colour encodes the AUROC rank
    )

# Raw strings: "\d" is not a valid Python escape sequence and would otherwise
# trigger a SyntaxWarning on recent Python versions.
ax.set_xlabel(r'TCRdist distance $\delta$')
ax.set_ylabel(r'Cumulative probability $P_c(\delta)$')
ax.set_yscale('log')
ax.legend(title='[Epitope] AUROC SCEPTR (finetuned)')
fig.savefig("cumulative_pc_vs_performance.pdf", bbox_inches="tight")