In [1]:
import os
from pathlib import Path

# Resolve the project root (the parent of the starting working directory) only
# once: on a re-run PROJECT_ROOT already exists, so the value is kept instead
# of climbing a further directory level after the chdir below has taken effect.
PROJECT_ROOT = (
    globals()["PROJECT_ROOT"]
    if "PROJECT_ROOT" in globals()
    else Path.cwd().parent.resolve()
)

# Make the project root the working directory for the rest of the notebook.
os.chdir(PROJECT_ROOT)

In [None]:
from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from paths import DATA_DIR
from pyrepseq.metric import tcr_metric
from sceptr import variant
import seaborn as sns
from scipy import stats

# Layer the project's own style sheet on top of the stock ggplot theme; styles
# in the list are applied first to last, so "my.mplstyle" wins on conflicts.
plt.style.use(["ggplot", "my.mplstyle"])

In [3]:
# Read the preprocessed Tanno test split, then keep a 1000-row subsample.
# The fixed random_state makes the same rows come back on every run.
tanno_test_path = DATA_DIR / "preprocessed" / "tanno" / "test.csv"
tanno_test = pd.read_csv(tanno_test_path)
tanno_sample = tanno_test.sample(n=1000, random_state=420)

In [None]:
# Preview the first five sampled rows as a sanity check on the loaded columns.
tanno_sample.head(n=5)

In [5]:
# Instantiate every model compared below: the default SCEPTR variant plus the
# three TCRdist metrics (full, alpha-only and beta-only) from pyrepseq.
sceptr_model = variant.default()
tcrdist_model, tcrdist_a_model, tcrdist_b_model = (
    metric_cls()
    for metric_cls in (
        tcr_metric.Tcrdist,
        tcr_metric.AlphaTcrdist,
        tcr_metric.BetaTcrdist,
    )
)

In [None]:
# Run every model's calc_pdist_vector over the same TCR sample so that the
# resulting distance vectors can be compared entry by entry further down.
sceptr_pdist, tcrdist_pdist, tcrdist_a_pdist, tcrdist_b_pdist = (
    model.calc_pdist_vector(tanno_sample)
    for model in (sceptr_model, tcrdist_model, tcrdist_a_model, tcrdist_b_model)
)

### Overview plot

In [7]:
def random_subsample_indices(k: int, out_of: int, seed=None):
    """Draw ``k`` distinct indices uniformly at random from ``range(out_of)``.

    Args:
        k: Number of indices to draw.
        out_of: Population size; indices are drawn from ``0 .. out_of - 1``.
        seed: Optional seed (anything ``numpy.random.default_rng`` accepts,
            including an existing ``Generator``) for a reproducible draw.
            When ``None`` (the default) NumPy's global random state is used,
            which matches the behaviour before this parameter existed.

    Returns:
        A 1-D NumPy array holding ``k`` unique integer indices.

    Raises:
        ValueError: If ``k`` exceeds ``out_of``; sampling is done without
            replacement, so more indices than the population cannot be drawn.
    """
    if seed is None:
        # Keep using the legacy global state so existing callers (and any
        # np.random.seed call made elsewhere) behave exactly as before.
        return np.random.choice(out_of, k, replace=False)

    return np.random.default_rng(seed).choice(out_of, k, replace=False)

In [8]:
# Stack the two distance vectors so that every column of `coords` is one TCR
# pair, described by (SCEPTR distance, TCRdist distance).
coords = np.vstack([sceptr_pdist, tcrdist_pdist])

# Fit the KDE on a random subset of pairs to keep it cheap. The indices must
# be drawn over the number of *pairs* (columns of `coords`), not the number of
# TCRs: drawing from len(tanno_sample) only ever selected the first few
# columns, i.e. almost exclusively pairs involving the first TCR.
num_pairs = coords.shape[1]
kde_sample_size = min(1000, num_pairs)  # never ask for more pairs than exist
coords_1k = coords[:, random_subsample_indices(kde_sample_size, num_pairs)]
gaussian_kde = stats.gaussian_kde(coords_1k)

# Evaluate the fitted density at every pair; used to colour the scatter plot.
density_estimates = gaussian_kde(coords)

In [None]:
# Overview: SCEPTR distance against TCRdist distance for every pair, with each
# point coloured by the KDE density estimate computed for it.
fig, ax = plt.subplots(figsize=(4, 4))

density_scatter = ax.scatter(coords[0], coords[1], s=1, c=density_estimates)

ax.set_xlabel("SCEPTR distance")
ax.set_ylabel("TCRdist distance")
cb = fig.colorbar(density_scatter, ax=ax)
cb.set_label("Estimated density")

plt.show()

### Why do some pairs with similar TCRdist distances have different SCEPTR distances?

In [10]:
# Calculate the average and the minimum p_Gen of every TCR pair.

# Per-TCR p_Gen: the product of the alpha- and beta-chain values, computed
# column-wise rather than through a Python-level row-by-row apply.
pgens = (tanno_sample["alpha_pgen"] * tanno_sample["beta_pgen"]).to_numpy()

num_tcrs = len(tanno_sample)

# np.triu_indices(n, k=1) lists every (i, j) with i < j in row-major order,
# which is exactly the order the former nested loop visited the pairs in
# (anchor 0 against 1..n-1, then anchor 1 against 2..n-1, and so on).
# NOTE(review): this ordering is assumed to match calc_pdist_vector's output,
# as the plots below index these arrays alongside the pdist vectors - confirm.
anchor_idcs, comparison_idcs = np.triu_indices(num_tcrs, k=1)

avg_pgens = (pgens[anchor_idcs] + pgens[comparison_idcs]) / 2
# np.minimum propagates NaN symmetrically, whereas the builtin min() returned
# a result that depended on which of the two operands was NaN.
min_pgens = np.minimum(pgens[anchor_idcs], pgens[comparison_idcs])

In [None]:
# Same scatter as the overview, but coloured on a log scale by the smaller of
# the two p_Gen values in each pair.
fig, ax = plt.subplots(figsize=(4, 4))

pgen_scatter = ax.scatter(
    sceptr_pdist,
    tcrdist_pdist,
    s=1,
    c=min_pgens,
    norm=mcolors.LogNorm(),
)

ax.set_xlabel("SCEPTR distance")
ax.set_ylabel("TCRdist distance")
cb = fig.colorbar(pgen_scatter, ax=ax)
cb.set_label(r"$p_{Gen}$")

plt.show()

In [None]:
# For every pair, take the smaller and the larger of the alpha-only and
# beta-only TCRdist distances (element-wise across the two vectors).
tcrdist_min_chain = np.minimum(tcrdist_a_pdist, tcrdist_b_pdist)
tcrdist_max_chain = np.maximum(tcrdist_a_pdist, tcrdist_b_pdist)

# Colour each pair by how far apart its two single-chain distances are.
chain_gap = tcrdist_max_chain - tcrdist_min_chain

fig, ax = plt.subplots(figsize=(4, 4))

gap_scatter = ax.scatter(sceptr_pdist, tcrdist_pdist, s=1, c=chain_gap)

ax.set_xlabel("SCEPTR distance")
ax.set_ylabel("TCRdist distance")
cb = fig.colorbar(gap_scatter, ax=ax)
cb.set_label(r"Difference between $\alpha$ and $\beta$ chain TCRdist distances")

plt.show()

In [19]:
def plot_line_best_fit(x, y, ax = None, log_y = False):
    """Draw the least-squares straight line through ``(x, y)`` on an axes.

    The line spans the x-limits the target axes has at call time, so the data
    should be plotted on it first.

    Args:
        x: 1-D sequence of x values.
        y: 1-D sequence of y values, the same length as ``x``.
        ax: Axes to draw on. Defaults to the current pyplot axes.
        log_y: If true, fit the line to ``log(y)`` (natural log) and map the
            fitted values back with ``exp`` before plotting, i.e. fit an
            exponential curve in the original y units.

    Returns:
        None. The line is added to ``ax`` as a side effect.
    """
    if ax is None:
        ax = plt.gca()

    fit_target = np.log(y) if log_y else y

    # Solve [x, 1] @ [slope, intercept] ~= fit_target in the least-squares sense.
    design_matrix = np.column_stack([x, np.ones(len(x))])
    slope, intercept = np.linalg.lstsq(design_matrix, fit_target, rcond=None)[0]

    # Read the limits from the axes being drawn on; the previous use of
    # plt.gca() here took them from the wrong axes whenever `ax` was passed
    # explicitly and was not the current one.
    xx = np.linspace(*ax.get_xlim())
    yy = slope*xx + intercept

    if log_y:
        yy = np.exp(yy)

    ax.plot(xx, yy)

In [None]:
# Pairs that SCEPTR places at almost the same distance (0.98 to 1.02), limited
# to pairs whose minimum p_Gen is non-zero so that its log10 is finite.
non_zero_pgen = min_pgens > 0
in_sceptr_band = (sceptr_pdist >= 0.98) & (sceptr_pdist <= 1.02)
close_sceptr_mask = in_sceptr_band & non_zero_pgen

# Select the two plotted quantities once and reuse them below.
band_tcrdist = tcrdist_pdist[close_sceptr_mask]
band_log_pgen = np.log10(min_pgens[close_sceptr_mask])

fig, ax = plt.subplots(figsize=(4, 4))

ax.scatter(band_tcrdist, band_log_pgen)
plot_line_best_fit(band_tcrdist, band_log_pgen, ax=ax)

# Annotate the plot with the Pearson correlation and its p-value.
cor_results = stats.pearsonr(band_tcrdist, band_log_pgen)
ax.text(100, -28, f"$r = {cor_results.statistic:.3f}$\n$p = {cor_results.pvalue:.2e}$")

ax.set_xlabel("TCRdist distance")
ax.set_ylabel(r"$\log_{10}(p_{Gen})$")

plt.show()

In [None]:
# Pairs that TCRdist places at almost the same distance (145 to 155), limited
# to pairs with a non-zero minimum p_Gen (mask defined in the previous cell).
in_tcrdist_band = (tcrdist_pdist >= 145) & (tcrdist_pdist <= 155)
close_tcrdist_mask = in_tcrdist_band & non_zero_pgen

# Select the two plotted quantities once and reuse them below.
band_sceptr = sceptr_pdist[close_tcrdist_mask]
band_log_pgen = np.log10(min_pgens[close_tcrdist_mask])

fig, ax = plt.subplots(figsize=(4, 4))

ax.scatter(band_sceptr, band_log_pgen)
plot_line_best_fit(band_sceptr, band_log_pgen, ax=ax)

# Annotate the plot with the Pearson correlation and its p-value.
cor_results = stats.pearsonr(band_sceptr, band_log_pgen)
ax.text(0.9, -29, f"$r = {cor_results.statistic:.3f}$\n$p = {cor_results.pvalue:.3f}$")

ax.set_xlabel("SCEPTR distance")
ax.set_ylabel(r"$\log_{10}(p_{Gen})$")

plt.show()

In [63]:
# Investigate power means: for each exponent, combine the alpha-only and
# beta-only TCRdist distances of every pair with a power mean and record the
# Pearson correlation between that combined distance and the SCEPTR distance.
alphas = np.linspace(1 - 50, 1 + 50, 101)  # exponents from -49 to 51

# The stacked per-chain distances do not depend on the exponent, so build them
# once; row 0 is the alpha-chain vector and row 1 the beta-chain vector.
chain_dists = np.vstack([tcrdist_a_pdist, tcrdist_b_pdist])

rhos = []
for alpha in alphas:
    combined_dist = stats.pmean(chain_dists, p=alpha, axis=0)
    rhos.append(stats.pearsonr(sceptr_pdist, combined_dist).statistic)

In [None]:
# Correlation with SCEPTR as a function of the power-mean exponent.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(alphas, rhos)

ax.set_xlabel('Power mean exponent')
ax.set_ylabel('Pearson r')