In [None]:
import os
from pathlib import Path

# Pin the working directory to the project root (the parent of the directory
# the kernel starts in). The root is only computed when it has not been
# defined yet, so re-running this cell does not climb one more level up.
try:
    PROJECT_ROOT
except NameError:
    PROJECT_ROOT = Path.cwd().parent.resolve()

os.chdir(PROJECT_ROOT)

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from paths import DATA_DIR
from pyrepseq.metric import tcr_metric
from sceptr import variant
from scipy import stats

# Styles are applied in list order, so settings in "my.mplstyle" override
# the "ggplot" base style wherever both define the same rcParam.
plt.style.use(["ggplot", "my.mplstyle"])

In [None]:
# Instantiate the two TCR distance models that will be compared
sceptr_model = variant.default()
tcrdist_model = tcr_metric.Tcrdist()

# Draw a fixed-seed sample of 1000 TCRs from the Tanno et al. test subset
tanno_test = pd.read_csv(DATA_DIR / "preprocessed" / "tanno" / "test.csv")
subsample = tanno_test.sample(n=1000, random_state=420)

# Measure pdist of the same TCR set under each model (SCEPTR first, then TCRdist)
sceptr_pdist, tcrdist_pdist = (
    model.calc_pdist_vector(subsample)
    for model in (sceptr_model, tcrdist_model)
)

In [None]:
# Calculate density estimates
#
# Each column of `coords` is one (SCEPTR distance, TCRdist distance) point.
# The KDE is fitted on a random subset of those points and then evaluated at
# every point; the resulting densities are used to colour the scatter plot.
# NOTE(review): the subset is presumably there to keep the KDE tractable — confirm.
KDE_FIT_SIZE = 1000  # maximum number of points used to fit the KDE
KDE_SEED = 420       # fixed seed so the fit subset is identical on every run

coords = np.vstack([sceptr_pdist, tcrdist_pdist])

# Use a dedicated seeded generator instead of the unseeded global NumPy RNG,
# so Restart & Run All reproduces the same density estimates.
kde_rng = np.random.default_rng(KDE_SEED)

# Sampling without replacement cannot draw more points than exist, so clamp
# the requested size to the number of available columns.
n_fit = min(KDE_FIT_SIZE, coords.shape[1])
fit_indices = kde_rng.choice(coords.shape[1], size=n_fit, replace=False)

coords_subsampled = coords[:, fit_indices]
gaussian_kde = stats.gaussian_kde(coords_subsampled)
density_estimates = gaussian_kde(coords)

In [None]:
# Scatter the two pairwise-distance vectors against each other, colouring
# each point by its density estimate.
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(sceptr_pdist, tcrdist_pdist, s=1, c=density_estimates)
ax.set_xlabel("SCEPTR distance")
ax.set_ylabel("TCRdist distance")
plt.show()