In [None]:
from pathlib import Path

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import vizta

from umap import UMAP

# Make sure the output directory for saved figures exists (no error if it
# is already there).
Path("figures").mkdir(exist_ok=True)
# Apply the vizta matplotlib theme and keep its return value in `pal`.
pal = vizta.mpl.set_theme(context="poster", style="talusbio")

# Both saved objects sit side by side in the same data directory.
# NOTE(review): depending on the torch version, torch.load may unpickle
# arbitrary objects -- only point this at trusted files.
base_path = Path("../data/colab/psm-embedding/")
spectra, peptides = (
    torch.load(base_path / fname) for fname in ("spectra.pt", "peptides.pt")
)

In [None]:
# Fit a 2-D UMAP projection (cosine metric, fixed seed) on the spectrum
# embeddings, then map the peptide embeddings through the same fitted
# reducer so both sets share one coordinate system.
n_spec = len(spectra)
umap_options = {"n_components": 2, "metric": "cosine", "random_state": 42}
reducer = UMAP(**umap_options)
spec_emb = reducer.fit_transform(spectra.numpy())
print("Embedding peptides...")
pep_emb = reducer.transform(peptides.numpy())

In [None]:
# Compare the similarity of matched spectrum/peptide embeddings with the
# similarity of randomly paired ones.
torch.manual_seed(42)  # fixed seed so the random pairing is reproducible

# Matched pairs: spectra and peptides are compared position by position
# (assumes both are (n, dim) tensors with the same n -- TODO confirm).
cosine_sim_target = F.cosine_similarity(spectra, peptides)

# Random pairs: draw rows with replacement, independently for each side.
# Each index is bounded by the tensor it actually indexes, so the peptide
# draw can never run past the end of `peptides`.
n_pairs = spectra.shape[0]
rand_spec = spectra[torch.randint(spectra.shape[0], size=(n_pairs,)), :]
rand_pep = peptides[torch.randint(peptides.shape[0], size=(n_pairs,)), :]
cosine_sim_decoy = F.cosine_similarity(rand_spec, rand_pep)

# The "sim" column holds the cosine *distance* (1 - similarity); the column
# name is kept as-is because the plotting code reads it by that name.
# Tensors are converted to NumPy explicitly instead of relying on pandas'
# version-dependent handling of torch tensors.
sim_df = pd.DataFrame(
    {"sim": (1 - cosine_sim_target).numpy(), "Label": "Matched Pairs"}
)
rand_df = pd.DataFrame(
    {"sim": (1 - cosine_sim_decoy).numpy(), "Label": "Random Pairs"}
)
# ignore_index=True gives the stacked frame a unique 0..2n-1 index; without
# it every label appears twice, which can break index-aligned operations.
sim_df = pd.concat([sim_df, rand_df], ignore_index=True)

In [None]:
# Two-panel figure: cumulative cosine-distance counts on the left, the UMAP
# projection of both embedding sets on the right.
fig, axs = plt.subplots(1, 2, figsize=(26, 10.43))
ecdf_ax, umap_ax = axs

# Right panel: one scatter layer per embedding set, drawn with tiny markers.
for points, series_name in ((spec_emb, "Mass Spectra"), (pep_emb, "Peptides")):
    umap_ax.scatter(points[:, 0], points[:, 1], s=1, label=series_name)
# markerscale enlarges the legend handles so the size-1 markers stay visible.
umap_ax.legend(frameon=False, loc="lower left", markerscale=4.0)
umap_ax.axis("equal")
# Tick marks are removed from both axes; only the axis names are shown.
umap_ax.set_xticks([])
umap_ax.set_yticks([])
umap_ax.set_xlabel("UMAP 1")
umap_ax.set_ylabel("UMAP 2")
umap_ax.grid(False)

# Left panel: empirical CDF of the "sim" column (1 - cosine similarity),
# one curve per "Label" value, with counts on the y-axis.
sns.ecdfplot(sim_df, x="sim", hue="Label", ax=ecdf_ax, stat="count")
ecdf_ax.set_xlabel("Cosine Distance")
ecdf_ax.set_ylabel("Number of PSMs")
ecdf_ax.grid(False)

fig.tight_layout()
fig.savefig("figures/embedding.png", dpi=300, transparent=True)