In [None]:
import torch

import numpy as np
import pandas as pd
import scipy.stats

import os
import seaborn as sns
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import matplotlib.patches

""" Profile """

# Load the sample sheet and normalise it in a single chain:
#   * Isolate   - drop the two-character prefix and parse the rest as an int
#                 (e.g. "su001" -> 1)
#   * Treatment - 0 when the label contains "pre", 1 otherwise
#   * rows are ordered by Isolate (ascending), Treatment (ascending) and
#     Tissue (descending), then re-indexed from 0
#   * only the Run, Isolate, Treatment and Tissue columns are kept
profile = (
    pd.read_csv("data/stanford/profile.txt")
    .assign(
        Isolate=lambda df: df["Isolate"].map(lambda name: int(name[2:])),
        Treatment=lambda df: df["Treatment"].map(
            lambda label: 0 if "pre" in label else 1
        ),
    )
    .sort_values(
        by=["Isolate", "Treatment", "Tissue"], ascending=[True, True, False]
    )
    .reset_index(drop=True)
    .loc[:, ["Run", "Isolate", "Treatment", "Tissue"]]
)

""" Functions """

def euclideanDistance(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
    """Pairwise Euclidean distances between the rows of two matrices.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
    result comes from one matrix product instead of an explicit double loop.

    Args:
        x1: array of shape (N1, D).
        x2: array of shape (N2, D).

    Returns:
        Array of shape (N1, N2) whose entry (i, j) is the distance between
        ``x1[i]`` and ``x2[j]``.
    """
    sq_norm_rows = np.square(x1).sum(axis=1, keepdims=True)      # (N1,  1)
    sq_norm_cols = np.square(x2).sum(axis=1, keepdims=True).T    # ( 1, N2)
    twice_cross = 2 * (x1 @ x2.T)                                # (N1, N2)
    squared = sq_norm_rows + sq_norm_cols - twice_cross
    # Floating-point cancellation can leave tiny negative values; clamp them
    # to zero so the square root never produces NaN.
    return np.sqrt(np.maximum(squared, 0))

def pearsonCorrelation(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
    """Pairwise Pearson correlation between the rows of two matrices.

    Each row is treated as one variable observed along axis 1.

    Args:
        x1: array of shape (N1, D).
        x2: array of shape (N2, D).

    Returns:
        Array of shape (N1, N2) whose entry (i, j) is the correlation
        between ``x1[i]`` and ``x2[j]``. A row with zero variance makes the
        denominator zero, so the corresponding entries are NaN/inf.
    """
    # deviations of every row from its own mean
    dev1 = x1 - x1.mean(axis=1, keepdims=True)              # (N1, D)
    dev2 = x2 - x2.mean(axis=1, keepdims=True)              # (N2, D)
    # per-row sums of squared deviations
    ss1 = np.sum(dev1**2, axis=1, keepdims=True)            # (N1, 1)
    ss2 = np.sum(dev2**2, axis=1, keepdims=True)            # (N2, 1)
    cross = np.dot(dev1, dev2.T)                            # (N1, N2)
    scale = np.sqrt(np.dot(ss1, ss2.T))                     # (N1, N2)
    return cross / scale

In [None]:
""" Embedding and UAMP """

method = "finetune"  # pretrain or finetune
isolates = [3, 7]
chromosome = "6"
pval_thresh = 1e-5

# Runs belonging to the selected isolates, in `isolates` order and, within
# one isolate, in profile row order.
runs = [
    run
    for isolate in isolates
    for run in profile[profile["Isolate"] == isolate]["Run"].to_list()
]

# The filter column sits a fixed distance past a base column; the distance is
# the (truncated) negative base-10 exponent of the threshold, e.g. 5 for 1e-5.
# Per the original notes that column counts the variants with
# p-value <= pval_thresh covered by a read - confirm against the file format.
pval_offset = -int(np.log10(pval_thresh))
embd_filter_col = 769 + pval_offset
umap_filter_col = 3 + pval_offset

embd, umap = {}, {}
for run in runs:
    # embedding: keep rows whose filter column is >= 1, then the first 768
    # columns (the embedding itself)
    embd_table = np.load(f"data/stanford/embd-{method}/{run}/{chromosome}.npy")
    embd_keep = embd_table[:, embd_filter_col] >= 1
    embd[run] = embd_table[embd_keep, :][:, :768]
    # umap: same row filter, then the first 2 columns (UMAP coordinates)
    umap_table = np.load(f"data/stanford/umap-{method}/{run}/{chromosome}.npy")
    umap_keep = umap_table[:, umap_filter_col] >= 1
    umap[run] = umap_table[umap_keep, :][:, :2]

In [None]:
""" Plot Distribution of Euclidean Distance """

# Pairwise distances between the reads of the first two selected runs.
x1embd, x2embd = embd[runs[0]], embd[runs[1]]   # (N1, 768), (N2, 768)
distance_matrix = euclideanDistance(x1embd, x2embd)     # (N1, N2)

fig, axs = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
for i in range(2):
    # panel 0: for each read of runs[0], its max distance to runs[1] (axis=1);
    # panel 1: for each read of runs[1], its max distance to runs[0] (axis=0)
    axs[i].hist(np.max(distance_matrix, axis=1 - i), bins=100)
    # title: fields are looked up by column name so the label does not depend
    # on the column order of `profile`, and the builtin `id` is not rebound
    sample = profile.loc[profile["Run"] == runs[i]].iloc[0]
    run_id, isolate = sample["Run"], sample["Isolate"]
    treatment = "Pre" if sample["Treatment"] == 0 else "Post"
    tissue = "Normal" if "normal" in sample["Tissue"] else "Tumor"
    info = f"{run_id} - {isolate} - {treatment} - {tissue}"
    axs[i].set_title(info, fontweight='bold', fontsize=18)
fig.supxlabel("Euclidean Distance", fontweight='bold', y=0.004, fontsize=18)
fig.supylabel("Number of Reads", fontweight='bold', x=0.008, fontsize=18)
fig.tight_layout()
# plt.show() works with the notebook's inline backend; Figure.show() warns
# there because that backend has no GUI event loop.
plt.show()

In [None]:
""" Plot UMAP """

top_k_reads = 4000

gridsize = 100
vmin, vmax = 2, 50
fontsize = 18
axis_limit = 22     # every panel shows the square [-22, 22] x [-22, 22]


def _sample_label(run):
    """Return "<run> - <isolate> - <Pre|Post> - <Normal|Tumor>" for one run."""
    sample = profile.loc[profile["Run"] == run].iloc[0]
    treatment = "Pre" if sample["Treatment"] == 0 else "Post"
    tissue = "Normal" if "normal" in sample["Tissue"] else "Tumor"
    return f"{sample['Run']} - {sample['Isolate']} - {treatment} - {tissue}"


def _draw_hexbin(ax, coords):
    """Draw a hexbin density of (n, 2) UMAP coordinates on a fixed square window."""
    ax.hexbin(
        coords[:, 0], coords[:, 1],
        cmap="Reds", gridsize=gridsize, vmin=vmin, vmax=vmax,
    )
    ax.set_aspect("equal")
    ax.set_xlim(-axis_limit, axis_limit)
    ax.set_ylim(-axis_limit, axis_limit)


def _top_k_mask(scores, k):
    """Boolean mask of the entries >= the k-th largest score (ties are kept).

    k is clamped to [1, len(scores)], so asking for more reads than exist
    selects all of them instead of raising IndexError.
    """
    k = max(1, min(k, scores.size))
    return scores >= np.sort(scores)[-k]


# One row and one column per selected run (a 6 x 6 grid of 5-inch panels for
# six runs); squeeze=False keeps `axs` two-dimensional even for a single run.
n_runs = len(runs)
fig, axs = plt.subplots(
    n_runs, n_runs, figsize=(5 * n_runs, 5 * n_runs),
    sharex=True, sharey=True, squeeze=False,
)
for i in range(n_runs):
    # sample legend: column headers on top, row headers on the right
    info = _sample_label(runs[i])
    axs[0,  i].set_xlabel(info, fontsize=fontsize, fontweight='bold')
    axs[0,  i].xaxis.set_label_position("top")
    axs[i, -1].set_ylabel(info, fontsize=fontsize, fontweight='bold')
    axs[i, -1].yaxis.set_label_position("right")
    # diagonal: umap of sample i without the distance filter
    _draw_hexbin(axs[i, i], umap[runs[i]])
    # off-diagonal: umap of sample i / sample j restricted to the reads with
    # the largest per-read max distance to the other sample
    for j in range(i + 1, n_runs):
        x1embd, x2embd = embd[runs[i]], embd[runs[j]]   # (N1, 768), (N2, 768)
        distance_matrix = euclideanDistance(x1embd, x2embd)     # (N1, N2)
        x1filter = _top_k_mask(np.max(distance_matrix, axis=1), top_k_reads)  # (N1, )
        x2filter = _top_k_mask(np.max(distance_matrix, axis=0), top_k_reads)  # (N2, )
        # the masks are applied to `umap`, which relies on embd[run] and
        # umap[run] holding the same reads in the same order
        _draw_hexbin(axs[i, j], umap[runs[i]][x1filter])    # sample i vs j
        _draw_hexbin(axs[j, i], umap[runs[j]][x2filter])    # sample j vs i
fig.supxlabel("UMAP1", fontsize=fontsize, fontweight='bold', y=0.004)
fig.supylabel("UMAP2", fontsize=fontsize, fontweight='bold', x=0.008)
fig.tight_layout()
fig.savefig("temp.png", dpi=500)
plt.close(fig)

In [None]:
""" hexbin map for all sample """

# NOTE(review): `embedding_umap` (presumably run -> array whose first two
# columns are UMAP coordinates, for every run in `profile`) and
# `embedding_umap_fold` (output directory) are not defined in any cell shown
# here - confirm they exist before running this on a fresh kernel.
fig, axs = plt.subplots(
    4, 6, figsize=(30, 20), sharex=True, sharey=True, dpi=500
)
index = -1      # row of `profile` drawn in the current panel
for i, ax in enumerate(axs.flat):
    # panel 11 is left blank, so the 24-panel grid shows 23 profile rows
    if i == 11:
        ax.axis("off")
        continue
    index += 1

    run = profile["Run"][index]
    coords = embedding_umap[run]
    ax.hexbin(
        coords[:, 0],
        coords[:, 1],
        gridsize=150,
        cmap="Reds",
        vmin=2, vmax=80,
    )

    # frame colour encodes treatment: red = pre (0), blue = post
    frame_color = "r" if profile["Treatment"][index] == 0 else "b"
    for spine in ax.spines.values():
        spine.set_color(frame_color)
        spine.set_linewidth(2)

    ax.set_aspect("equal")
    ax.set_xlim(-16, 16)
    ax.set_ylim(-16, 16)
    ax.set_title(
        "Run: {}; ".format(run) +
        "Isolate: {};".format(profile["Isolate"][index])
    )
fig.legend(
    handles=[
        matplotlib.patches.Patch(color="r", label="pre "),
        matplotlib.patches.Patch(color="b", label="post")
    ], loc="upper right", ncol=2, fontsize=14,
)
# Use `chromosome` here: the bare name `chr` is the Python builtin, which
# would be rendered as "<built-in function chr>" in the title.
fig.suptitle(
    f"Stanford Data; chromosome {chromosome}; SNPs p-val threshold (<=) {pval_thresh}",
    fontweight='bold', y=0.992, fontsize=16
)
fig.supxlabel("UMAP1", fontweight='bold', y=0.005, fontsize=16)
fig.supylabel("UMAP2", fontweight='bold', x=0.010, fontsize=16)
fig.tight_layout()
fig.savefig(
    os.path.join(embedding_umap_fold, "hexbin.png"),
    dpi=500,
)
plt.close(fig)

In [None]:
""" two 1D distribution map """

# NOTE(review): `embedding_umap` (presumably run -> array whose first two
# columns are UMAP coordinates, for every run in `profile`) and
# `embedding_umap_fold` (output directory) are not defined in any cell shown
# here - confirm they exist before running this on a fresh kernel.
x = np.linspace(-30, 30, 500)   # evaluation grid for every density curve

# one panel per isolate, in ascending isolate order (eight 3-inch-wide panels
# for isolates 1..8)
isolate_ids = sorted(profile["Isolate"].unique())

for u in [0, 1]:
    # reference density: all runs pooled together along UMAP axis u
    base = scipy.stats.gaussian_kde(
        np.concatenate([embedding_umap[run][:, u] for run in profile["Run"]])
    )(x)
    # squeeze=False keeps `axs` an array even when there is a single isolate
    fig, axs = plt.subplots(
        1, len(isolate_ids), figsize=(3 * len(isolate_ids), 6),
        sharex=True, sharey=True, dpi=500, squeeze=False,
    )
    for ax, i in zip(axs.flat, isolate_ids):
        samples = profile[profile["Isolate"] == i]
        # iterate run and treatment together so the colour comes from a
        # scalar, not from the truth value of a one-element array
        for run, treatment in zip(samples["Run"], samples["Treatment"]):
            density = scipy.stats.gaussian_kde(embedding_umap[run][:, u])(x)
            # curve = this run's density minus the pooled density;
            # red = pre (0), blue = post
            ax.plot(x, density - base, c='r' if treatment == 0 else 'b')
        ax.set_xlim(-25, 25)
        ax.set_ylim(-0.008, 0.008)
        ax.set_title(f"Isolate: {i}")
    fig.legend(
        handles=[
            matplotlib.patches.Patch(color="r", label="pre "),
            matplotlib.patches.Patch(color="b", label="post")
        ], loc="upper right", ncol=2, fontsize=14,
    )
    # Use `chromosome` here: the bare name `chr` is the Python builtin, which
    # would be rendered as "<built-in function chr>" in the title.
    fig.suptitle(
        f"Stanford Data; chromosome {chromosome}; SNPs p-val threshold (<=) {pval_thresh}",
        fontweight='bold', y=0.992, fontsize=16
    )
    fig.supxlabel(f"UMAP{u+1}", fontweight='bold', y=0.005, fontsize=16)
    fig.supylabel("Density Different", fontweight='bold', x=0.010, fontsize=16)
    fig.tight_layout()
    fig.savefig(
        os.path.join(embedding_umap_fold, f"distribution{u+1}.png"),
        dpi=500,
    )
    plt.close(fig)