In [None]:
import torch

import cuml
import numpy as np
import pandas as pd
import scipy.stats

import os
import pysam
import joblib
import seaborn as sns
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import matplotlib.patches

# dataset
# Load the sample sheet and normalise it in a single chain:
#   * Isolate:   string id (e.g. "su001") -> integer with the 2-char prefix removed (e.g. 1)
#   * Treatment: 0 when the label contains "pre", otherwise 1
#   * rows ordered by Isolate (ascending), Treatment (ascending), Tissue (descending),
#     with a fresh 0..n-1 index
#   * only the Run / Isolate / Treatment / Tissue columns are retained
profile = (
    pd.read_csv("data/stanford/profile.txt")
    .assign(
        Isolate=lambda frame: frame["Isolate"].map(lambda name: int(name[2:])),
        Treatment=lambda frame: frame["Treatment"].map(
            lambda label: int("pre" not in label)
        ),
    )
    .sort_values(
        by=["Isolate", "Treatment", "Tissue"], ascending=[True, True, False]
    )
    .reset_index(drop=True)
)[["Run", "Isolate", "Treatment", "Tissue"]]

In [None]:
""" UMAP Embedding """

# For every run in `profile`, obtain `reducer.transform(embedding_dnabert2[run])`
# and cache it as `<run>.csv` so later executions only pay for a file read.
# NOTE(review): `data_fold_gpu`, `umap_ckpt`, `chr`, `pval_thresh`, `reducer` and
# `embedding_dnabert2` are not defined in the cells visible here; they have to be
# created by earlier cells before this one runs.
embedding_umap = {}
embedding_umap_fold = os.path.join(
    data_fold_gpu, f"umap{umap_ckpt}/chr{chr}pval{int(-np.log10(pval_thresh))}"
)
# exist_ok=True replaces the separate exists() check and keeps the cell re-runnable.
os.makedirs(embedding_umap_fold, exist_ok=True)
for run in tqdm.tqdm(profile["Run"], smoothing=0.0, unit="run"):
    embedding_umap_save_path = os.path.join(embedding_umap_fold, f"{run}.csv")
    if os.path.exists(embedding_umap_save_path):
        # ndmin=2 keeps a single-row cache file two-dimensional, so the
        # `[:, k]` column indexing used by the plotting cells still works.
        embedding_umap[run] = np.loadtxt(
            embedding_umap_save_path, delimiter=",", ndmin=2
        )
    else:
        embedding_umap[run] = reducer.transform(embedding_dnabert2[run])
        # Write under a temporary name and rename once complete: an interrupted
        # savetxt must not leave a truncated CSV that the next run would trust.
        embedding_umap_tmp_path = embedding_umap_save_path + ".tmp"
        np.savetxt(embedding_umap_tmp_path, embedding_umap[run], delimiter=",")
        os.replace(embedding_umap_tmp_path, embedding_umap_save_path)

In [None]:
""" hexbin map for all sample """

# One hexbin density panel per run on a 4x6 grid. The spine colour encodes the
# treatment flag (red = 0 / pre, blue = otherwise / post). The figure is written
# to `hexbin.png` inside `embedding_umap_fold` and then closed.
fig, axs = plt.subplots(
    4, 6, figsize=(30, 20), sharex=True, sharey=True, dpi=500
)
index = -1
for i, ax in enumerate(axs.flat):
    # Panel 11 is always left blank (presumably a layout choice -- confirm).
    # Panels beyond the last run are hidden as well instead of raising a KeyError.
    if i == 11 or index + 1 >= len(profile):
        ax.axis("off")
        continue
    index += 1

    # Positional row access: one lookup per panel instead of repeated label lookups.
    sample = profile.iloc[index]
    points = embedding_umap[sample["Run"]]

    # NOTE(review): no `extent` is passed, so each panel's hexagon grid is derived
    # from that run's own data range; bin sizes may therefore differ between panels
    # even though vmin/vmax are shared -- confirm this is intended.
    ax.hexbin(
        points[:, 0],
        points[:, 1],
        gridsize=150,
        cmap="Reds",
        vmin=2, vmax=80,
    )

    spine_color = "r" if sample["Treatment"] == 0 else "b"
    for spine in ax.spines.values():
        spine.set_color(spine_color)
        spine.set_linewidth(2)

    ax.set_aspect("equal")
    ax.set_xlim(-16, 16)
    ax.set_ylim(-16, 16)
    ax.set_title(
        "Run: {}; ".format(sample["Run"]) +
        "Isolate: {};".format(sample["Isolate"])
    )
fig.legend(
    handles=[
        matplotlib.patches.Patch(color="r", label="pre "),
        matplotlib.patches.Patch(color="b", label="post")
    ], loc="upper right", ncol=2, fontsize=14,
)
fig.suptitle(
    f"Stanford Data; chromosome {chr}; SNPs p-val threshold (<=) {pval_thresh}",
    fontweight='bold', y=0.992, fontsize=16
)
fig.supxlabel("UMAP1", fontweight='bold', y=0.005, fontsize=16)
fig.supylabel("UMAP2", fontweight='bold', x=0.010, fontsize=16)
fig.tight_layout()
fig.savefig(
    os.path.join(embedding_umap_fold, "hexbin.png"),
    dpi=500,
)
# Close this specific figure rather than whichever figure happens to be current.
plt.close(fig)

In [None]:
""" two 1D distribution map """

# For each UMAP axis (column 0 and column 1): estimate a pooled KDE over all runs,
# then plot, per isolate, each run's KDE minus that pooled baseline. Lines are red
# for Treatment == 0 (pre) and blue otherwise (post). One PNG is saved per axis.
x = np.linspace(-30, 30, 500)

for u in [0, 1]:
    # Baseline: density of the concatenated coordinates of every run.
    base = scipy.stats.gaussian_kde(
        np.concatenate([embedding_umap[run][:, u] for run in profile["Run"]])
    )(x)
    fig, axs = plt.subplots(1, 8, figsize=(24, 6), sharex=True, sharey=True, dpi=500)
    for i in profile["Isolate"].unique():
        # Isolate ids are used as 1-based panel positions on the 1x8 grid.
        ax = axs.flat[i-1]
        isolate_rows = profile[profile["Isolate"] == i]
        # Walk Run and Treatment together so the colour comes from a scalar value
        # instead of the truthiness of a filtered array comparison.
        for run, treatment in zip(isolate_rows["Run"], isolate_rows["Treatment"]):
            density = scipy.stats.gaussian_kde(embedding_umap[run][:, u])(x)
            ax.plot(
                x, density - base,
                c='r' if treatment == 0 else 'b'
            )
        # Per-panel settings do not depend on the run, so apply them once.
        ax.set_xlim(-25, 25)
        ax.set_ylim(-0.008, 0.008)
        ax.set_title(f"Isolate: {i}")
    fig.legend(
        handles=[
            matplotlib.patches.Patch(color="r", label="pre "),
            matplotlib.patches.Patch(color="b", label="post")
        ], loc="upper right", ncol=2, fontsize=14,
    )
    fig.suptitle(
        f"Stanford Data; chromosome {chr}; SNPs p-val threshold (<=) {pval_thresh}",
        fontweight='bold', y=0.992, fontsize=16
    )
    fig.supxlabel(f"UMAP{u+1}", fontweight='bold', y=0.005, fontsize=16)
    fig.supylabel("Density Different", fontweight='bold', x=0.010, fontsize=16)
    fig.tight_layout()
    fig.savefig(
        os.path.join(embedding_umap_fold, f"distribution{u+1}.png"),
        dpi=500,
    )
    # Close the figure created in this iteration explicitly to release its memory.
    plt.close(fig)