# Preprocessing

### Loading

In [2]:
import os
from bisect import bisect_right, bisect_left

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rsatoolbox
import seaborn as sns
from cliffs_delta import cliffs_delta
from rsatoolbox.data import Dataset
from rsatoolbox.rdm import calc_rdm
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import cdist
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist, squareform
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr, t
from skimage import filters, color
from sklearn.preprocessing import normalize
from tqdm import tqdm

### Constants

In [None]:
# Model keys as they appear in the embedding/RDM file names, and the display
# names used for plot labels. The two lists are paired by position.
models = ['uni2', 'virchow2', 'prov', 'conch', 'plip', 'keep', 'dinov2']
model_tags = ['UNI2', 'Virchow2', 'Prov-Gigapath', 'CONCH', 'PLIP', 'KEEP', 'ViT-Dinov2']
# Cancer-type codes as they appear in the embedding file names.
cancer_types = ['BRCA', 'COAD', 'LUAD', 'LUSC']

# Each source embedding file is expected to hold total_slides * total_patches rows.
n_batches = 5
total_slides = 250
total_patches = 250

# Integer division: any remainder slides/patches are left out of the batches.
num_slides_per_batch = total_slides // n_batches
num_patches_per_batch = total_patches // n_batches


In [None]:
# Root directory for every input and output of this notebook.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
PROJECT_SAVE_DIR = '/lotterlab/users/vmishra/RSA_08282025/'

# PROJECT_SAVE_DIR already ends in '/', so these f-strings contain a double
# slash ('...RSA_08282025//embeddings/').
orig_embeddings_path = f"{PROJECT_SAVE_DIR}/embeddings/"
batched_embeddings_path = f"{PROJECT_SAVE_DIR}/embeddings-batched/"



In [None]:
# Switch between the plain and the '-normalized' variant of the files.
# norm_tag is inserted into embedding/RDM file names and into the output
# directory names below.
normalized = False
norm_tag = '-normalized' if normalized else ''

# Output directories; both end in '/' because later cells build file names
# by plain string concatenation (plot_path + "...").
plot_path = f"{PROJECT_SAVE_DIR}/plots{norm_tag}/"
rdm_path = f"{PROJECT_SAVE_DIR}/rdms{norm_tag}/"

In [None]:
# Make sure every output directory exists before anything is written to it;
# directories that are already present are left untouched.
for d in (batched_embeddings_path, rdm_path, plot_path):
    os.makedirs(d, exist_ok=True)

### Splitting Into 5 Batches of 50 slides/50 patches each for validation

In [None]:
# Cut every (cancer type, model) embedding file into n_batches smaller files.
# Batch b keeps slides [b * num_slides_per_batch, (b + 1) * num_slides_per_batch)
# and, for each of those slides, only the first num_patches_per_batch patches.
for cancer_type in cancer_types:
    for model in models:
        file_path = os.path.join(orig_embeddings_path, f"embeddings_{cancer_type}{norm_tag}-{model}.npy")
        print(f"Processing {file_path}...")

        embeddings = np.load(file_path)
        num_total = total_slides * total_patches
        embedding_dim = embeddings.shape[1]
        assert embeddings.shape[0] == num_total, f"Unexpected shape for {file_path}"

        # View the flat row list as (slide, patch, feature).
        # Assumes rows are stored slide by slide — TODO confirm against the extraction code.
        embeddings = embeddings.reshape(total_slides, total_patches, embedding_dim)

        for batch_idx in range(n_batches):
            start_slide = num_slides_per_batch * batch_idx
            end_slide = start_slide + num_slides_per_batch

            # Select the slide range, truncate the patch axis, then flatten
            # back to one row per patch.
            batch = embeddings[start_slide:end_slide, :num_patches_per_batch, :].reshape(-1, embedding_dim)

            batch_filename = f"embeddings_{cancer_type}{norm_tag}-{model}-batch{batch_idx}.npy"
            batch_path = os.path.join(batched_embeddings_path, batch_filename)
            np.save(batch_path, batch)
            print(f"Saved batch {batch_idx} to {batch_path}")

# RDMs

In [None]:
# Build and save one RDM per (model, batch). The four cancer types of a batch
# are stacked in `cancer_types` order, so rows of the RDM are grouped by disease.
for model in models:
    for batch_idx in range(n_batches):
        print(f"Processing RDM for model: {model}, batch: {batch_idx}")

        embeddings_list, labels = [], []
        for cancer_type in cancer_types:
            file_path = os.path.join(batched_embeddings_path, f"embeddings_{cancer_type}{norm_tag}-{model}-batch{batch_idx}.npy")
            emb = np.load(file_path)
            embeddings_list.append(emb)
            # One disease label per embedding row.
            labels.extend([cancer_type] * len(emb))

        embeddings = np.concatenate(embeddings_list, axis=0)

        # calc_rdm is called with method='euclidean'; only the first matrix
        # returned by get_matrices() is kept.
        dataset = Dataset(measurements=embeddings, obs_descriptors={'disease': labels})
        rdm = calc_rdm(dataset, method='euclidean')
        rdm_matrix = rdm.get_matrices()[0]

        save_path = os.path.join(rdm_path, f"rdm_matrix_{model}{norm_tag}_batch{batch_idx}.npy")
        np.save(save_path, rdm_matrix)
        print(f"Saved RDM to {save_path}")

In [None]:
def make_plot(rdm, model_name):
    """Draw an RDM as a heatmap with one labelled block per cancer type and save it.

    Parameters
    ----------
    rdm : 2-D array
        Square dissimilarity matrix. Its rows/columns are assumed to be grouped
        by cancer type, in the order of ``cancer_types``, with equally sized
        groups — TODO confirm against the cell that builds the RDMs.
    model_name : str
        Used as the figure title and as part of the output file name.

    Side effects
    ------------
    Creates a new matplotlib figure and writes
    ``plot_path + "rdm_matrix" + model_name + norm_tag + ".png"``.
    """
    # Derive the block layout from the label list instead of hard-coding 4,
    # so tick labels and divider lines always match `cancer_types`.
    n_groups = len(cancer_types)
    n = rdm.shape[0] / n_groups
    tick_positions = [g * n + n // 2 for g in range(n_groups)]
    divider_positions = [g * n for g in range(1, n_groups)]

    plt.figure(figsize=(10, 8))
    ax = sns.heatmap(rdm, cmap='Blues', annot=False, cbar=True)
    ax.collections[0].colorbar.set_label("Normalized Distance", fontsize=12)
    plt.title(model_name, fontweight='bold', fontsize=20)
    plt.xticks(tick_positions, cancer_types, rotation=0, ha='center', fontsize=12, fontweight='bold')
    plt.yticks(tick_positions, cancer_types, rotation=0, fontsize=12, fontweight='bold')

    # Add divider lines between disease types
    for pos in divider_positions:
        ax.axhline(pos, color='black', linewidth=1.5)
        ax.axvline(pos, color='black', linewidth=1.5)

    plt.savefig(plot_path + "rdm_matrix" + model_name + norm_tag + ".png", dpi=300, bbox_inches='tight')

# Plot the first batch's RDM of every model, scaled by its maximum so the
# colour bar reads as a normalised distance.
# The file name must use the loop variable `m`; the previous `{model}` picked
# up a stale variable left over from an earlier cell, so every figure showed
# the same RDM. The display name is called `tag` so it does not shadow
# `scipy.stats.t`.
for m, tag in zip(models, model_tags):
    print(m)
    rdm_mat = np.load(os.path.join(rdm_path, f"rdm_matrix_{m}{norm_tag}_batch0.npy"))  # just plot first batch
    rdm_mat = rdm_mat / rdm_mat.max()
    make_plot(rdm_mat, tag)

# Spearman and Cosine Similarity Heatmap

In [None]:
n_models = len(models)

# One (n_models x n_models) matrix per summary statistic (mean / lowest /
# highest value across batches), separately for each similarity measure.
# Each name gets its own freshly allocated zero matrix.
cosine_mean, cosine_lower, cosine_upper = (
    np.zeros((n_models, n_models)) for _ in range(3)
)
spearman_mean, spearman_lower, spearman_upper = (
    np.zeros((n_models, n_models)) for _ in range(3)
)

def get_range(values):
    """Summarise `values` as a ``(mean, minimum, maximum)`` tuple."""
    summaries = (np.mean, np.min, np.max)
    return tuple(summarise(values) for summarise in summaries)

# Compare the RDMs of every pair of models, batch by batch.
#
# The batch loop is the outer loop so that each RDM file is read exactly once
# per batch (n_models loads) instead of once per model pair (2 * n_models**2
# loads). Cosine similarity and Spearman correlation are both symmetric in
# their arguments, so each unordered pair is computed once and mirrored.
cosine_by_batch = np.zeros((n_batches, len(models), len(models)))
spearman_by_batch = np.zeros((n_batches, len(models), len(models)))

for batch in range(n_batches):
    # Upper triangles (diagonal excluded) of this batch's RDMs, in `models` order.
    upper_triangles = []
    for model_name in models:
        rdm_full = np.load(os.path.join(rdm_path, f"rdm_matrix_{model_name}{norm_tag}_batch{batch}.npy"))
        upper_triangles.append(rdm_full[np.triu_indices_from(rdm_full, k=1)])
        del rdm_full  # only the extracted triangle is needed from here on

    for i in range(len(models)):
        for j in range(i, len(models)):
            tri_i, tri_j = upper_triangles[i], upper_triangles[j]

            # scipy's `cosine` is a distance, hence the 1 - ...
            cos_sim = 1 - cosine(tri_i, tri_j)
            spearman_corr, _ = spearmanr(tri_i, tri_j)

            cosine_by_batch[batch, i, j] = cosine_by_batch[batch, j, i] = cos_sim
            spearman_by_batch[batch, i, j] = spearman_by_batch[batch, j, i] = spearman_corr

# Summarise across batches: mean plus the observed range (min, max).
# Cosine
cosine_mean = cosine_by_batch.mean(axis=0)
cosine_lower = cosine_by_batch.min(axis=0)
cosine_upper = cosine_by_batch.max(axis=0)

# Spearman
spearman_mean = spearman_by_batch.mean(axis=0)
spearman_lower = spearman_by_batch.min(axis=0)
spearman_upper = spearman_by_batch.max(axis=0)

def format_annot(mean_mat, lower_mat, upper_mat):
    """Build per-cell heatmap annotations of the form ``mean`` / ``[low–high]``.

    Parameters
    ----------
    mean_mat, lower_mat, upper_mat : 2-D arrays of identical shape
        Mean, lower bound and upper bound for every cell.

    Returns
    -------
    numpy.ndarray of dtype object, same shape as ``mean_mat``
        Each entry is ``"<mean>\\n[<low>–<high>]"`` with three decimals.
    """
    annot = np.empty(mean_mat.shape, dtype=object)
    for idx in np.ndindex(*mean_mat.shape):
        mean_val = mean_mat[idx]
        low_val = lower_mat[idx]
        high_val = upper_mat[idx]
        # \u2013 is an en dash; written as an escape so the separator cannot
        # be corrupted by an encoding round-trip (it previously rendered as
        # the mojibake sequence "â€“").
        annot[idx] = f"{mean_val:.3f}\n[{low_val:.3f}\u2013{high_val:.3f}]"
    return annot

cosine_annot = format_annot(cosine_mean, cosine_lower, cosine_upper)
spearman_annot = format_annot(spearman_mean, spearman_lower, spearman_upper)

# One entry per heatmap:
# (matrix, annotations, colour-scale minimum, colour-bar label, title, output file name)
heatmap_specs = [
    # Plot cosine similarity
    (cosine_mean, cosine_annot, 0.85, "Cosine Similarity",
     "Cosine Similarity Between Model RDMs\nMean [Range Across 5 Batches]",
     f"cosineHeatmap{norm_tag}.pdf"),
    # Plot Spearman correlation
    (spearman_mean, spearman_annot, 0, "Spearman Correlation",
     "Spearman Correlation Between Model RDMs\nMean [Range Across 5 Batches]",
     f"spearmanHeatmap{norm_tag}.pdf"),
]

for matrix, annot, vmin, cbar_label, title, filename in heatmap_specs:
    plt.figure(figsize=(10, 8))
    # fmt="" because the annotations are already formatted strings.
    sns.heatmap(matrix, annot=annot, fmt="", xticklabels=model_tags, yticklabels=model_tags,
                cmap="Reds", vmin=vmin, vmax=1, cbar_kws={"label": cbar_label})
    plt.title(title)
    plt.tight_layout()
    plt.savefig(plot_path + filename, format="pdf", bbox_inches="tight")
    plt.show()

# Mean Spearman correlation

In [None]:
n_models = len(models)

# For each model: mean similarity to the *other* models, i.e. the row sum
# without the diagonal (self-similarity) entry, divided by n_models - 1.
off_diagonal_sum = spearman_mean.sum(axis=1) - spearman_mean.diagonal()
average_spearman_corr = off_diagonal_sum / (n_models - 1)

df_similarity = pd.DataFrame(
    {"Model": model_tags, "Average Spearman Correlation": average_spearman_corr}
)

df_similarity

In [None]:
# Same summary as for Spearman: row sum minus the diagonal entry, averaged
# over the remaining n_models - 1 models.
off_diagonal_sum = cosine_mean.sum(axis=1) - cosine_mean.diagonal()
average_cos = off_diagonal_sum / (n_models - 1)

df_similarity = pd.DataFrame(
    {"Model": model_tags, "Average Cosine Correlation": average_cos}
)

df_similarity

# Dendrograms (uses hard-coded similarity values; replace them with the computed matrices once the cells above have been run)

In [None]:
n_models = len(models)
# NOTE: this replaces any `spearman_mean` computed earlier with hard-coded values.
spearman_mean = np.ones((n_models, n_models))

# Strict upper triangle, one list per row (row r fills columns r+1 onward).
# The diagonal keeps the 1.0 from np.ones.
upper_rows = [
    [0.279, 0.41, 0.4, 0.223, 0.195, 0.122],
    [0.466, 0.391, 0.43, 0.371, 0.136],
    [0.505, 0.54, 0.515, 0.181],
    [0.485, 0.384, 0.151],
    [0.734, 0.222],
    [0.192],
]
for row, values in enumerate(upper_rows):
    spearman_mean[row, row + 1:] = values

# Mirror the upper triangle into the lower one to make the matrix symmetric.
upper_r, upper_c = np.triu_indices(n_models, k=1)
spearman_mean[upper_c, upper_r] = spearman_mean[upper_r, upper_c]

In [None]:
# Display the hard-coded, symmetrised matrix.
spearman_mean

In [None]:
# Turn similarities into dissimilarities: identical RDMs (correlation 1) get distance 0.
spearman_distances = np.subtract(1, spearman_mean)

# linkage() takes the condensed (upper-triangle vector) form of the distance
# matrix; squareform is called with checks=False.
spearman_condensed = squareform(spearman_distances, checks=False)

# Hierarchical clustering with Ward's method.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; 1 - correlation is not Euclidean — confirm this is intended.
linkage_spearman = linkage(spearman_condensed, "ward")

In [None]:
# Distance cut-offs used by link_color_func: a link whose height is below
# thresholds[i] (and not below any earlier threshold) is drawn in colors[i].
thresholds = [0.1, 0.33, 0.55, 0.62, 0.70, 0.81, 1.00]

# One colour per threshold; the last colour doubles as the fallback for links
# at or above the final threshold. (A seaborn palette used to be assigned here
# and was overwritten on the next line, so the dead assignment is gone.)
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'gray']

# Custom link colour function based on distance thresholds
def link_color_func(link_id, linkage_matrix=None, cutoffs=None, palette=None):
    """Return the colour for one dendrogram link, chosen by the link's height.

    Intended for ``dendrogram(..., link_color_func=link_color_func)``, which
    calls it with a single argument.

    Parameters
    ----------
    link_id : int
        Id of a non-leaf cluster. Ids of merged clusters start right after the
        leaf ids, so row ``link_id - (number of rows + 1)`` of the linkage
        matrix describes this link.
    linkage_matrix : 2-D array, optional
        Linkage matrix to read heights from (column 2). Defaults to the
        notebook-level ``linkage_spearman``.
    cutoffs : sequence of float, optional
        Ascending height thresholds. Defaults to the notebook-level ``thresholds``.
    palette : sequence, optional
        Colours paired with ``cutoffs`` by position. Defaults to the
        notebook-level ``colors``.

    Returns
    -------
    The colour paired with the first cutoff that the link height is strictly
    below, or ``palette[-1]`` when the height is below none of them.
    """
    if linkage_matrix is None:
        linkage_matrix = linkage_spearman
    if cutoffs is None:
        cutoffs = thresholds
    if palette is None:
        palette = colors

    row = link_id - (linkage_matrix.shape[0] + 1)
    dist = linkage_matrix[row, 2]  # height of the node
    for cutoff, color in zip(cutoffs, palette):
        if dist < cutoff:
            return color
    return palette[-1]

In [None]:
# Dendrogram of the models clustered on 1 - Spearman correlation of their RDMs.
fig, ax = plt.subplots()

# Links below color_threshold are coloured per cluster.
dendrogram(linkage_spearman, labels=models, leaf_rotation=90, leaf_font_size=12,
           color_threshold=0.82, ax=ax)
ax.set_title("Hierarchical Clustering of Model RDM Similarity", fontweight='bold')
ax.set_ylabel('Ward Distance', fontsize=12)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Fit the layout only after the final title and labels are in place, so the
# saved figure has room for them (it used to run before the y-label was
# replaced by a larger one).
fig.tight_layout()

fig.savefig(plot_path + f'ClusteringSpearman{norm_tag}.pdf')

In [None]:
# Values transcribed by hand from the cosine heatmap because recomputing the
# similarities is slow. NOTE: this replaces any `cosine_mean` computed earlier.
n_models = len(models)
cosine_mean = np.ones((n_models, n_models))

# Strict upper triangle, one list per row (row r fills columns r+1 onward).
# The diagonal keeps the 1.0 from np.ones.
upper_rows = [
    [0.939, 0.966, 0.946, 0.924, 0.922, 0.883],
    [0.964, 0.942, 0.936, 0.933, 0.879],
    [0.963, 0.957, 0.958, 0.905],
    [0.947, 0.937, 0.88],
    [0.972, 0.882],
    [0.878],
]
for row, values in enumerate(upper_rows):
    cosine_mean[row, row + 1:] = values

# Mirror the upper triangle into the lower one to make the matrix symmetric.
upper_r, upper_c = np.triu_indices(n_models, k=1)
cosine_mean[upper_c, upper_r] = cosine_mean[upper_r, upper_c]

In [None]:
# Turn similarities into dissimilarities: identical RDMs (similarity 1) get distance 0.
cosine_distances = np.subtract(1, cosine_mean)

# linkage() takes the condensed (upper-triangle vector) form of the distance
# matrix; squareform is called with checks=False.
cosine_condensed = squareform(cosine_distances, checks=False)

# Hierarchical clustering with Ward's method.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; 1 - cosine similarity is not Euclidean — confirm this is intended.
linkage_cosine = linkage(cosine_condensed, "ward")

In [None]:
# Dendrogram of the models clustered on 1 - cosine similarity of their RDMs.
fig, ax = plt.subplots()

# Links below color_threshold are coloured per cluster.
dendrogram(linkage_cosine, labels=models, leaf_rotation=90, leaf_font_size=12,
           color_threshold=0.14, ax=ax)
ax.set_title("Hierarchical Clustering of Model RDM Similarity", fontweight='bold')
ax.set_ylabel('Ward Distance', fontsize=12)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Fit the layout only after the final title and labels are in place, so the
# saved figure has room for them (it used to run before the y-label was
# replaced by a larger one).
fig.tight_layout()

fig.savefig(plot_path + f'ClusteringCosine{norm_tag}.pdf')

# Slide Specificity

In [None]:
def efficient_cliffs_delta(a, b):
    """Cliff's delta of sample ``a`` versus sample ``b``.

    delta = (#{(x, y): x > y} - #{(x, y): x < y}) / (len(a) * len(b))
    for x drawn from ``a`` and y from ``b``. The result lies in [-1, 1];
    ties contribute zero.

    Parameters
    ----------
    a, b : 1-D array-like of numbers
        The two samples. Neither is modified.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If either sample is empty (the statistic is undefined).
    """
    # Sorting `b` is required for the binary search. Sorting `a` does not
    # change the result; it only makes consecutive lookups land near each other.
    a = np.sort(a)
    b = np.sort(b)
    m, n = len(a), len(b)
    if m == 0 or n == 0:
        raise ValueError("efficient_cliffs_delta requires two non-empty samples")

    # One vectorised binary search per side instead of a Python-level bisect
    # call for every element of `a`.
    at_most = np.searchsorted(b, a, side='right')  # per x: number of y <= x
    below = np.searchsorted(b, a, side='left')     # per x: number of y <  x

    # more = #(x >= y), less = #(x <= y); the ties cancel in the difference.
    more = int(at_most.sum(dtype=np.int64))
    less = m * n - int(below.sum(dtype=np.int64))
    return (more - less) / (m * n)

In [None]:
def calculate_distances_per_slide(embeddings, num_slides, patches_per_slide):
    """Collect within-slide and between-slide Euclidean patch distances.

    Slide ``s`` is taken to be rows
    ``[s * patches_per_slide, (s + 1) * patches_per_slide)`` of ``embeddings``.

    Parameters
    ----------
    embeddings : 2-D array, shape (n_rows, n_features)
        One row per patch.
    num_slides : int
        Number of slides to process.
    patches_per_slide : int
        Number of consecutive rows belonging to each slide.

    Returns
    -------
    intra_distances : 1-D ndarray
        For every slide, the distances between all unordered pairs of its patches.
    inter_distances : 1-D ndarray
        For every slide, the distances from each row *outside* the slide to
        each patch of the slide. A pair of patches from two different slides
        therefore appears twice, once from each slide's point of view.
    """
    embeddings = np.asarray(embeddings)
    n_rows = embeddings.shape[0]
    intra_chunks = []
    inter_chunks = []

    for slide_idx in range(num_slides):
        slide_indices = np.arange(slide_idx * patches_per_slide, (slide_idx + 1) * patches_per_slide)
        slide_patches = embeddings[slide_indices]

        # Every row that is not part of the current slide.
        is_other = np.ones(n_rows, dtype=bool)
        is_other[slide_indices] = False
        other_patches = embeddings[is_other]

        intra_chunks.append(pdist(slide_patches, metric='euclidean'))

        if other_patches.shape[0]:
            # One vectorised call instead of a Python loop over every other
            # row. Row-major ravel keeps the order "for each other row, for
            # each patch of the slide".
            inter_chunks.append(cdist(other_patches, slide_patches, metric='euclidean').ravel())

    # Arrays are collected per slide and joined once, which avoids building
    # Python lists with one object per distance.
    intra_distances = np.concatenate(intra_chunks) if intra_chunks else np.array([])
    inter_distances = np.concatenate(inter_chunks) if inter_chunks else np.array([])
    return intra_distances, inter_distances


# Slide specificity: for every model, Cliff's delta between within-slide and
# between-slide patch distances, computed per batch and summarised over batches.
results = []
for model in tqdm(models, desc="Processing models"):
    delta_vals = []

    for batch_idx in range(n_batches):
        # Stack all cancer types of this batch into one patch matrix.
        embeddings_list = []
        for cancer_type in cancer_types:
            batch_filename = f"embeddings_{cancer_type}{norm_tag}-{model}-batch{batch_idx}.npy"
            batch_path = os.path.join(batched_embeddings_path, batch_filename)
            embeddings_list.append(np.load(batch_path))
        all_embeddings = np.vstack(embeddings_list)

        intra, inter = calculate_distances_per_slide(
            all_embeddings,
            num_slides=num_slides_per_batch * len(cancer_types),
            patches_per_slide=num_patches_per_batch,
        )
        delta_vals.append(efficient_cliffs_delta(intra, inter))

    delta_vals = np.array(delta_vals)

    results.append(dict(
        model=model,
        cliffs_delta_mean=delta_vals.mean(),
        cliffs_delta_min=delta_vals.min(),
        cliffs_delta_max=delta_vals.max(),
    ))

# Models sorted by mean delta, largest first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="cliffs_delta_mean", ascending=False)
    .reset_index(drop=True)
)
results_df.to_csv(plot_path + f"slide_specificity{norm_tag}.csv", index=False)
print(results_df)

# Disease Specificity

In [None]:
# Disease specificity: for every model, Cliff's delta between
#   intra = distances between patches of the same cancer type but different slides
#   inter = distances between patches of different cancer types
# computed per batch and summarised over batches.
#
# Distances are computed with cdist, which returns the (n1, n2) distance
# matrix directly. The earlier broadcast form built an (n1, n2, n_features)
# difference array first, which is very large for whole-batch comparisons.
# Results are collected as arrays and concatenated once, rather than growing
# Python lists with one object per distance.
results = []
for model in tqdm(models, desc="Processing models"):
    delta_vals = []

    for batch_idx in range(n_batches):
        disease_embeddings = {}
        for cancer_type in cancer_types:
            batch_filename = f"embeddings_{cancer_type}{norm_tag}-{model}-batch{batch_idx}.npy"
            batch_path = os.path.join(batched_embeddings_path, batch_filename)
            disease_embeddings[cancer_type] = np.load(batch_path)

        # Same disease, different slides (pairs within one slide are excluded).
        intra_chunks = []
        for cancer_type in cancer_types:
            emb = disease_embeddings[cancer_type]
            # Slide s occupies rows [s * P, (s + 1) * P) with P = num_patches_per_batch.
            slide_patches = [
                emb[s * num_patches_per_batch:(s + 1) * num_patches_per_batch]
                for s in range(num_slides_per_batch)
            ]
            for slide_i in range(num_slides_per_batch - 1):
                for slide_j in range(slide_i + 1, num_slides_per_batch):
                    intra_chunks.append(
                        cdist(slide_patches[slide_i], slide_patches[slide_j], metric='euclidean').ravel()
                    )

        # Different diseases: every unordered pair of cancer types, all patch pairs.
        inter_chunks = []
        for i in range(len(cancer_types) - 1):
            for j in range(i + 1, len(cancer_types)):
                emb1 = disease_embeddings[cancer_types[i]]
                emb2 = disease_embeddings[cancer_types[j]]
                inter_chunks.append(cdist(emb1, emb2, metric='euclidean').ravel())

        intra = np.concatenate(intra_chunks) if intra_chunks else np.array([])
        inter = np.concatenate(inter_chunks) if inter_chunks else np.array([])

        delta_vals.append(efficient_cliffs_delta(intra, inter))

    delta_vals = np.array(delta_vals)

    results.append({
        "model": model,
        "cliffs_delta_mean": delta_vals.mean(),
        "cliffs_delta_min": delta_vals.min(),
        "cliffs_delta_max": delta_vals.max()
    })

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="cliffs_delta_mean", ascending=False).reset_index(drop=True)
results_df.to_csv(plot_path + f"disease_specificity_exclude_same_slide{norm_tag}.csv", index=False)
print(results_df)

# Spectral Analysis

In [None]:
# For every model: stack the full (un-batched) embeddings of all cancer types
# and centre each feature column on zero.
embeddings = {}
for model in models:
    embeddings_list = []
    print(f"\nProcessing {model}...")
    for cancer_type in cancer_types:
        file_path = os.path.join(orig_embeddings_path, f"embeddings_{cancer_type}{norm_tag}-{model}.npy")
        emb = np.load(file_path)
        embeddings_list.append(emb)

    stacked = np.concatenate(embeddings_list, axis=0)
    # In-place subtraction of the per-feature mean (keeps the array's dtype).
    stacked -= stacked.mean(axis=0)
    embeddings[model] = stacked

# Singular-value spectrum of each model's centred embedding matrix,
# normalised so that the values of one model sum to 1.
spectra = {}
for model in models:
    print(model)
    # Only the singular values are used, so skip computing U and Vt
    # (saves the time and memory of the full decomposition).
    S = np.linalg.svd(embeddings[model], compute_uv=False)
    normalized_spectrum = S / S.sum()
    spectra[model] = normalized_spectrum

# Report the (n_patches, n_features) shape of every embedding matrix.
for model in models:
    print(model, embeddings[model].shape)

# Models that get a dashed line in the spectral plot.
vl_models = ['keep', 'conch', 'plip']
# Percentages of the feature count at which the cumulative sum is evaluated.
p_test = [0.25] + list(range(1, 101))
# s_sums[i, j]: cumulative normalised singular-value mass of model i at p_test[j].
s_sums = np.zeros((len(models), len(p_test)))

for i, m in enumerate(models):
    n_feat = embeddings[m].shape[-1]
    spectrum = spectra[m]
    # NOTE(review): the slice keeps cut_off + 1 values, i.e. one more than the
    # rounded p% of features — confirm the extra element is intended.
    s_sums[i] = [
        spectrum[:int(np.round(n_feat * p/100)) + 1].sum()
        for p in p_test
    ]

# Cumulative singular-value mass versus percentage of features, one line per
# model; models listed in `vl_models` are drawn dashed.
fig, ax = plt.subplots()
for i, m in enumerate(models):
    line_style = '--' if m in vl_models else '-'
    # Legend labels come from `model_tags` (the previous `model_tag` was an
    # undefined name and raised NameError).
    ax.plot(p_test, s_sums[i], label=model_tags[i], linewidth=2, linestyle=line_style)

ax.legend(prop={'weight': 'bold', 'size': 11})
ax.set_ylim([0, 1.02])
# x spans 0-100 and y spans 0-1, so an aspect of 100 gives a square plot area.
ax.set_aspect(100, adjustable='box')
ax.set_xlabel('Percentage of Features', fontsize=12, fontweight='bold')
ax.set_ylabel('Singular Value Cumulative Sum', fontsize=12, fontweight='bold')

# Remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

fig.savefig(plot_path + f'spectral_analysis{norm_tag}.pdf', dpi=300, bbox_inches='tight', pad_inches=0.05)