In [None]:
from vislearnlabpy.embeddings.embedding_store import EmbeddingStore
from pathlib import Path
# Folder (relative to this notebook) containing the saved embedding stores.
embedding_dir = Path("../data/embeddings/")

# Load the three stores in a fixed order: cdm, then kisumu, then beijing.
cdm_store, kisumu_store, beijing_store = [
    EmbeddingStore.from_doc(str(embedding_dir / store_name))
    for store_name in ("cdm_store", "kisumu_store", "beijing_store")
]

In [3]:
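# Example: remap the stored relative paths back to full URLs, if needed.
# `file_dir` is not defined in this notebook; set it to the directory/URL prefix of the files first.
#beijing_store.EmbeddingList.url = list(map(lambda p: f"{file_dir}/{p}", beijing_store.EmbeddingList.url))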

Plotting RDMs

In [4]:
# Categories in the order they are passed to the heatmap axes.
rdm_categories = [
    "airplane", "bike", "car",
    "bird", "cat", "rabbit",
    "tree", "house",
    "chair", "cup", "hat", "watch",
]

# Alphabetically sorted copy; the display-order list above is left untouched.
sorted_categories = list(rdm_categories)
sorted_categories.sort()

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from vislearnlabpy.embeddings.similarity_utils import correlate_rdms

def plot_heatmaps(matrices, categories, titles, corr_values=None, cmap="viridis", vmin=None, vmax=None, figsize=(18,6), center=None, cbar=True, suptitle=None):
    """Draw several matrices as side-by-side seaborn heatmaps on one figure.

    Parameters
    ----------
    matrices : iterable
        Matrices to plot, one panel each. Each must expose ``.min()`` and
        ``.max()`` when ``vmin``/``vmax`` are not supplied.
    categories : sequence
        Tick labels used on both axes of every panel.
    titles : iterable of str
        One title per panel. A title containing ``" vs "`` is also used to
        label the axes (text before the first ``" vs "`` on x, the rest on y).
    corr_values : sequence of float, optional
        One number per panel, appended to the title as
        "Mean diagonal similarity". ``None`` or an empty sequence adds nothing.
    cmap, center : passed straight to ``seaborn.heatmap``.
    vmin, vmax : float, optional
        Shared colour limits. When omitted, the minimum / maximum over all
        matrices is used so the panels are directly comparable.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    cbar : bool
        Whether to draw a colour bar; it is only ever drawn on the last panel.
    suptitle : str, optional
        Figure-level title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, returned after ``plt.show()`` so callers can save it.

    Raises
    ------
    ValueError
        If ``matrices`` is empty.
    """
    # Materialise once so generators work and can be iterated several times.
    matrices = list(matrices)
    n = len(matrices)
    if n == 0:
        raise ValueError("plot_heatmaps needs at least one matrix to draw")

    fig, axs = plt.subplots(1, n, figsize=figsize)
    if n == 1:
        # plt.subplots returns a bare Axes (not an array) for a single panel.
        axs = [axs]
    if vmin is None:
        vmin = min(m.min() for m in matrices)
    if vmax is None:
        vmax = max(m.max() for m in matrices)

    # Explicit None/length check: truth-testing a NumPy array raises ValueError.
    has_corr = corr_values is not None and len(corr_values) > 0

    for i, (mat, title) in enumerate(zip(matrices, titles)):
        sns.heatmap(mat, ax=axs[i], xticklabels=categories, yticklabels=categories,
                    cmap=cmap, vmin=vmin, vmax=vmax, center=center, cbar=cbar if i == n-1 else False)
        corr_text = f"\nMean diagonal similarity: {corr_values[i]:.3f}" if has_corr else ""
        axs[i].set_title(title + corr_text)
        # partition splits on the first " vs " only, so a title with several
        # occurrences no longer fails to unpack.
        x_label, separator, y_label = title.partition(" vs ")
        if separator:
            # NOTE(review): the first name goes on the x-axis (matrix columns);
            # confirm this matches how the caller built the matrix.
            axs[i].set_xlabel(x_label.strip())
            axs[i].set_ylabel(y_label.strip())
    if suptitle is not None:
        fig.suptitle(suptitle, fontsize=16)
    plt.tight_layout()
    plt.show()
    return fig

In [None]:
from vislearnlabpy.embeddings.similarity_utils import correlate_rdms

# Compute one text-level RDM per store.
cdm_rdm = cdm_store.compute_text_rdm()
beijing_rdm = beijing_store.compute_text_rdm()
kisumu_rdm = kisumu_store.compute_text_rdm()

# change ordering to create clusters: TODO: support custom ordering for our rdm functions
# NOTE(review): this assumes the rows/columns returned by compute_text_rdm follow
# `sorted_categories` (alphabetical order of exactly these categories) — confirm.
reorder_idx = [sorted_categories.index(cat) for cat in rdm_categories]
cdm_rdm, beijing_rdm, kisumu_rdm = [
    rdm[np.ix_(reorder_idx, reorder_idx)]
    for rdm in [cdm_rdm, beijing_rdm, kisumu_rdm]
]

# Compute correlations between each pair of reordered RDMs
corr_beijing_kisumu = correlate_rdms(beijing_rdm, kisumu_rdm)
corr_beijing_cdm = correlate_rdms(beijing_rdm, cdm_rdm)
corr_kisumu_cdm = correlate_rdms(kisumu_rdm, cdm_rdm)

fig = plot_heatmaps(
    [beijing_rdm, kisumu_rdm, cdm_rdm],
    categories=rdm_categories,
    titles=["Beijing RDM", "Kisumu RDM", "San Jose RDM"],
    corr_values=None,
    cmap="viridis",
    figsize=(17,6),
    cbar=True,
    suptitle= f"RDM Pearson's correlations: darker colors indicate higher cosine similarity.\nBeijing-Kisumu R={corr_beijing_kisumu:.3f} | "
    f"Beijing-San Jose R={corr_beijing_cdm:.3f} | Kisumu-San Jose R={corr_kisumu_cdm:.3f}"
)

# Create the output folder first: savefig raises FileNotFoundError if it is missing.
rdm_figure_path = Path("../data/figures/rdm_comparison.png")
rdm_figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(rdm_figure_path)

In [None]:
from vislearnlabpy.embeddings.similarity_utils import cosine_matrix

def get_mean_embeddings(embedding_list, categories):
    """Average the embeddings of each category into one row per category.

    Parameters
    ----------
    embedding_list : iterable
        Items exposing ``.text`` (the category label, must be hashable) and
        ``.embedding`` (the vector). Read in a single pass, so one-shot
        iterables are supported.
    categories : iterable
        Labels to summarise. The output has one row per entry, in this order.

    Returns
    -------
    numpy.ndarray
        Per-category mean embeddings stacked along axis 0. A category with no
        matching items gets an all-zero row shaped like the first item's
        embedding.

    Raises
    ------
    ValueError
        If a category has no matching items and ``embedding_list`` is empty,
        so there is no embedding to take the zero-row shape from.
    """
    categories = list(categories)
    wanted = set(categories)

    # Group embeddings by label in one pass instead of rescanning per category.
    grouped = {}
    first_item = None
    have_items = False
    for emb in embedding_list:
        if not have_items:
            first_item, have_items = emb, True
        if emb.text in wanted:
            grouped.setdefault(emb.text, []).append(emb.embedding)

    means = []
    for cat in categories:
        members = grouped.get(cat)
        if members:
            means.append(np.mean(members, axis=0))
            continue
        if not have_items:
            raise ValueError(
                f"embedding_list is empty; cannot build a zero embedding for category {cat!r}"
            )
        # No items for this category: fall back to zeros  # or np.nan
        means.append(np.zeros_like(first_item.embedding))
    return np.stack(means)

# Compute mean embeddings per category
kisumu_means = get_mean_embeddings(kisumu_store.EmbeddingList, rdm_categories)
beijing_means = get_mean_embeddings(beijing_store.EmbeddingList, rdm_categories)
cdm_means = get_mean_embeddings(cdm_store.EmbeddingList, rdm_categories)

# Compute the cross-location matrices that get plotted: 1 - cosine_matrix(...)
# NOTE(review): the plotted values are `1 - cosine_matrix(...)`, and `mean_diags`
# below applies a further `1 - ...` to their diagonal mean. The heatmap values and
# the reported "Mean diagonal similarity" are therefore complements of each other.
# Confirm whether cosine_matrix returns similarity or distance, and that the figure
# title and labels describe what is actually drawn.
sim_kisumu_beijing = 1 - cosine_matrix(kisumu_means, beijing_means)
sim_kisumu_cdm = 1 - cosine_matrix(kisumu_means, cdm_means)
sim_cdm_beijing = 1 - cosine_matrix(cdm_means, beijing_means)

# One summary number per panel: 1 - mean of the same-category (diagonal) entries.
mean_diags = [1 - np.mean(np.diag(m)) for m in [sim_kisumu_beijing, sim_kisumu_cdm, sim_cdm_beijing]]

fig = plot_heatmaps(
    [sim_kisumu_beijing, sim_kisumu_cdm, sim_cdm_beijing],
    categories=rdm_categories,
    titles=["Kisumu vs Beijing", "Kisumu vs San Jose", "San Jose vs Beijing"],
    suptitle="Heatmap of cosine similarity of mean category embeddings between locations",
    corr_values=mean_diags,
    cmap="magma",
    center=0,
    figsize=(20,6),
    cbar=True
)

# Create the output folder first: savefig raises FileNotFoundError if it is missing.
within_rdm_figure_path = Path("../data/figures/within_rdm_comparison.png")
within_rdm_figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(within_rdm_figure_path)