In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from histopatseg.constants import CLASS_MAPPING, SUPERCLASS_MAPPING

# Load variables from a .env file into the process environment (python-dotenv);
# LUNGHIST700_PATH is read with os.getenv further down.
load_dotenv()


In [None]:
# Magnification of the tile set to analyse; it selects both the metadata folder
# and the embeddings file below.
magnification = 20

# Root folder of the LungHist700 dataset, taken from the environment.
# Fail early with an explicit message instead of the opaque TypeError that
# Path(None) would raise when the variable is missing.
lunghist700_root = os.getenv("LUNGHIST700_PATH")
if lunghist700_root is None:
    raise RuntimeError(
        "Environment variable LUNGHIST700_PATH is not set; "
        "define it in the environment or in a .env file.")
data_path = Path(lunghist700_root)

# Tile-level metadata, indexed by tile_id; the image_id column is dropped.
metadata = (
    pd.read_csv(data_path / f"LungHist700_{magnification}x/metadata.csv")
    .set_index("tile_id")
    .drop(columns=["image_id"])
)

# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine with this exact layout. Consider deriving it from a configurable
# data directory. The commented lines are alternative embedding files.
embeddings_path = Path(
    f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/UNI2_{magnification}x.npz"
    # f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/convnext_large_{magnification}x.npz"
    # f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/bioptimus_{magnification}x.npz"
)

In [None]:
# Read the three arrays stored in the .npz archive; the context manager closes
# the file once they have been extracted.
with np.load(embeddings_path) as archive:
    embeddings, tile_ids, embedding_dim = (
        archive[key] for key in ("embeddings", "tile_ids", "embedding_dim")
    )


In [None]:
# Display the tile-level metadata table (indexed by tile_id, image_id column dropped at load time).
metadata

In [None]:
# Wrap the raw embeddings in a DataFrame whose row labels are the tile ids,
# then preview the first rows.
embeddings_df = pd.DataFrame(data=embeddings, index=tile_ids)
embeddings_df.head(n=5)

In [None]:
# Shape of the embeddings array loaded from the .npz archive.
embeddings.shape

In [None]:
# Class name of every tile, looked up in the order given by tile_ids.
labels = metadata.loc[tile_ids, "class_name"].values
# Translate each class name through CLASS_MAPPING; the result is used to colour the plots.
labels_mapped = np.array(list(map(CLASS_MAPPING.__getitem__, labels)))

In [None]:
# Project the tile embeddings to two dimensions with UMAP.
# random_state is fixed so repeated runs use the same seed.
reducer = umap.UMAP(
    random_state=42,
    n_components=2,
    min_dist=0.1,
    n_neighbors=15,
)
X_umap = reducer.fit_transform(embeddings)

In [None]:

# Scatter plot of the current UMAP coordinates, coloured by the mapped class
# of each point. The previous title ("Digits dataset") was a copy-paste
# leftover; the data shown here are LungHist700 tile embeddings.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
fig.colorbar(points, ax=ax, label="class (CLASS_MAPPING value)")
ax.set_title("UMAP projection of LungHist700 tile embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

In [None]:
# Preview the first rows of the tile-level metadata.
metadata.head()

In [None]:
def aggregate_embeddings(embeddings, tile_ids, metadata):
    """Average tile embeddings per source image and collect matching metadata.

    Args:
        embeddings: 2-D array-like of tile embeddings, one row per entry of
            ``tile_ids``.
        tile_ids: Labels present in ``metadata.index``, given in the same
            order as the rows of ``embeddings``.
        metadata: Tile-level DataFrame indexed by tile id. It must contain an
            ``original_filename`` column naming the image each tile belongs to.

    Returns:
        tuple: ``(aggregated_df, grouped_metadata)``. ``aggregated_df`` holds
        the mean embedding of each image; its index is named ``image_id`` and
        contains the ``original_filename`` values in sorted order (groupby
        default). ``grouped_metadata`` holds the ``"first"`` aggregate of
        every metadata column per image, with rows in the same order as
        ``aggregated_df``.

    Raises:
        KeyError: If a tile id or the ``original_filename`` column is missing
            from ``metadata``.
    """
    # Select rows and column in one label-based lookup instead of copying
    # every metadata column for all tiles before picking a single one.
    image_of_tile = metadata.loc[tile_ids, "original_filename"].to_list()

    # Attach the owning image to each embedding row, then average per image.
    aggregated_df = (
        pd.DataFrame(embeddings)
        .assign(image_id=image_of_tile)
        .groupby("image_id")
        .mean()
    )
    image_ids = list(aggregated_df.index)

    # Collapse the tile-level metadata to one row per image; "first" is
    # applied to each column independently.
    grouped_metadata = metadata.groupby("original_filename").agg(
        dict.fromkeys(metadata.columns, "first"))

    # Keep only the images present in the aggregated embeddings, in the same order.
    return aggregated_df, grouped_metadata.loc[image_ids]

In [None]:
# Collapse the tile-level embeddings and metadata to one row per source image.
aggregated_embeddings_df, aggregated_metadata = aggregate_embeddings(
    embeddings=embeddings,
    tile_ids=tile_ids,
    metadata=metadata,
)

In [None]:
# Preview the per-image mean embeddings returned by aggregate_embeddings.
aggregated_embeddings_df.head()

In [None]:
# Preview the per-image metadata returned alongside the aggregated embeddings.
aggregated_metadata.head()

In [None]:

# Image-level UMAP: embed the per-image mean embeddings in two dimensions.
# Note that this rebinds reducer / X_umap / labels / labels_mapped.
reducer = umap.UMAP(
    random_state=42,
    n_components=2,
    min_dist=0.1,
    n_neighbors=15,
)
X_umap = reducer.fit_transform(aggregated_embeddings_df)

# Per-image annotations taken from the aggregated metadata.
labels = aggregated_metadata["class_name"].values
superclasses = aggregated_metadata["superclass"].values
magnifications = aggregated_metadata["resolution"].values

# Mapped values handed to the scatter plots as colours.
labels_mapped = np.array(list(map(CLASS_MAPPING.__getitem__, labels)))
superclasses_mapped = np.array(list(map(SUPERCLASS_MAPPING.__getitem__, superclasses)))

In [None]:

# Scatter plot of the current UMAP coordinates, coloured by mapped class.
# The previous title ("Digits dataset") was a copy-paste leftover, and the
# commented-out plt.scatter(..., hue=...) call was dropped: plt.scatter has no
# hue/palette arguments.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
fig.colorbar(points, ax=ax, label="class (CLASS_MAPPING value)")
ax.set_title("UMAP projection of LungHist700 image-level embeddings, colored by class")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

In [None]:
# Put the 2-D coordinates and the annotations side by side in one frame so
# seaborn can colour the points by a categorical column.
umap_df = pd.DataFrame(X_umap[:, :2], columns=['x', 'y']).assign(
    magnification=magnifications,
    label=labels,
    superclass=superclasses,
)

plt.figure(figsize=(8, 6))
# hue takes the 'magnification' column, giving one legend entry per distinct value.
sns.scatterplot(x='x', y='y', hue='magnification', palette='tab10', s=10, data=umap_df)
plt.title("UMAP projection colored by magnification")
# Anchor the legend outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Magnification")
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of the current UMAP coordinates, coloured by mapped superclass.
# The previous title ("Digits dataset") was a copy-paste leftover.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=superclasses_mapped, cmap='Spectral', s=10)
fig.colorbar(points, ax=ax, label="superclass (SUPERCLASS_MAPPING value)")
ax.set_title("UMAP projection of LungHist700 image-level embeddings, colored by superclass")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

In [None]:
# Restrict the image-level data to rows whose superclass is "aca".
# The boolean filter is evaluated once and reused for embeddings and labels.
is_aca = aggregated_metadata["superclass"] == "aca"
aca_metadata = aggregated_metadata[is_aca]

aggregated_embeddings_df_luad = aggregated_embeddings_df.loc[aca_metadata.index]
labels = aca_metadata["class_name"].values
labels_mapped = np.array(list(map(CLASS_MAPPING.__getitem__, labels)))

In [None]:
# Fit a new 2-D UMAP on the "aca"-only image embeddings (same hyper-parameters and seed as before).
reducer = umap.UMAP(
    random_state=42,
    n_components=2,
    min_dist=0.1,
    n_neighbors=15,
)
X_umap = reducer.fit_transform(aggregated_embeddings_df_luad)

In [None]:
# Scatter plot of the current UMAP coordinates, coloured by mapped class.
# The previous title ("Digits dataset") was a copy-paste leftover.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
fig.colorbar(points, ax=ax, label="class (CLASS_MAPPING value)")
ax.set_title("UMAP projection of LungHist700 image-level embeddings, 'aca' superclass only")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

In [None]:
# Rows and columns of the image-level embeddings restricted to the "aca" superclass.
aggregated_embeddings_df_luad.shape