In [1]:
import os
from pathlib import Path

from dotenv import load_dotenv
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt

from histopatseg.constants import CLASS_MAPPING, SUPERCLASS_MAPPING

# Load environment variables from a .env file; LUNGHIST700_PATH is read with os.getenv in the next cell.
load_dotenv()


True

In [2]:
magnification = 20
# Name of the embedding model whose tile embeddings are loaded below.
# Other embedding files previously referenced here: "convnext_large", "bioptimus".
embedding_model = "UNI2"

# Fail early with a clear message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
lunghist700_root = os.getenv("LUNGHIST700_PATH")
if lunghist700_root is None:
    raise RuntimeError(
        "LUNGHIST700_PATH is not set; define it in the environment or in the .env file."
    )
data_path = Path(lunghist700_root)

# Tile-level metadata, indexed by tile_id; the image_id column is dropped.
metadata = (
    pd.read_csv(data_path / f"LungHist700_{magnification}x/metadata.csv")
    .set_index("tile_id")
    .drop(columns=["image_id"])
)

# NOTE(review): absolute, machine-specific path; consider deriving it from an
# environment variable, as is done for data_path above.
embeddings_path = Path(
    f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/{embedding_model}_{magnification}x.npz"
)

In [3]:
# Read the three arrays stored in the embeddings archive while it is open.
with np.load(embeddings_path) as data:
    embeddings, tile_ids, embedding_dim = (
        data[key] for key in ("embeddings", "tile_ids", "embedding_dim")
    )


In [5]:
# Show the tile metadata table (indexed by tile_id).
metadata

Unnamed: 0_level_0,patient_id,superclass,subclass,resolution,class_name,label,original_filename,tile_path
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
scc_bd_20x_39_tile_0_1,25,scc,bd,20x,scc_bd,4,scc_bd_20x_39,/home/valentin/workspaces/histolung/data/proce...
scc_bd_20x_69_tile_7_0,27,scc,bd,20x,scc_bd,4,scc_bd_20x_69,/home/valentin/workspaces/histolung/data/proce...
scc_pd_40x_43_tile_2_0,34,scc,pd,40x,scc_pd,6,scc_pd_40x_43,/home/valentin/workspaces/histolung/data/proce...
scc_bd_20x_63_tile_1_1,27,scc,bd,20x,scc_bd,4,scc_bd_20x_63,/home/valentin/workspaces/histolung/data/proce...
aca_md_20x_9_tile_7_3,16,aca,md,20x,aca_md,1,aca_md_20x_9,/home/valentin/workspaces/histolung/data/proce...
...,...,...,...,...,...,...,...,...
aca_md_20x_403_tile_3_5,13,aca,md,20x,aca_md,1,aca_md_20x_403,/home/valentin/workspaces/histolung/data/proce...
aca_pd_40x_33_tile_3_2,8,aca,pd,40x,aca_pd,2,aca_pd_40x_33,/home/valentin/workspaces/histolung/data/proce...
nor_20x_902_tile_2_3,45,nor,,20x,nor,3,nor_20x_902,/home/valentin/workspaces/histolung/data/proce...
nor_20x_23_tile_3_3,41,nor,,20x,nor,3,nor_20x_23,/home/valentin/workspaces/histolung/data/proce...


In [4]:
# Tabular view of the embeddings: rows are labelled with the tile ids.
embeddings_df = pd.DataFrame(data=embeddings, index=tile_ids)
embeddings_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
nor_20x_41_tile_3_3,-0.043811,0.157218,0.32455,-0.503031,-0.279342,-0.717521,-0.378455,-0.634131,-0.017727,-0.151263,...,-0.329754,0.235133,-0.075796,-0.771311,-0.115314,-0.514299,-0.055734,0.628524,-0.096684,0.083973
scc_pd_20x_54_tile_3_2,0.680211,0.165389,-0.136717,-0.643031,-0.221158,-0.016323,-0.540934,-1.078925,-0.739775,0.224995,...,0.112281,-0.6081,-0.280136,-0.00847,0.248563,-0.702575,-0.410934,-0.254698,-0.39112,0.839498
aca_md_20x_7_tile_6_5,0.126063,-0.060346,-0.375732,-0.412807,0.760599,-0.230769,-0.674735,-1.439138,0.131721,-0.572236,...,-0.063849,-0.23954,0.175412,-0.000613,-0.068637,0.110751,-0.320127,-0.159741,0.508617,0.186672
scc_bd_20x_83_tile_4_1,0.148138,0.049619,-0.229554,-0.371636,-0.305304,-0.557771,0.020372,-1.067832,-0.730167,-0.190244,...,0.176634,0.009101,0.236821,0.092313,-0.495667,0.415165,-0.92,0.179914,0.077188,0.063047
scc_pd_20x_2_tile_6_0,-0.682194,-0.776183,0.425574,-0.515087,0.345074,0.132616,-0.173323,-0.836864,0.428494,0.035089,...,-0.298749,0.196515,1.299119,0.094202,-0.700704,0.242442,0.239477,-0.244693,-0.379865,0.248738


In [None]:
# Shape of the tile embedding array loaded from the .npz archive.
embeddings.shape

In [None]:
# Class name of every tile, looked up in the order of tile_ids, and its
# CLASS_MAPPING value (used to color the scatter plot).
labels = metadata["class_name"].loc[tile_ids].values
labels_mapped = np.array([CLASS_MAPPING[class_name] for class_name in labels])

In [None]:
# Fit UMAP on the tile embeddings and project them to two components
# (random_state is fixed so the run is repeatable).
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
X_umap = reducer.fit_transform(embeddings)

In [None]:

# Visualize the tile-level UMAP projection, colored by the CLASS_MAPPING value
# of each tile's class. The previous title ("Digits dataset") was a leftover
# that did not describe this data.
plt.figure(figsize=(8, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
plt.colorbar(label="CLASS_MAPPING value")
plt.title("UMAP projection of LungHist700 tile embeddings")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
# Preview the first rows of the tile metadata.
metadata.head()

In [None]:
def aggregate_embeddings(embeddings, tile_ids, metadata):
    """Average tile embeddings per source image and collect matching metadata.

    Args:
        embeddings: 2D array-like of tile embeddings, one row per entry of
            ``tile_ids``.
        tile_ids: Tile identifiers used to look up rows of ``metadata``.
        metadata: DataFrame indexed by tile id that contains an
            ``original_filename`` column naming the image each tile comes from.

    Returns:
        A tuple ``(aggregated_df, image_metadata)``. ``aggregated_df`` holds the
        mean embedding of each image and is indexed by ``image_id``.
        ``image_metadata`` holds, for each of those images and in the same
        order, the "first" aggregate of every metadata column.
    """
    tile_table = pd.DataFrame(embeddings)
    # Tag every tile row with the image it was cut from.
    tile_table["image_id"] = metadata.loc[tile_ids]["original_filename"].to_list()
    per_image_mean = tile_table.groupby("image_id").mean()

    # Collapse the tile metadata to one row per image.
    first_of_each_column = dict.fromkeys(metadata.columns, "first")
    per_image_metadata = metadata.groupby("original_filename").agg(first_of_each_column)

    # Return the metadata rows in the same order as the aggregated embeddings.
    ordered_image_ids = list(per_image_mean.index)
    return per_image_mean, per_image_metadata.loc[ordered_image_ids]

In [None]:
# Average the tile embeddings per source image and get one metadata row per image.
aggregated_embeddings_df, aggregated_metadata = aggregate_embeddings(embeddings, tile_ids, metadata)

In [None]:
# Preview the per-image mean embeddings.
aggregated_embeddings_df.head()

In [None]:
# Preview the per-image metadata (one row per image).
aggregated_metadata.head()

In [None]:

# Project the per-image mean embeddings to two components with UMAP.
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
X_umap = reducer.fit_transform(aggregated_embeddings_df)

# Per-image class labels and their CLASS_MAPPING values.
labels = aggregated_metadata.loc[:, "class_name"].values
labels_mapped = np.array([CLASS_MAPPING[class_name] for class_name in labels])

# Per-image superclass labels and their SUPERCLASS_MAPPING values.
superclasses = aggregated_metadata.loc[:, "superclass"].values
superclasses_mapped = np.array([SUPERCLASS_MAPPING[superclass] for superclass in superclasses])

In [None]:

# Image-level UMAP projection, colored by the CLASS_MAPPING value of each
# image's class. The previous title ("Digits dataset") was a leftover that did
# not describe this data.
plt.figure(figsize=(8, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
plt.colorbar(label="CLASS_MAPPING value")
plt.title("UMAP projection of LungHist700 image-level embeddings (by class)")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
# Image-level UMAP projection, colored by the SUPERCLASS_MAPPING value of each
# image's superclass. The previous title ("Digits dataset") was a leftover that
# did not describe this data.
plt.figure(figsize=(8, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=superclasses_mapped, cmap='Spectral', s=10)
plt.colorbar(label="SUPERCLASS_MAPPING value")
plt.title("UMAP projection of LungHist700 image-level embeddings (by superclass)")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
# Keep only the images whose superclass is "aca", together with their class labels.
aca_metadata = aggregated_metadata[aggregated_metadata["superclass"] == "aca"]
aggregated_embeddings_df_luad = aggregated_embeddings_df.loc[aca_metadata.index]
labels = aca_metadata["class_name"].values
labels_mapped = np.array([CLASS_MAPPING[class_name] for class_name in labels])

In [None]:
# Project the "aca"-only per-image embeddings to two components with UMAP.
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
X_umap = reducer.fit_transform(aggregated_embeddings_df_luad)

In [None]:
# UMAP projection of the "aca"-only image-level embeddings, colored by the
# CLASS_MAPPING value of each image's class. The previous title ("Digits
# dataset") was a leftover that did not describe this data.
plt.figure(figsize=(8, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=labels_mapped, cmap='Spectral', s=10)
plt.colorbar(label="CLASS_MAPPING value")
plt.title('UMAP projection of LungHist700 image-level embeddings ("aca" superclass only)')
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
# Shape of the "aca"-only per-image embedding table.
aggregated_embeddings_df_luad.shape