In [None]:
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap

from histopatseg.visualization.visualization import plot_embeddings
from histopatseg.evaluation.utils import aggregate_tile_embeddings

In [None]:
# Resolve the current working directory to an absolute path, then step up one
# level. NOTE(review): presumably the notebook is run from a direct subfolder
# of the repository root -- confirm.
working_dir = Path(".").resolve()
project_dir = working_dir.parent
print(f"Project Directory: {project_dir}")

In [None]:
# Input locations, both relative to the project root.
embedding_file = (
    project_dir
    / "data/processed/embeddings/lunghist700_20x_UNI2_centercrop_embeddings.npz"
)
metadata_csv = project_dir / "data/processed/LungHist700/metadata.csv"

# Index the metadata table by its "filename" column so rows can be looked up by id.
metadata = pd.read_csv(metadata_csv).set_index("filename")
metadata.head()

In [None]:
# Load the embeddings
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open; the context manager closes it once the arrays have been read out.
with np.load(embedding_file) as data:
    embeddings = data["embeddings"]
    image_ids = data["tile_ids"]
    embedding_dim = data["embedding_dim"]

# Print basic information
print(f"Loaded {len(embeddings)} embeddings with dimensionality {embeddings.shape[1]}")
print(f"Embedding dimension from model: {embedding_dim}")

In [None]:
# Check if all embedding tile_ids are in the metadata index
# (loop variable named tile_id so the builtin `id` is not shadowed).
missing_ids = [tile_id for tile_id in image_ids if tile_id not in metadata.index]
if missing_ids:
    print(f"Warning: {len(missing_ids)} tile_ids from embeddings are not in metadata")
    print(f"First few missing IDs: {missing_ids[:5]}")

# Reorder the metadata rows to follow image_ids; ids that are absent from the
# metadata index come back as all-NaN rows (reported by the warning above).
aligned_metadata = metadata.reindex(image_ids)

# Rows with superclass 'nor' and no subclass inherit the superclass label.
# A vectorized mask replaces the row-wise apply: same result, no Python-level
# loop over rows, and it is also well-defined when the frame is empty.
lacks_subclass = aligned_metadata['subclass'].isna()
is_normal = aligned_metadata['superclass'].eq('nor')
aligned_metadata['subclass'] = aligned_metadata['subclass'].mask(
    lacks_subclass & is_normal,
    aligned_metadata['superclass'],
)

In [None]:
# Preview the first rows of the aligned metadata table.
aligned_metadata.head(n=5)

In [None]:
def visualize_embeddings(embeddings, metadata, method="t-SNE"):
    """Reduce embeddings to two components and plot them under several colorings.

    Args:
        embeddings: The embedding vectors, passed unchanged to the reducer's
            ``fit_transform``.
        metadata: Associated metadata, forwarded to ``plot_embeddings`` together
            with each of the ``color_by`` names 'class_name', 'superclass',
            'subclass' and 'resolution'.
        method: Dimensionality reduction method ("t-SNE", "UMAP", or "PCA").

    Returns:
        The value returned by the reducer's ``fit_transform`` (the
        two-component projection of ``embeddings``).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported_methods = ("t-SNE", "UMAP", "PCA")
    # Fail early with a clear message; previously an unknown method fell through
    # every branch and surfaced as an UnboundLocalError on `reducer`.
    if method not in supported_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {supported_methods}"
        )

    suffix = "with center crop aggregation"

    # Perform dimensionality reduction
    if method == "t-SNE":
        # The iteration count is deliberately left at the library default (1000):
        # the `n_iter` keyword was renamed to `max_iter` in newer scikit-learn
        # releases, so passing either name explicitly breaks on some versions.
        reducer = TSNE(
            n_components=2,
            perplexity=15,
            random_state=42,
            init='pca'
        )
    elif method == "UMAP":
        reducer = umap.UMAP(
            n_neighbors=12,
            min_dist=0.2,
            n_components=2,
            metric='euclidean',
            random_state=42
        )
    else:  # method == "PCA" (guaranteed by the validation above)
        reducer = PCA(n_components=2, random_state=42)

    reduced_data = reducer.fit_transform(embeddings)

    # Plot with different colorings: one figure per metadata column.
    for color_by in ['class_name', 'superclass', 'subclass', 'resolution']:
        plot_embeddings(
            reduced_data=reduced_data,
            metadata=metadata,
            color_by=color_by,
            method_name=method,
            title=f'{method} Projection of LungHist700 Embeddings {suffix}',
            palette_name='tab10'
        )
        plt.show()

    return reduced_data

In [None]:
# Fit and plot the t-SNE projection; keep the returned 2-component coordinates.
tsne_embedding = visualize_embeddings(embeddings, aligned_metadata, method="t-SNE")

In [None]:
# Fit and plot the UMAP projection; keep the returned 2-component coordinates.
umap_embedding = visualize_embeddings(embeddings, aligned_metadata, method="UMAP")

In [None]:
# Fit and plot the PCA projection; keep the returned 2-component coordinates.
pca_embedding = visualize_embeddings(embeddings, aligned_metadata, method="PCA")