## Running Leiden Clustering ##

In [1]:
import numpy as np
import pandas as pd
import ast
import umap
import math
import os
import scanpy as sc
import anndata as ad

2025-04-29 15:12:51.435663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745953971.461996 2977506 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745953971.471030 2977506 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-29 15:12:51.503026: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Helper functions
def load_data(metadata_file, embedding_file, filenames_file,
              data_root="/gpfs/scratch/yb2612/dl4med_25/dl_project/data/scratch_data"):
    """
    Load metadata, embeddings, and filenames and merge them based on filepaths.

    A 'filepath' column is built for every metadata row as
    ``<data_root>/<original_set>/<slides>/<tiles>`` (with the literal substring
    'valid' rewritten to 'val' in the ``<original_set>/<slides>/<tiles>`` part)
    and used as the join key against the entries of ``filenames_file``.

    Parameters:
    - metadata_file: Path to the CSV file containing metadata. Must provide the
      columns 'original_set', 'slides' and 'tiles'.
    - embedding_file: Path to the .npy file containing the embeddings.
    - filenames_file: Path to the .npy file containing the corresponding filenames.
    - data_root: Directory prefix prepended to every reconstructed tile path.
      Defaults to the location that was previously hard-coded.

    Returns:
    - Merged DataFrame with embeddings and metadata (inner join on 'filepath',
      so rows without a match on either side are dropped). Each embedding is
      stored as one object in the 'img_z_latent' column.

    Raises:
    - ValueError: if the embeddings and filenames arrays differ in length.
    """
    print("Loading metadata and embeddings...")
    # Load metadata
    metadata = pd.read_csv(metadata_file)

    # Build "<set>/<slide>/<tile>" with vectorised string operations rather
    # than a Python-level call per row.
    relative_path = (
        metadata['original_set'].astype(str)
        + "/" + metadata['slides'].astype(str)
        + "/" + metadata['tiles'].astype(str)
    )

    # Replace 'valid' with 'val'. regex=False makes this a literal replacement
    # regardless of the pandas version's default, and doing it before the root
    # is prepended keeps a custom data_root from being rewritten.
    relative_path = relative_path.str.replace('valid', 'val', regex=False)
    metadata['filepath'] = data_root.rstrip("/") + "/" + relative_path

    # Load embeddings and filenames
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code; only load files from a trusted source.
    embeddings = np.load(embedding_file, allow_pickle=True)
    filepaths = np.load(filenames_file, allow_pickle=True)

    if len(embeddings) != len(filepaths):
        raise ValueError(
            f"Embeddings ({len(embeddings)}) and filenames ({len(filepaths)}) differ in length."
        )

    # One embedding per row, kept as a single object in its own column
    embedding_df = pd.DataFrame({
        "filepath": filepaths,
        "img_z_latent": list(embeddings)
    })

    # Merge embeddings with metadata
    merged_df = metadata.merge(embedding_df, on="filepath", how="inner")
    return merged_df

def run_umap(merged_df, n_neighbors=30, min_dist=0.0, n_components=2, random_state=42):
    """
    Perform UMAP transformation on the img_z_latent column of the merged dataframe.

    The input frame is modified in place: 'img_z_latent' is overwritten with its
    parsed form (string entries are passed through ast.literal_eval, everything
    else is kept as is) and one 'umap_<i>' column (1-based) is added per
    component returned by UMAP. The same frame object is returned.

    Parameters:
    - merged_df: DataFrame with the img_z_latent column.
    - n_neighbors, min_dist, n_components, random_state: UMAP hyperparameters.

    Returns:
    - DataFrame with UMAP results added as 'umap_1' ... 'umap_<n_components>'.
    """
    # Clean 'img_z_latent' column: embeddings that were serialised as text
    # (e.g. after a CSV round trip) are turned back into Python sequences.
    merged_df['img_z_latent'] = merged_df['img_z_latent'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    latent_matrix = pd.DataFrame(merged_df['img_z_latent'].to_list())

    # Perform UMAP
    print("Running UMAP...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state,
        low_memory=True,
    )
    umap_result = umap_model.fit_transform(latent_matrix)

    # Add UMAP results to DataFrame. Iterating over the actual output width
    # (instead of hard-coding two columns) keeps every requested component and
    # avoids an IndexError when n_components == 1.
    for component in range(umap_result.shape[1]):
        merged_df[f'umap_{component + 1}'] = umap_result[:, component]

    return merged_df
    
def run_leiden(merged_df, resolution=2.0, n_neighbors=250):
    """
    Run Leiden clustering using Scanpy on img_z_latent and append the labels to the dataframe.

    The input frame is modified in place: 'img_z_latent' is overwritten with its
    parsed form (string entries are passed through ast.literal_eval) and the
    clustering column is added. The same frame object is returned.

    Parameters:
    - merged_df: DataFrame with the img_z_latent column.
    - resolution: Resolution parameter for the Leiden algorithm.
    - n_neighbors: Neighbourhood size used to build the graph that Leiden runs
      on. Defaults to 250, the value that was previously hard-coded.

    Returns:
    - DataFrame with a new column 'leiden_{resolution}' for clustering labels.
    """
    print("Running Leiden clustering...")

    # Column / obs key is derived from the resolution, e.g. 'leiden_2.0'
    cluster_key = f'leiden_{resolution}'

    # Prepare latent embedding matrix (one row per tile)
    merged_df['img_z_latent'] = merged_df['img_z_latent'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    X = np.vstack(merged_df['img_z_latent'].to_numpy())

    # Create AnnData object
    adata = ad.AnnData(X)

    # Build neighborhood graph directly on the embedding matrix
    sc.pp.neighbors(adata, use_rep='X', n_neighbors=n_neighbors, method='umap')

    # Run Leiden clustering
    sc.tl.leiden(adata, resolution=resolution, key_added=cluster_key)

    # Append clustering labels to original dataframe. `.values` drops the
    # AnnData obs index so assignment is positional, matching the row order
    # used to build X.
    merged_df[cluster_key] = adata.obs[cluster_key].values

    return merged_df

In [4]:
# USAGE
# For every (model, split) pair: load the embeddings, add UMAP coordinates and
# Leiden cluster labels, and write the combined table next to the embeddings.
metadata_file = "/gpfs/scratch/yb2612/dl4med_25/dl_project/scratch_data/hpl-clip/lung_subsample_clinical_clusters.csv"

epoch = 27

# Common prefix of all per-model result directories
results_root = "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results"

for model in ["BarlowTwins_3"]:
    # `split` rather than `set`, so the builtin `set` is not shadowed
    for split in ["test"]:
        print(f"Processing {model}/{split} data...")
        split_dir = f"{results_root}/{model}/epoch_{epoch}/dataframes/{split}"
        embedding_file = f"{split_dir}/image_embeddings.npy"
        filenames_file = f"{split_dir}/image_filenames.npy"
        save_dir = f"{split_dir}/leiden"

        merged_df = load_data(metadata_file, embedding_file, filenames_file)
        merged_df = run_umap(merged_df)
        merged_df = run_leiden(merged_df)

        # Save the dataframe; create the output directory first so the write
        # does not fail on a fresh results tree.
        os.makedirs(save_dir, exist_ok=True)
        output_file = os.path.join(save_dir, "umap_leiden_results.csv")
        merged_df.to_csv(output_file, index=False)
        print(f"Saved UMAP + Leiden results to {output_file}")

Processing VICReg_5/test data...
Loading metadata and embeddings...
Running UMAP...


  warn(


Running Leiden clustering...



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=resolution, key_added=f'leiden_{resolution}')


Saved UMAP + Leiden results to /gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/VICReg_5/epoch_20/dataframes/test/leiden/umap_leiden_results.csv
