# Extracting all HPL embeddings

This notebook takes all image embeddings from the original HPL pipeline from a subsample of tiles for test, train, and validation sets.

**Output:** Numpy arrays of image embeddings (`image_embeddings.npy`) and corresponding filepaths (`image_filenames.npy`) in `test`, `train`, and `val` subfolders. Make sure to adjust filepaths to your specific directory setup as you go through this notebook.

In [1]:
import h5py

file_path = "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_complete_TCGAFFPE_LUADLUSC_5x_60pc_250K_metadata.h5"

with h5py.File(file_path, 'r') as f:
    print(list(f.keys()))
    
    # img_h_latent = f['img_h_latent'][:]
    img_z_latent = f['img_z_latent'][:]
    
print("img_z_latent:", img_z_latent.shape)
print(img_z_latent[:5,:5])

['img_h_latent', 'img_z_latent', 'indexes', 'labels', 'luad', 'original_set', 'patterns', 'slides', 'tiles']
img_z_latent: (552612, 128)
[[ 1.6670141   0.38279754 -0.3874799  -2.0465286   1.8501118 ]
 [-0.7151375   1.660493   -0.27868125 -1.0255438   1.535439  ]
 [-0.5575665  -0.77273774  0.75791293 -0.14159928 -1.5666388 ]
 [-1.0825367  -0.5284177  -2.5313396  -1.6833458   0.13807176]
 [-2.0774322  -0.1368722  -1.0204186   2.1976364   2.4070396 ]]


In [2]:
import pandas as pd

file_path = "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_complete_TCGAFFPE_LUADLUSC_5x_60pc_250K_metadata.h5"

# Assuming 'f' is the h5py File object containing your data
with h5py.File(file_path, 'r') as f:
    # List all keys in the file
    keys = list(f.keys())
    
    # Select keys to exclude 'img_h_latent' and 'img_z_latent'
    exclude_keys = ['img_h_latent', 'img_z_latent']
    selected_keys = [key for key in keys if key not in exclude_keys]
    
    # Extract data for the selected keys and create a DataFrame
    data = {key: f[key][:] for key in selected_keys}
    lung_h5 = pd.DataFrame(data)
    # lung_h5.set_index('indexes', inplace=True)
    # Remove byte string 'b' from all columns where necessary
    lung_h5 = lung_h5.applymap(lambda x: x.decode() if isinstance(x, bytes) else x)

# Show the resulting DataFrame
lung_h5

  lung_h5 = lung_h5.applymap(lambda x: x.decode() if isinstance(x, bytes) else x)


Unnamed: 0,indexes,labels,luad,original_set,patterns,slides,tiles
0,0,3.0,1,train,TCGA-LUAD_stage_iii,TCGA-49-6745-01Z-00-DX7,46_14.jpeg
1,1,6.0,0,train,TCGA-LUSC_stage_i,TCGA-68-A59J-01Z-00-DX1,23_9.jpeg
2,2,2.0,1,train,TCGA-LUAD_stage_ii,TCGA-95-A4VN-01Z-00-DX1,41_5.jpeg
3,3,3.0,1,train,TCGA-LUAD_stage_iii,TCGA-49-4490-01Z-00-DX6,83_25.jpeg
4,4,7.0,0,train,TCGA-LUSC_stage_ii,TCGA-43-6647-01Z-00-DX1,36_20.jpeg
...,...,...,...,...,...,...,...
552607,148732,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,30_25.jpeg
552608,148733,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,26_29.jpeg
552609,148734,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,4_8.jpeg
552610,148735,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,27_22.jpeg


In [3]:
import pandas as pd

# add img_z_latent col
lung_h5['img_z_latent'] = list(img_z_latent)

lung_h5['filepath'] = lung_h5.apply(
        lambda row: f"/gpfs/scratch/yb2612/dl4med_25/dl_project/data/scratch_data/{row['original_set']}/{row['slides']}/{row['tiles']}",
        axis=1
    )

# Replace 'valid' with 'val' in the 'filepath' column
lung_h5['filepath'] = lung_h5['filepath'].str.replace('valid', 'val')

lung_h5

Unnamed: 0,indexes,labels,luad,original_set,patterns,slides,tiles,img_z_latent,filepath
0,0,3.0,1,train,TCGA-LUAD_stage_iii,TCGA-49-6745-01Z-00-DX7,46_14.jpeg,"[1.6670141, 0.38279754, -0.3874799, -2.0465286...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
1,1,6.0,0,train,TCGA-LUSC_stage_i,TCGA-68-A59J-01Z-00-DX1,23_9.jpeg,"[-0.7151375, 1.660493, -0.27868125, -1.0255438...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
2,2,2.0,1,train,TCGA-LUAD_stage_ii,TCGA-95-A4VN-01Z-00-DX1,41_5.jpeg,"[-0.5575665, -0.77273774, 0.75791293, -0.14159...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
3,3,3.0,1,train,TCGA-LUAD_stage_iii,TCGA-49-4490-01Z-00-DX6,83_25.jpeg,"[-1.0825367, -0.5284177, -2.5313396, -1.683345...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
4,4,7.0,0,train,TCGA-LUSC_stage_ii,TCGA-43-6647-01Z-00-DX1,36_20.jpeg,"[-2.0774322, -0.1368722, -1.0204186, 2.1976364...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
...,...,...,...,...,...,...,...,...,...
552607,148732,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,30_25.jpeg,"[-2.2802196, 1.319538, -0.24407692, 0.37196088...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
552608,148733,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,26_29.jpeg,"[-1.7288797, 1.5980006, 1.2009984, -0.6883178,...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
552609,148734,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,4_8.jpeg,"[0.903218, 0.20381093, -0.36375058, -1.0651255...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
552610,148735,9.0,0,test,TCGA-LUSC_stage_iv,TCGA-18-3417-01Z-00-DX1,27_22.jpeg,"[2.048098, -0.59234655, -0.6684551, 1.5173385,...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...


In [4]:
train_df = lung_h5[lung_h5["original_set"] == "train"][["filepath", "img_z_latent"]]
valid_df = lung_h5[lung_h5["original_set"] == "valid"][["filepath", "img_z_latent"]]
test_df  = lung_h5[lung_h5["original_set"] == "test"][["filepath", "img_z_latent"]]

print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

(249001, 2)
(155733, 2)
(147878, 2)


In [7]:
import numpy as np

def save_split(df, output_dir):
    filepaths = df["filepath"].to_numpy()
    embeddings = np.stack(df["img_z_latent"].to_numpy())
    np.save(f"{output_dir}/image_filenames.npy", filepaths)
    np.save(f"{output_dir}/image_embeddings.npy", embeddings)

In [8]:

save_split(train_df, "//gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/train")
save_split(valid_df, "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/val")
save_split(test_df, "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/test")

In [9]:
filenames1 = np.load("/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/test/image_filenames.npy", allow_pickle=True)
#filenames2 = np.load("/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/VICReg_0/dataframes/test/image_filenames.npy", allow_pickle=True)
print(filenames1.shape)
#print(filenames2.shape)
#np.array_equal(filenames1, filenames2)
#np.array_equal(filenames1)

(147878,)


# Extracting subsample of HPL embeddings

### I want to take the same patients as Yumi in the subsample but use my embeddings ###

In [10]:
import numpy as np
import pandas as pd

lung_subsample = pd.read_csv("/gpfs/scratch/yb2612/dl4med_25/dl_project/scratch_data/LUAD vs LUSC lung type/TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_leiden_2p0__fold4_subsample.csv", header=0)
lung_subsample['sampleID'] = lung_subsample['slides'].str[:15]

lung_subsample = lung_subsample.copy()

lung_subsample['filepath'] = lung_subsample.apply(
    lambda row: f"/gpfs/scratch/yb2612/dl4med_25/dl_project/data/scratch_data/{row['original_set']}/{row['slides']}/{row['tiles']}",
    axis=1
)

# Replace 'valid' with 'val' in the 'filepath' column
lung_subsample['filepath'] = lung_subsample['filepath'].str.replace('valid', 'val')
lung_subsample

Unnamed: 0,indexes,labels,luad,original_set,os_event_data,os_event_ind,patterns,slides,tiles,leiden_2.0,sampleID,filepath
0,250215,3.0,1,train,29.753425,1.0,TCGA-LUAD_stage_iii,TCGA-49-4512-01Z-00-DX5,83_6.jpeg,27,TCGA-49-4512-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
1,383757,6.0,0,train,36.591781,0.0,TCGA-LUSC_stage_i,TCGA-85-8580-01Z-00-DX1,5_31.jpeg,5,TCGA-85-8580-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
2,119891,1.0,1,train,19.528767,1.0,TCGA-LUAD_stage_i,TCGA-62-A46P-01Z-00-DX1,27_34.jpeg,36,TCGA-62-A46P-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
3,87582,1.0,1,train,15.189041,0.0,TCGA-LUAD_stage_i,TCGA-55-8087-01Z-00-DX1,3_9.jpeg,6,TCGA-55-8087-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
4,147969,2.0,1,train,16.043836,1.0,TCGA-LUAD_stage_ii,TCGA-49-6742-01Z-00-DX5,2_31.jpeg,26,TCGA-49-6742-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,216554,3.0,1,train,35.539726,1.0,TCGA-LUAD_stage_iii,TCGA-49-4494-01Z-00-DX5,8_14.jpeg,25,TCGA-49-4494-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
199996,87890,1.0,1,train,21.468493,0.0,TCGA-LUAD_stage_i,TCGA-86-8358-01Z-00-DX1,21_22.jpeg,44,TCGA-86-8358-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
199997,210189,2.0,1,train,16.043836,1.0,TCGA-LUAD_stage_ii,TCGA-49-6742-01Z-00-DX2,66_27.jpeg,0,TCGA-49-6742-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
199998,307622,6.0,0,train,151.265753,1.0,TCGA-LUSC_stage_i,TCGA-33-4583-01Z-00-DX2,25_11.jpeg,4,TCGA-33-4583-01,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...


In [11]:
merged_df = lung_h5.merge(lung_subsample, on="filepath", how="right")
print(merged_df.shape)
merged_df["filepath"].duplicated().sum()

(200017, 20)


36

In [12]:
#Checking similar filepaths to be able to do Leiden clustering for test 

filepaths_only = pd.DataFrame({'filepath': lung_subsample['filepath']})
common_filepaths = set(filepaths_only['filepath']).intersection(set(lung_h5['filepath']))
print(f"\nNumber of common filepaths: {len(common_filepaths)}")
print(f"Total filepaths in lung_subsample: {len(filepaths_only)}")
print(f"Total filepaths in lung_h5: {len(lung_h5)}")


Number of common filepaths: 122494
Total filepaths in lung_subsample: 200000
Total filepaths in lung_h5: 552612


In [13]:
# Filter only the test samples because that is what we will be using for Leiden 
test_samples = lung_subsample[lung_subsample['original_set'] == 'test'].copy()
test_filepaths_only = pd.DataFrame({'filepath': test_samples['filepath']})

print(f"Total samples in original dataset: {len(lung_subsample)}")
print(f"Number of test samples: {len(test_samples)}")
print(f"Number of unique file paths: {test_filepaths_only['filepath'].nunique()}")
print(f"Columns in filtered dataset: {test_filepaths_only.columns.tolist()}")

test_filepaths_only.head()

Total samples in original dataset: 200000
Number of test samples: 29860
Number of unique file paths: 29860
Columns in filtered dataset: ['filepath']


Unnamed: 0,filepath
9,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
10,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
14,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
24,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
49,/gpfs/scratch/yb2612/dl4med_25/dl_project/data...


In [14]:
# Now merge with your larger dataset (lung_h5)
merged_df_h5 = lung_h5.merge(test_filepaths_only, on="filepath")

merged_df_h5.head()

Unnamed: 0,indexes,labels,luad,original_set,patterns,slides,tiles,img_z_latent,filepath
0,1125,1.0,1,test,TCGA-LUAD_stage_i,TCGA-L9-A8F4-01Z-00-DX1,43_21.jpeg,"[-0.69408673, -1.8036858, 1.3596141, -1.594538...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
1,1127,1.0,1,test,TCGA-LUAD_stage_i,TCGA-62-A46R-01Z-00-DX1,10_5.jpeg,"[-0.49005964, -1.2694932, 0.34914026, -0.80261...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
2,1128,1.0,1,test,TCGA-LUAD_stage_i,TCGA-44-2657-01Z-00-DX1,1_8.jpeg,"[1.573003, 1.2773548, -0.4838334, -0.4451968, ...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
3,1129,1.0,1,test,TCGA-LUAD_stage_i,TCGA-50-7109-01Z-00-DX1,44_28.jpeg,"[-0.49412078, -1.5519458, 0.4412166, 0.868419,...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...
4,1132,1.0,1,test,TCGA-LUAD_stage_i,TCGA-L9-A8F4-01Z-00-DX1,35_27.jpeg,"[-1.2963312, -0.9693767, 0.5376099, 0.14410585...",/gpfs/scratch/yb2612/dl4med_25/dl_project/data...


In [15]:
print(len(merged_df_h5))

29860


In [16]:
#train_subsample_df = merged_df[merged_df["original_set_x"] == "train"][["filepath", "img_z_latent"]]
#valid_subsample_df = merged_df[merged_df["original_set_x"] == "valid"][["filepath", "img_z_latent"]]
test_subsample_df  = merged_df_h5[["filepath", "img_z_latent"]]

#print(train_subsample_df.shape)
#print(valid_subsample_df.shape)
print(test_subsample_df.shape)

(29860, 2)


In [17]:
#save_split(train_subsample_df, "/gpfs/home/yb2612/dl4med_25/dl_project/results/hpl/train")
#save_split(valid_subsample_df, "/gpfs/home/yb2612/dl4med_25/dl_project/results/hpl/val")

save_split(test_subsample_df, "/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/test")

In [18]:
epoch = 20 

filenames1 = np.load("/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/test/image_filenames.npy", allow_pickle=True)
filenames2 = np.load("/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/epoch_27/dataframes/test/image_embeddings.npy", allow_pickle=True)

print(filenames1.shape)
print(filenames2.shape)

(29860,)
(29860, 128)
