In [9]:
import pandas as pd

# Read the complete subsample table; row 0 of the CSV supplies the column names.
subsample_csv_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/LUAD vs LUSC lung type/TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_leiden_2p0__fold4_subsample.csv"
lung_subsample = pd.read_csv(subsample_csv_path, header=0)

# Construct full filepaths: <tile_root>/<split>/<slide>/<tile>
tile_root = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data"

# The CSV labels one split 'valid', but the filepath uses 'val' for it.
# Remap that exact label only, instead of substring-replacing across the whole
# path, so a slide or tile name that happens to contain "valid" is never
# rewritten and the result does not depend on `str.replace`'s regex default.
split_dir = lung_subsample['original_set'].astype(str).replace({'valid': 'val'})

# Vectorised string concatenation replaces the row-wise `apply(axis=1)`:
# same strings, no per-row Python call, and it also works on an empty frame.
# `astype(str)` mirrors the f-string's implicit str() conversion.
lung_subsample['filepath'] = (
    tile_root + "/" + split_dir
    + "/" + lung_subsample['slides'].astype(str)
    + "/" + lung_subsample['tiles'].astype(str)
)

# Caption tables live in one folder, one CSV per split
# (lung_<split>_filepath_caption.csv); read them in train/val/test order.
captions_dir = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hpl-clip/long_consistent_captions"
train_captions, val_captions, test_captions = (
    pd.read_csv(f"{captions_dir}/lung_{split}_filepath_caption.csv")
    for split in ("train", "val", "test")
)

# Merged outputs are written to this directory
results_dir = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hpl-clip/long_consistent_captions"

def merge_and_save(subset_name, subset_df, caption_df, output_dir=None):
    """Attach captions to one split's tiles and write the result to CSV.

    Only the ``filepath`` column of ``subset_df`` is kept. It is inner-joined
    with ``caption_df`` on ``filepath``, so tiles that have no caption row are
    dropped and the output holds ``filepath`` plus every column of
    ``caption_df``.

    Parameters
    ----------
    subset_name : str
        Split tag used in the output file name
        (``lung_<subset_name>_subsample_filepath_caption.csv``).
    subset_df : pandas.DataFrame
        Subsample rows for this split; must contain a ``filepath`` column.
    caption_df : pandas.DataFrame
        Caption table for the same split; must contain a ``filepath`` column.
    output_dir : str, optional
        Directory to write into. When omitted, the notebook-level
        ``results_dir`` is used, so existing three-argument calls behave
        exactly as before.

    Returns
    -------
    None
        The merged table is written to disk (without the index) and not returned.
    """
    # Resolve the fallback at call time so the global is only needed when the
    # caller does not pass an explicit directory.
    if output_dir is None:
        output_dir = results_dir
    merged = pd.merge(subset_df[['filepath']], caption_df, on='filepath', how='inner')
    merged.to_csv(f"{output_dir}/lung_{subset_name}_subsample_filepath_caption.csv", index=False)

# (output tag, label in 'original_set', caption table) for each split.
# The validation split is tagged 'val' in file names but 'valid' in the CSV.
for subset_name, set_label, split_captions in (
    ("train", "train", train_captions),
    ("val", "valid", val_captions),
    ("test", "test", test_captions),
):
    in_split = lung_subsample['original_set'] == set_label
    merge_and_save(subset_name, lung_subsample[in_split], split_captions)

In [13]:
import numpy as np
import pandas as pd

# Load saved .npy files: the CLIP embeddings and the tile paths saved with them
embeddings = np.load("/gpfs/home/yb2612/dl4med_25/dl_project/results/lung_test_subsample_clip_embeddings.npy")
filepaths = np.load("/gpfs/home/yb2612/dl4med_25/dl_project/results/lung_test_subsample_filenames.npy")

print(embeddings.shape)  # (n_samples, 512) for ViT-B-32
print(len(filepaths))    # should match n_samples

# Embeddings and paths are paired by position later on, so turn the visual
# "should match" check into a hard failure with an explicit message.
assert len(embeddings) == len(filepaths), (
    f"embeddings/filepaths length mismatch: {len(embeddings)} vs {len(filepaths)}"
)

(29860, 512)
29860


In [14]:
# Split the 2-D embedding array into one vector per tile (iteration yields rows)
img_z_latent = list(embeddings)

# Pair each tile path with the embedding at the same position
df_embed = pd.DataFrame({"filepath": filepaths, "img_z_latent": img_z_latent})

# Inner-join the embeddings onto the `lung_test` metadata by filepath; only
# tiles present on both sides are kept.
# NOTE(review): `lung_test` is not defined in any cell visible here; it must
# already exist in the kernel. Confirm it is created before this cell so the
# notebook survives Restart & Run All.
lung_test_embeddings = pd.merge(lung_test, df_embed, on="filepath", how="inner")

# Display the merged table as a sanity check
lung_test_embeddings

Unnamed: 0,indexes,labels,luad,original_set,os_event_data,os_event_ind,patterns,slides,tiles,leiden_2.0,filepath,img_z_latent
0,75891,4.0,1,test,1.906849,1.0,TCGA-LUAD_stage_iv,TCGA-L9-A5IP-01Z-00-DX1,27_13.jpeg,21,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.04586151, 0.0047151954, -0.032217763, 0.035..."
1,144579,8.0,0,test,14.071233,1.0,TCGA-LUSC_stage_iii,TCGA-33-4586-01Z-00-DX1,29_31.jpeg,23,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.048177395, 0.00055845553, -0.024108294, 0.0..."
2,82707,6.0,0,test,61.019178,1.0,TCGA-LUSC_stage_i,TCGA-51-6867-01Z-00-DX1,42_12.jpeg,29,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.04119467, -0.0011202622, -0.033073902, 0.03..."
3,127263,7.0,0,test,2.893151,1.0,TCGA-LUSC_stage_ii,TCGA-NC-A5HL-01Z-00-DX1,37_15.jpeg,27,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.041049853, -0.013593568, -0.03477461, 0.027..."
4,2372,1.0,1,test,69.336986,0.0,TCGA-LUAD_stage_i,TCGA-55-6980-01Z-00-DX1,6_19.jpeg,7,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.041572277, -0.004507632, -0.02551377, 0.032..."
...,...,...,...,...,...,...,...,...,...,...,...,...
29855,137200,8.0,0,test,14.071233,1.0,TCGA-LUSC_stage_iii,TCGA-33-4586-01Z-00-DX2,28_19.jpeg,3,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.04282359, 0.0072973305, -0.025575154, 0.025..."
29856,75657,4.0,1,test,31.134247,0.0,TCGA-LUAD_stage_iv,TCGA-86-7701-01Z-00-DX1,22_11.jpeg,43,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.044410508, -0.0031977003, -0.031851176, 0.0..."
29857,74478,3.0,1,test,53.293151,0.0,TCGA-LUAD_stage_iii,TCGA-49-6743-01Z-00-DX2,61_34.jpeg,8,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.0240903, -0.010693876, -0.031577908, 0.0304..."
29858,72686,3.0,1,test,53.293151,0.0,TCGA-LUAD_stage_iii,TCGA-49-6743-01Z-00-DX2,25_40.jpeg,19,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...,"[0.041391127, -0.00020595707, -0.02155271, 0.0..."


In [15]:
# Count missing values per column of the merged table
lung_test_embeddings.isnull().sum()

indexes          0
labels           0
luad             0
original_set     0
os_event_data    0
os_event_ind     0
patterns         0
slides           0
tiles            0
leiden_2.0       0
filepath         0
img_z_latent     0
dtype: int64

In [17]:
# Persist the merged metadata + embeddings table, without the row index.
# NOTE(review): `img_z_latent` holds one array per row, which CSV can only
# store as text; confirm the downstream loader parses it back, or consider a
# binary format for the vectors.
embeddings_csv_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/lung_test_embeddings.csv"
lung_test_embeddings.to_csv(embeddings_csv_path, index=False)