# create_csv.ipynb

## Description
This Jupyter notebook creates a CSV file for the directory of data you want to fine-tune on.
Specifically, the code lists the files in that directory and randomly assigns each one to a train, val, or test split.
The CSV file is written to the parent of the data directory and has the same name as the directory. You must run this notebook before using your dataloader. Because each sample is assigned to a split at random, the realized split proportions approach the requested probabilities more closely as the number of samples increases.

In [1]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

## Bruno dataset (DS)

In [2]:
# Input: root of the preprocessed Bruno data (its data/final_dataset.csv is read below).
bruno_dataset = Path("/local_mount/space/mayday/data/users/zachs/fast-mri-ldm/pre_processing/preprocess-bruno-2025-data")

# Output: directory that receives one .npy file per exported slice.
bruno_dst_dir = Path("/local_mount/space/mayday/data/datasets/ladyyy/datasets/med_vae_train/bruno_all_slices")

# Create the output directory (and any missing parents); a pre-existing directory is fine.
bruno_dst_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Read the per-scan metadata table and preview its first rows.
df = pd.read_csv(bruno_dataset.joinpath("data", "final_dataset.csv"))
display(df.head(5))

Unnamed: 0,exam_id,scanner,scan_type,file_path,dimensionality,Nx,Ny,Nz,recon_done,cg_recon_path,...,include_fista,z_start,z_end,anatomy,is_flipped,fov_over_2_shift,view_done,recon_html,location,name
0,10382,chmr-1,SAG T2 Cube,/local_mount/space/mayday/data/datasets/bruno/...,3D,256,257,330,True,./data/recons/3DFSE/10382_SAG T2 Cube/recon_cg...,...,False,40,275,sagittal,True,False,True,/my_static/data/recons/3DFSE/10382_SAG T2 Cube...,./data/recons/3DFSE/10382_SAG T2 Cube,10382_3DFSE_SAG_T2_Cube
1,10382,chmr-1,Sag T2 FLAIR Cube FS,/local_mount/space/mayday/data/datasets/bruno/...,3D,256,257,330,True,./data/recons/3DFSE/10382_Sag T2 FLAIR Cube FS...,...,True,44,280,sagittal,True,False,True,/my_static/data/recons/3DFSE/10382_Sag T2 FLAI...,./data/recons/3DFSE/10382_Sag T2 FLAIR Cube FS,10382_3DFSE_Sag_T2_FLAIR_Cube_FS
2,10382,chmr-1,Ax 3D T1 (BRAVO),/local_mount/space/mayday/data/datasets/bruno/...,3D,256,256,256,True,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO)/r...,...,True,20,216,axial,True,False,True,/my_static/data/recons/EFGRE3D/10382_Ax 3D T1 ...,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO),10382_EFGRE3D_Ax_3D_T1_(BRAVO)
3,10382,chmr-1,Ax 3D T1 (BRAVO) POST,/local_mount/space/mayday/data/datasets/bruno/...,3D,256,256,256,True,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO) P...,...,True,8,220,axial,True,False,True,/my_static/data/recons/EFGRE3D/10382_Ax 3D T1 ...,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO) POST,10382_EFGRE3D_Ax_3D_T1_(BRAVO)_POST
4,10382,chmr-1,Ax MFAST post,/local_mount/space/mayday/data/datasets/bruno/...,3D,256,312,314,True,./data/recons/mfastv1814/10382_Ax MFAST post/r...,...,False,0,-1,axial,False,False,True,/my_static/data/recons/mfastv1814/10382_Ax MFA...,./data/recons/mfastv1814/10382_Ax MFAST post,10382_mfastv1814_Ax_MFAST_post


In [None]:
# Export every usable reconstruction in `df` as individual 2-D slices (.npy files in
# `bruno_dst_dir`) and collect one record per slice in `save_df` for the split CSV.
save_df = []
global_cnt = 0

train_pct = 0.7
val_pct = 0.15
test_pct = 0.15

# Create a list of probabilities
probabilities = [train_pct, val_pct, test_pct]
splits = ["train", "val", "test"]

# Seeded, cell-local generators so that re-running the cell reproduces the same splits
# and the same file names (re-runs then overwrite files instead of piling up duplicates).
seed = 42
split_rng = np.random.default_rng(seed)
id_rng = random.Random(seed)

# One split per exam: several rows share an exam_id (different scan types of the same
# exam), and drawing a split per row would leak an exam across train/val/test.
# NOTE(review): assumes exam_id is the right grouping key for "per patient" - confirm.
exam_to_split = {}

for j in tqdm(range(len(df))):
    row = df.iloc[j]

    exam_id = row["exam_id"]
    if exam_id not in exam_to_split:
        exam_to_split[exam_id] = str(split_rng.choice(splits, p=probabilities))
    split = exam_to_split[exam_id]

    # Prefer the FISTA reconstruction, fall back to CG, skip rows that have neither.
    if row["include_fista"]:
        recon_kind, recon_rel_path = "fista", row["fista_recon_path"]
    elif row["include_cg"]:
        recon_kind, recon_rel_path = "cg", row["cg_recon_path"]
    else:
        continue

    # Paths in the CSV are stored relative to the dataset root with a leading "./".
    recon = np.load(bruno_dataset / recon_rel_path[2:])

    # z_end is inclusive. A value of -1 must map to "through the last slice":
    # the naive `z_end + 1` would give a stop index of 0 and an empty volume.
    z_start = int(row["z_start"])
    z_end = int(row["z_end"])
    z_stop = None if z_end == -1 else z_end + 1
    recon = recon[..., z_start:z_stop]

    if row["fov_over_2_shift"]:
        recon = np.fft.fftshift(recon, axes=1)

    # `i` indexes the cropped volume (0 corresponds to z_start), not the original one.
    for i in tqdm(range(recon.shape[-1]), leave=False):
        recon_slice = recon[..., i]
        rand_id = id_rng.randint(0, 1000000)
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        # The tag records which reconstruction was actually loaded (fista or cg).
        filename = (
            f"{exam_id}_{row['scanner']}_{row['scan_type']}_{recon_kind}_{i}_{rand_id}.npy"
        ).replace(" ", "_").replace("/", "_")
        # Strip only the ".npy" suffix; splitting on the first "." would truncate
        # ids whose scan_type contains a dot.
        file_id = Path(filename).stem
        np.save(bruno_dst_dir / filename, recon_slice)

        save_df.append({"row_nr": global_cnt, "image_uuid": file_id, "split": split})
        global_cnt += 1

# The collected records are written to CSV by the following cell.

100%|██████████| 237/237 [00:24<00:00,  9.59it/s]
100%|██████████| 197/197 [00:18<00:00, 10.49it/s]
100%|██████████| 213/213 [00:19<00:00, 10.69it/s]
100%|██████████| 181/181 [00:15<00:00, 11.49it/s]
100%|██████████| 169/169 [00:18<00:00,  9.13it/s]
100%|██████████| 67/67 [00:07<00:00,  9.53it/s]t]
100%|██████████| 45/45 [00:05<00:00,  8.87it/s]t]
100%|██████████| 186/186 [00:16<00:00, 11.51it/s]
100%|██████████| 177/177 [00:15<00:00, 11.32it/s]]
100%|██████████| 171/171 [00:17<00:00,  9.69it/s]]
100%|██████████| 75/75 [00:08<00:00,  8.38it/s]it]
100%|██████████| 53/53 [00:06<00:00,  7.62it/s]it]
100%|██████████| 172/172 [00:20<00:00,  8.33it/s]]
100%|██████████| 163/163 [00:16<00:00,  9.83it/s]]
100%|██████████| 49/49 [00:05<00:00,  8.84it/s]it]
100%|██████████| 41/41 [00:04<00:00,  9.06it/s]]  
100%|██████████| 69/69 [00:10<00:00,  6.89it/s]]
100%|██████████| 47/47 [00:06<00:00,  6.94it/s]]
100%|██████████| 185/185 [00:19<00:00,  9.36it/s]
100%|██████████| 113/113 [00:13<00:00,  8.60

In [None]:
# Convert the collected slice records to a table and write it beside the slice
# directory (i.e. in its parent folder) as bruno_ds.csv.
save_df = pd.DataFrame(data=save_df)
save_df.to_csv(bruno_dst_dir.with_name("bruno_ds.csv"), index=False)

## DICOMs

In [2]:
# Input: root of the preprocessed data (its data/updated_dataset_w_dicoms_v2.csv is read below).
dicom_dataset = Path("/local_mount/space/mayday/data/users/zachs/fast-mri-ldm/pre_processing/preprocess-bruno-2025-data")

# Output: directory that receives one .npy file per exported DICOM slice.
dicom_dst_dir = Path("/local_mount/space/mayday/data/datasets/ladyyy/datasets/med_vae_train/bruno_dicoms_v2")

# Create the output directory (and any missing parents); a pre-existing directory is fine.
dicom_dst_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Read the metadata table that carries the new/prior DICOM scan paths and preview it.
df = pd.read_csv(dicom_dataset.joinpath("data", "updated_dataset_w_dicoms_v2.csv"))
display(df.head(5))

Unnamed: 0,exam_id,scanner,scan_type,file_path,dimensionality,Nx,Ny,Nz,recon_done,cg_recon_path,...,z_end,anatomy,is_flipped,fov_over_2_shift,view_done,recon_html,location,name,new_scan_path,prior_scan_path
0,10382,chmr-1,SAG T2 Cube,/local_mount/space/mayday/data/datasets/bruno/...,3D,256.0,257.0,330.0,1.0,./data/recons/3DFSE/10382_SAG T2 Cube/recon_cg...,...,275.0,sagittal,1.0,0.0,1.0,/my_static/data/recons/3DFSE/10382_SAG T2 Cube...,./data/recons/3DFSE/10382_SAG T2 Cube,10382_3DFSE_SAG_T2_Cube,/local_mount/space/mayday/data/users/zachs/fas...,/local_mount/space/mayday/data/users/zachs/fas...
1,10382,chmr-1,Sag T2 FLAIR Cube FS,/local_mount/space/mayday/data/datasets/bruno/...,3D,256.0,257.0,330.0,1.0,./data/recons/3DFSE/10382_Sag T2 FLAIR Cube FS...,...,280.0,sagittal,1.0,0.0,1.0,/my_static/data/recons/3DFSE/10382_Sag T2 FLAI...,./data/recons/3DFSE/10382_Sag T2 FLAIR Cube FS,10382_3DFSE_Sag_T2_FLAIR_Cube_FS,/local_mount/space/mayday/data/users/zachs/fas...,/local_mount/space/mayday/data/users/zachs/fas...
2,10382,chmr-1,Ax 3D T1 (BRAVO),/local_mount/space/mayday/data/datasets/bruno/...,3D,256.0,256.0,256.0,1.0,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO)/r...,...,216.0,axial,1.0,0.0,1.0,/my_static/data/recons/EFGRE3D/10382_Ax 3D T1 ...,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO),10382_EFGRE3D_Ax_3D_T1_(BRAVO),/local_mount/space/mayday/data/users/zachs/fas...,/local_mount/space/mayday/data/users/zachs/fas...
3,10382,chmr-1,Ax 3D T1 (BRAVO) POST,/local_mount/space/mayday/data/datasets/bruno/...,3D,256.0,256.0,256.0,1.0,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO) P...,...,220.0,axial,1.0,0.0,1.0,/my_static/data/recons/EFGRE3D/10382_Ax 3D T1 ...,./data/recons/EFGRE3D/10382_Ax 3D T1 (BRAVO) POST,10382_EFGRE3D_Ax_3D_T1_(BRAVO)_POST,/local_mount/space/mayday/data/users/zachs/fas...,
4,10382,chmr-1,Ax MFAST post,/local_mount/space/mayday/data/datasets/bruno/...,3D,256.0,312.0,314.0,1.0,./data/recons/mfastv1814/10382_Ax MFAST post/r...,...,-1.0,axial,0.0,0.0,1.0,/my_static/data/recons/mfastv1814/10382_Ax MFA...,./data/recons/mfastv1814/10382_Ax MFAST post,10382_mfastv1814_Ax_MFAST_post,/local_mount/space/mayday/data/users/zachs/fas...,/local_mount/space/mayday/data/users/zachs/fas...


In [5]:
# Export the high-energy slices of every "new" and "prior" DICOM volume listed in `df`
# as .npy files in `dicom_dst_dir`, collecting one record per slice in `save_df`.
save_df = []
global_cnt = 0

train_pct = 0.7
val_pct = 0.15
test_pct = 0.15

# Create a list of probabilities
probabilities = [train_pct, val_pct, test_pct]
splits = ["train", "val", "test"]

# A slice is kept when its energy is at least this fraction of the volume's maximum.
energy_threshold_frac = 0.3

# Seeded, cell-local generators so that re-running the cell reproduces the same splits
# and the same file names.
seed = 42
split_rng = np.random.default_rng(seed)
id_rng = random.Random(seed)

# One split per exam: several rows share an exam_id, and drawing a split per row
# would spread one exam across train/val/test.
# NOTE(review): assumes exam_id is the right grouping key - confirm.
exam_to_split = {}

for j in tqdm(range(len(df))):
    row = df.iloc[j]

    exam_id = row["exam_id"]
    if exam_id not in exam_to_split:
        exam_to_split[exam_id] = str(split_rng.choice(splits, p=probabilities))
    # The same split is used for every slice of both the new and the prior scan.
    split = exam_to_split[exam_id]

    # Process both "new" and "prior" scans.
    for status in ["new", "prior"]:
        # Get the scan path for the current status; skip missing or non-existent paths.
        scan_path = row.get(f"{status}_scan_path", None)
        if pd.isnull(scan_path) or not os.path.exists(scan_path):
            continue

        try:
            # Load the volume; slices are taken along the first axis.
            volume = np.load(scan_path)
        except Exception as e:
            # Deliberate best-effort: report and move on to the next scan.
            print(f"Error loading volume at {scan_path}: {e}")
            continue

        # An empty volume has no slices; `max()` below would raise on it.
        if volume.ndim == 0 or volume.shape[0] == 0:
            print(f"Empty volume at {scan_path}")
            continue

        # Energy of a slice = sum of squared magnitudes of its pixels.
        energies = np.array(
            [np.sum(np.abs(volume[i, ...]) ** 2) for i in range(volume.shape[0])]
        )
        max_energy = energies.max()

        # An all-zero volume would otherwise pass every blank slice (threshold 0),
        # and a NaN maximum passes none; skip both.
        if not max_energy > 0:
            print(f"No slices meet the energy threshold in volume at {scan_path}")
            continue

        threshold = energy_threshold_frac * max_energy

        # Indices of slices that meet the energy threshold (never empty here, since
        # the maximum-energy slice always qualifies).
        valid_indices = np.flatnonzero(energies >= threshold)

        for i in valid_indices:
            i = int(i)
            slice_img = volume[i, ...]
            rand_id = id_rng.randint(0, 1000000)
            filename = f"{row['exam_id']}_{row['scanner']}_{row['scan_type']}_{status}_{i}_{rand_id}.npy"
            filename = filename.replace(" ", "_").replace("/", "_")
            # Strip only the ".npy" suffix; splitting on the first "." would truncate
            # ids whose scan_type contains a dot.
            file_id = Path(filename).stem
            save_path = os.path.join(dicom_dst_dir, filename)
            try:
                np.save(save_path, slice_img)
            except Exception as e:
                print(f"Error saving slice {i} for {scan_path}: {e}")
                continue

            save_df.append(
                {
                    "row_nr": global_cnt,
                    "image_uuid": file_id,
                    "split": split,
                }
            )
            global_cnt += 1

  0%|          | 0/631 [00:16<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Convert the collected slice records to a table and write it beside the slice
# directory (i.e. in its parent folder) as dicom_ds.csv.
save_df = pd.DataFrame(data=save_df)
save_df.to_csv(dicom_dst_dir.with_name("dicom_ds.csv"), index=False)

In [None]:
## Note: CODE TO CHANGE HERE ##

# Directory whose files will be split. Example data here is mammogram data for training.
DATA_PATH = os.path.abspath("../medvae/data/mmg_data")

# Probability of assigning a file to each split.
train_pct = 0.6
val_pct = 0.2
test_pct = 0.2

# Create a list of probabilities (same order as `splits`)
probabilities = [train_pct, val_pct, test_pct]
splits = ["train", "val", "test"]

# Make sure the percentages are valid and sum to 1. The comparison is tolerant because
# exact float equality rejects legitimate choices (0.7 + 0.2 + 0.1 != 1.0 in floating point).
assert all(p >= 0 for p in probabilities), "split probabilities must be non-negative"
assert np.isclose(sum(probabilities), 1.0), "split probabilities must sum to 1"

# Make sure splits == pcts
assert len(splits) == len(probabilities), "need exactly one probability per split"

In [None]:
# Build the split CSV for DATA_PATH: one row per file with a randomly assigned split.

# List the directory in a stable order (os.listdir order is arbitrary) and drop hidden
# entries such as ".DS_Store", which would otherwise yield an empty image_uuid.
data_files = sorted(name for name in os.listdir(DATA_PATH) if not name.startswith("."))

# Seed both sources of randomness: the shuffle and the split draw. Seeding only the
# `random` module leaves the numpy-based split assignment different on every run.
seed = 42
random.Random(seed).shuffle(data_files)
split_rng = np.random.default_rng(seed)

save_df = []

# Iterate through all the files in the data directory
for i, data_file in tqdm(enumerate(data_files), total=len(data_files)):
    # The id is the file name up to its first "." (drops all extensions, e.g. ".nii.gz").
    file_id = data_file.split(".")[0]

    save_df.append(
        {
            "row_nr": i,
            "image_uuid": file_id,
            # Randomly assign the split according to `probabilities`
            "split": str(split_rng.choice(splits, p=probabilities)),
        }
    )

# Create a pandas DataFrame from the save_df list
save_df = pd.DataFrame(save_df)

# Save the DataFrame as <parent of DATA_PATH>/<directory name>.csv. normpath removes a
# trailing separator, which would otherwise produce an empty directory name.
data_dir = os.path.normpath(DATA_PATH)
save_df.to_csv(
    os.path.join(os.path.dirname(data_dir), f"{os.path.basename(data_dir)}.csv"),
    index=False,
)

100%|██████████| 10/10 [00:00<00:00, 7588.75it/s]
