In [5]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the BraTS 2021 Task 1 dataset through kagglehub and keep the location
# it returns; the extraction cell below builds the archive path from it.
dschettler8845_brats_2021_task1_path = kagglehub.dataset_download(
    'dschettler8845/brats-2021-task1',
)

print('Data source import complete.')


Data source import complete.


In [None]:
import os
import nibabel as nib
import numpy as np
import tarfile

# Unpack the BraTS 2021 training archive that ships inside the downloaded
# Kaggle dataset into a working directory read by the preprocessing cell.
path_to_tar_file = os.path.join(
    dschettler8845_brats_2021_task1_path, "BraTS2021_Training_Data.tar"
)
output1_directory = "/content/brats21-dataset-training-validation"

# Open the tar file in read mode
with tarfile.open(path_to_tar_file, "r") as tar_ref:
    # Extract all contents to the specified directory.
    # Where the runtime supports extraction filters, use the "data" filter so
    # that members with absolute paths, ".." components or special file types
    # are rejected instead of being written outside `output1_directory`; this
    # also avoids the DeprecationWarning newer Python versions emit when no
    # filter is given. Older interpreters fall back to the plain call.
    if hasattr(tarfile, "data_filter"):
        tar_ref.extractall(output1_directory, filter="data")
    else:
        tar_ref.extractall(output1_directory)

print(f"Extraction completed. Files are saved to {output1_directory}")

In [8]:
# Preprocess every subject directory into one compressed .npz file.
#
# For each sub-directory of `input_directory` this cell:
#   1. loads the four modality volumes listed in `modalities`,
#   2. moves the last array axis to the front and stacks the modalities into
#      a single float32 array (channel axis first),
#   3. loads the segmentation file and converts it to int64 with the same
#      axis order,
#   4. z-score normalises each channel with its own mean / std,
#   5. writes `<subject>.npz` (keys: "volume", "label") to `output_directory`.
input_directory = "/content/brats21-dataset-training-validation"
# Absolute path, matching `input_directory` and the path the later `zip` cell
# archives, so the result does not depend on the kernel's working directory.
output_directory = "/content/brats21-dataset-preprocessed"
modalities = ['flair', 't1', 't1ce', 't2']

os.makedirs(output_directory, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
subjects = sorted(
    s for s in os.listdir(input_directory)
    if os.path.isdir(os.path.join(input_directory, s))
)

for subject_name in subjects:
    subject_path = os.path.join(input_directory, subject_name)

    volume_list = []
    for mod in modalities:
        modal_path = os.path.join(subject_path, f"{subject_name}_{mod}.nii.gz")
        # Read straight into float32: the stacked volume is stored as float32
        # anyway, so this avoids materialising a float64 copy per modality.
        img = nib.load(modal_path).get_fdata(dtype=np.float32)  # presumably (H, W, D)
        # Move the last axis first so each modality becomes (D, H, W); the
        # modalities are stacked into (4, D, H, W) below.
        img = np.transpose(img, (2, 0, 1))
        volume_list.append(img)

    # np.stack already allocates a fresh float32 array; copy=False skips a
    # second, redundant copy.
    volume = np.stack(volume_list, axis=0).astype(np.float32, copy=False)  # (4, D, H, W)

    # Load label
    label_path = os.path.join(subject_path, f"{subject_name}_seg.nii.gz")
    lbl = nib.load(label_path).get_fdata()  # floating point, same layout as the images
    # Round before casting: astype(int64) truncates toward zero, so a label
    # stored as e.g. 3.9999999 would otherwise silently become 3.
    lbl = np.rint(np.transpose(lbl, (2, 0, 1))).astype(np.int64)  # (D, H, W)

    # Normalization: independent z-score per channel. The epsilon guards
    # against division by zero for a constant channel.
    # NOTE(review): the statistics are taken over the whole array, including
    # any zero-valued background - confirm this is the intended scheme.
    for c in range(volume.shape[0]):
        vol_ch = volume[c]
        volume[c] = (vol_ch - np.mean(vol_ch)) / (np.std(vol_ch) + 1e-8)

    # Save preprocessed data
    # Using np.savez_compressed reduces file size and is still fairly quick to load
    output_path = os.path.join(output_directory, subject_name + ".npz")
    np.savez_compressed(output_path, volume=volume, label=lbl)

print("Preprocessing completed and cached data saved.")


Preprocessing completed and cached data saved.


In [9]:
!zip -r /content/brats21-dataset-preprocessed.zip /content/brats21-dataset-preprocessed

  adding: content/brats21-dataset-preprocessed/ (stored 0%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01314.npz (deflated 4%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_00579.npz (deflated 2%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_00688.npz (deflated 2%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01270.npz (deflated 4%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_00390.npz (deflated 3%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_00292.npz (deflated 3%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01125.npz (deflated 3%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_00651.npz (deflated 2%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01378.npz (deflated 4%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01377.npz (deflated 4%)
  adding: content/brats21-dataset-preprocessed/BraTS2021_01661.npz (deflated 3%)
  adding: content/brats21-dataset-preprocessed/Br