In [19]:
import os
import shutil
import tarfile
from tqdm.notebook import tqdm

working_dir = "/kaggle/working/"

# Start every run from an empty working directory: plain files and symlinks
# are unlinked, real directories are removed recursively.
for item in os.listdir(working_dir):
    item_path = os.path.join(working_dir, item)
    if os.path.isfile(item_path) or os.path.islink(item_path):
        os.remove(item_path)
    elif os.path.isdir(item_path):
        shutil.rmtree(item_path)
print("Cleaned working directory.")

tar_path = "/kaggle/input/brats-2021-task1/BraTS2021_Training_Data.tar"
extract_path = "/kaggle/working/BraTS2021"

# Extraction filters reject unsafe archive members (absolute paths, "..",
# links escaping the destination). Only pass the argument when this Python
# build supports it, so older runtimes keep working unchanged.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

# NOTE(review): extract_path lives inside working_dir, which was just wiped
# above, so the "already extracted" branch is only reachable if the cleanup
# loop is removed or disabled.
if not os.path.exists(extract_path):
    os.makedirs(extract_path)
    print("Extracting tar file... (This takes 2-5 mins)")
    with tarfile.open(tar_path, "r") as tar:
        # Extract member by member so tqdm can show per-file progress.
        members = tar.getmembers()
        for member in tqdm(members, desc="Extracting files"):
            tar.extract(member, path=extract_path, **extract_kwargs)
    print("Extraction Complete.")
else:
    print("Dataset already extracted.")

Cleaned working directory.
Extracting tar file... (This takes 2-5 mins)


Extracting files:   0%|          | 0/7508 [00:00<?, ?it/s]

  tar.extract(member, path=extract_path)


Extraction Complete.


In [21]:
import nibabel as nib
import numpy as np
import cv2
import matplotlib.pyplot as plt


# --- Paths -------------------------------------------------------------------
DATA_DIR = "/kaggle/working/BraTS2021"
PROCESSED_DIR = "/kaggle/working/processed_unet"
IMAGES_DIR = os.path.join(PROCESSED_DIR, "images")
MASKS_DIR = os.path.join(PROCESSED_DIR, "masks")

# --- Tunables ----------------------------------------------------------------
PATIENT_LIMIT = 100                # maximum number of patient folders to process
SLICE_START, SLICE_STOP = 30, 120  # slice window along the third axis (stop exclusive)
TARGET_SIZE = (128, 128)           # output size handed to cv2.resize

os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(MASKS_DIR, exist_ok=True)

# Every sub-directory of DATA_DIR is treated as one patient.
patients = sorted(p for p in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, p)))
selected_patients = patients[:PATIENT_LIMIT]

# Report the number actually selected; it is smaller than PATIENT_LIMIT when
# the dataset holds fewer patients than the limit.
print(f"Processing {len(selected_patients)} patients out of {len(patients)}...")

slice_count = 0
skipped_patients = []

for patient in tqdm(selected_patients, desc="Processing Patients"):
    patient_path = os.path.join(DATA_DIR, patient)

    # We use FLAIR (best for whole tumor) and SEG (Ground Truth)
    flair_path = os.path.join(patient_path, f"{patient}_flair.nii.gz")
    seg_path = os.path.join(patient_path, f"{patient}_seg.nii.gz")

    if not (os.path.exists(flair_path) and os.path.exists(seg_path)):
        skipped_patients.append(patient)
        continue

    flair_vol = nib.load(flair_path).get_fdata()
    seg_vol = nib.load(seg_path).get_fdata()

    # Min-max scale the whole volume to [0, 255], then cast to 8-bit.
    flair_vol = cv2.normalize(flair_vol, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Clamp the slice window to the volume depth so a shallower scan cannot
    # raise IndexError.
    depth = min(flair_vol.shape[2], seg_vol.shape[2])
    for i in range(SLICE_START, min(SLICE_STOP, depth)):
        mask_slice = seg_vol[:, :, i]

        # Keep only slices that contain at least one labelled voxel.
        if not np.any(mask_slice > 0):
            continue

        img_slice = flair_vol[:, :, i]

        # Collapse every non-zero label into one foreground class (255 = white).
        binary_mask = np.where(mask_slice > 0, 255, 0).astype(np.uint8)

        img_resized = cv2.resize(img_slice, TARGET_SIZE)
        # Nearest-neighbour keeps the mask strictly binary after resizing.
        mask_resized = cv2.resize(binary_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # Image and mask share a filename so they can be paired later.
        filename = f"{patient}_slice_{i}.png"
        image_written = cv2.imwrite(os.path.join(IMAGES_DIR, filename), img_resized)
        mask_written = cv2.imwrite(os.path.join(MASKS_DIR, filename), mask_resized)
        if not (image_written and mask_written):
            # A half-written pair would silently desynchronise images and masks.
            raise OSError(f"Failed to write image/mask pair for {filename}")

        slice_count += 1

print(f"Preprocessing Done. Created {slice_count} slices.")
if skipped_patients:
    print(f"Skipped {len(skipped_patients)} patients with missing FLAIR/SEG files.")

Processing 100 patients out of 1251...


Processing Patients:   0%|          | 0/100 [00:00<?, ?it/s]

Preprocessing Done. Created 6068 slices.


In [22]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate, Dropout
from tensorflow.keras.models import Model

def build_unet(input_shape=(128, 128, 1)):
    """Build a three-level U-Net for single-channel binary segmentation.

    Args:
        input_shape: shape of one input sample, ``(height, width, channels)``.

    Returns:
        An uncompiled Keras ``Model`` whose output is a one-channel sigmoid
        map produced by a 1x1 convolution.
    """

    def double_conv(tensor, filters):
        # Two stacked 3x3 ReLU convolutions with 'same' padding.
        for _ in range(2):
            tensor = Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return tensor

    inputs = Input(input_shape)

    # ENCODER: conv block, remember it for the skip connection, then downsample.
    skip_tensors = []
    x = inputs
    for filters in (32, 64, 128):
        x = double_conv(x, filters)
        skip_tensors.append(x)
        x = MaxPooling2D((2, 2))(x)

    # BOTTLENECK
    x = double_conv(x, 256)

    # DECODER: upsample, concatenate with the matching encoder output
    # (deepest first), then conv block.
    for filters, skip in zip((128, 64, 32), reversed(skip_tensors)):
        x = UpSampling2D((2, 2))(x)
        x = Concatenate()([x, skip])  # Skip Connection
        x = double_conv(x, filters)

    # Output Layer
    outputs = Conv2D(1, (1, 1), activation='sigmoid')(x)

    return Model(inputs=[inputs], outputs=[outputs])

# Instantiate the network with its default 128x128 single-channel input and
# print the layer-by-layer summary.
model = build_unet(input_shape=(128, 128, 1))
model.summary()

In [23]:
from sklearn.model_selection import train_test_split

# 1. Load Data from disk into RAM
# Note: For massive datasets, use a DataGenerator. For 100 patients, RAM is usually fine.
images_list = sorted(f for f in os.listdir(IMAGES_DIR) if f.endswith(".png"))
masks_list = sorted(f for f in os.listdir(MASKS_DIR) if f.endswith(".png"))

# Images and masks are paired by identical filename; zipping two independently
# sorted listings would silently mispair or truncate if they ever differed.
if images_list != masks_list:
    raise RuntimeError(
        f"Image/mask filenames do not match: {len(images_list)} images vs {len(masks_list)} masks"
    )

X = []
y = []
patient_ids = []  # one entry per slice, used to split by patient below

print("Loading data into memory...")
for img_name, mask_name in zip(images_list, masks_list):
    # Read Image (Grayscale)
    img = cv2.imread(os.path.join(IMAGES_DIR, img_name), cv2.IMREAD_GRAYSCALE)
    mask = cv2.imread(os.path.join(MASKS_DIR, mask_name), cv2.IMREAD_GRAYSCALE)
    if img is None or mask is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image/mask pair: {img_name}")

    # Normalize 0-1 (float32 halves memory compared with NumPy's default float64)
    X.append(img.astype(np.float32) / 255.0)
    y.append(mask.astype(np.float32) / 255.0)

    # Filenames look like "<patient>_slice_<index>.png".
    patient_ids.append(img_name.rsplit("_slice_", 1)[0])

# Convert to Numpy Arrays and reshape for Keras (Height, Width, Channel)
X = np.array(X, dtype=np.float32).reshape(-1, 128, 128, 1)
y = np.array(y, dtype=np.float32).reshape(-1, 128, 128, 1)

# Split Train/Val BY PATIENT. Neighbouring slices of one scan are nearly
# identical, so a random slice-level split leaks validation data into training
# and inflates the validation score.
unique_patients = sorted(set(patient_ids))
train_patients, val_patients = train_test_split(unique_patients, test_size=0.2, random_state=42)
is_val = np.isin(patient_ids, val_patients)

X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]

# 2. Define Metrics (Dice Score)
def dice_coef(y_true, y_pred, smooth=1.0):
    """Soft Dice coefficient computed over every element of the batch.

    Args:
        y_true: ground-truth mask values.
        y_pred: predicted values, same shape as ``y_true``.
        smooth: constant added to numerator and denominator so the ratio is
            defined when both inputs sum to zero.

    Returns:
        Scalar tensor ``(2 * sum(y_true * y_pred) + smooth) /
        (sum(y_true) + sum(y_pred) + smooth)``.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes first: integer or float64 masks cannot be multiplied with
    # float32 predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    y_true_f = tf.reshape(y_true, [-1])
    y_pred_f = tf.reshape(y_pred, [-1])
    intersection = tf.reduce_sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)

# 3. Compile and Train
# Pixel-wise binary cross-entropy is the loss; Dice is tracked as a metric only.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[dice_coef],
)

print("Starting Training...")
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=15,    # Adjust based on time available
    verbose=1,
    validation_data=(X_val, y_val),
)

Loading data into memory...
Starting Training...
Epoch 1/15
[1m  8/152[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m15:07[0m 6s/step - dice_coef: 0.0393 - loss: 0.5948

KeyboardInterrupt: 