<a href="https://colab.research.google.com/github/zhangzhejia2002-lgtm/Delta-mmFormer/blob/main/01_non_contrast_raw_data_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset paths configured below
# live under /content/drive/MyDrive. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
Create a split training dataset (train_splited) from the original LiQA training data.

Patients are grouped by fibrosis stage:
- class0: S1, S2, S3
- class1: S4

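Only the selected non-contrast MRI sequences (T1WI, T2WI, DWI_800) are copied,
i.e. files whose names contain T1.nii.gz, T2.nii.gz or DWI_800.nii.gz.
The original files are not modified; the copies are written to a new
train_splited subfolder inside the source root.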
"""

import os
import shutil
import glob

# Configuration
# Root of the original dataset on the mounted drive; its sub-directories are
# scanned as vendor folders, each holding one folder per patient.
SOURCE_ROOT = '/content/drive/MyDrive/LiQA_training_data'

# Target modalities for the Non-Contrast track.
# Written in upper case because file names are upper-cased before the
# substring comparison in process_train_split.
TARGET_SEQUENCES = ['T1.NII.GZ', 'T2.NII.GZ', 'DWI_800.NII.GZ']

# Mapping from fibrosis stage to output subfolder.
# Keys are the integer stages returned by get_stage; values are paths
# relative to the split root (binary class folder / stage folder).
STAGE_MAP = {
    1: 'class0/S1',
    2: 'class0/S2',
    3: 'class0/S3',
    4: 'class1/S4'
}

# The split is written inside SOURCE_ROOT itself, which is why
# process_train_split has to skip the 'train_splited' folder while scanning.
TRAIN_SPLIT_ROOT = os.path.join(SOURCE_ROOT, 'train_splited')

# Utility functions
def get_stage(folder_name):
    """Return the fibrosis stage encoded in a patient folder name.

    The stage is taken from the folder-name suffix: a name ending in
    ``S1`` .. ``S4`` yields the integer 1 .. 4. The comparison is
    case-sensitive. Returns ``None`` when no such suffix is present.
    """
    suffix_to_stage = {f'S{number}': number for number in (1, 2, 3, 4)}
    matches = (
        stage
        for suffix, stage in suffix_to_stage.items()
        if folder_name.endswith(suffix)
    )
    # All suffixes have the same length, so at most one can match.
    return next(matches, None)

def create_dirs_for_train_split(root):
    """Create the class/stage folder skeleton of the split under *root*.

    The layout is ``class0/S1``, ``class0/S2``, ``class0/S3`` and
    ``class1/S4``. Folders that already exist are left untouched.

    Returns:
        The list of the four directory paths, in the order given above.
    """
    layout = (
        ('class0', 'S1'),
        ('class0', 'S2'),
        ('class0', 'S3'),
        ('class1', 'S4'),
    )
    stage_dirs = [os.path.join(root, cls, stage) for cls, stage in layout]
    for stage_dir in stage_dirs:
        os.makedirs(stage_dir, exist_ok=True)
    return stage_dirs

def _is_complete_copy(src_path, dst_path):
    """Return True if *dst_path* is a regular file with the same size as *src_path*."""
    return (os.path.isfile(dst_path)
            and os.path.getsize(dst_path) == os.path.getsize(src_path))


def process_train_split(source_root, output_root):
    """Copy the selected MRI sequences of every staged patient into the split tree.

    Scans ``source_root/<vendor>/<patient>/`` and, for each patient folder
    whose name carries a stage suffix (see ``get_stage``), copies the
    ``*.nii.gz`` files whose upper-cased name contains one of
    ``TARGET_SEQUENCES`` into
    ``output_root/<STAGE_MAP[stage]>/<patient>/``.

    The function is safe to re-run: files are checked individually, and a
    destination file is re-copied when it is missing or its size differs
    from the source (e.g. after an interrupted run). Source files are
    never modified.

    Args:
        source_root: Directory containing one sub-directory per vendor.
        output_root: Root of the split tree to populate.

    Returns:
        dict mapping ``'S1'`` .. ``'S4'`` to the number of selected files
        present in the split for that stage after this run (files copied
        now plus files already in place from an earlier run).
    """
    stats = {'S1': 0, 'S2': 0, 'S3': 0, 'S4': 0}
    output_abs = os.path.abspath(output_root)
    # target directory -> source patient folder that populated it in this run
    claimed = {}

    # Sorted traversal keeps the result independent of os.listdir order.
    for vendor in sorted(os.listdir(source_root)):
        vendor_path = os.path.join(source_root, vendor)

        # Skip non-directories, metadata files, and the output folder itself
        # (by its conventional name and by its actual location).
        if (not os.path.isdir(vendor_path)
                or vendor.endswith(('.csv', '.md'))
                or vendor == 'train_splited'
                or os.path.abspath(vendor_path) == output_abs):
            continue

        for patient_folder in sorted(os.listdir(vendor_path)):
            patient_path = os.path.join(vendor_path, patient_folder)
            if not os.path.isdir(patient_path):
                continue

            stage = get_stage(patient_folder)
            if stage is None:
                continue

            target_patient_dir = os.path.join(
                output_root, STAGE_MAP[stage], patient_folder)

            # Two vendors using the same patient folder name would be merged
            # into one target folder; keep the first one and report the rest.
            if target_patient_dir in claimed:
                print(f"Warning: skipping {patient_path}: target folder already "
                      f"used by {claimed[target_patient_dir]}")
                continue
            claimed[target_patient_dir] = patient_path

            os.makedirs(target_patient_dir, exist_ok=True)
            stage_name = f'S{stage}'

            # Escape the directory part so characters such as '[' in a
            # folder name are not interpreted as glob patterns.
            pattern = os.path.join(glob.escape(patient_path), '*.nii.gz')
            selected = 0
            for nii_path in sorted(glob.glob(pattern)):
                fname = os.path.basename(nii_path)
                if not any(seq in fname.upper() for seq in TARGET_SEQUENCES):
                    continue

                dst_path = os.path.join(target_patient_dir, fname)
                # File-level resume: re-copy anything missing or truncated.
                if not _is_complete_copy(nii_path, dst_path):
                    shutil.copy(nii_path, dst_path)
                selected += 1

            stats[stage_name] += selected
            if selected == 0:
                print(f"Warning: no target sequences found in {patient_path}")

    print("Split summary:", stats)
    return stats

# Main execution
# Build the split only when this code runs as the main program (as in a
# notebook cell or a script), not when it is imported as a module.
if __name__ == '__main__':
    print(f"Creating split dataset in: {TRAIN_SPLIT_ROOT}")
    # Create the class/stage skeleton first, then copy the patient files into it.
    create_dirs_for_train_split(TRAIN_SPLIT_ROOT)
    process_train_split(SOURCE_ROOT, TRAIN_SPLIT_ROOT)