In [None]:
# 1. Mount Google Drive (used to read the raw LIDC-IDRI dataset stored on Drive)
from google.colab import drive
drive.mount('/content/drive')

# 2. Clone the project code from GitHub
# NOTE: make sure preprocess/proc_lidc_idri.py has already been pushed to this repository
!git clone https://github.com/tanglehunter00/AR-SSL4M-DEMO.git

# 3. Set the project path and switch the working directory
project_root = '/content/AR-SSL4M-DEMO' 
import os
if os.path.exists(project_root):
    %cd {project_root}
    print(f"已进入项目根目录: {os.getcwd()}")
else:
    print("项目克隆失败，请检查仓库地址或网络连接。")

# 4. Install the extra dependencies needed for preprocessing
!pip install pydicom SimpleITK monai

# 5. Define the dataset path on Drive
# Expected layout: My Drive > dataset > LIDC-IDRI > LIDC-IDRI
dataset_path = "/content/drive/MyDrive/dataset/LIDC-IDRI/LIDC-IDRI"

# 6. Run the preprocessing script
# Presumably the script writes its results to pretrain/data/patch_random_lidc
# inside the project; the output location is decided by the script itself, so
# verify it there.
# NOTE(review): this command runs even when the clone check above failed, in
# which case the relative script path will not resolve.
!python preprocess/proc_lidc_idri.py "{dataset_path}"

In [None]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Install the required dependencies
!pip install monai nibabel

import os
import random
import time
import shutil
import tarfile
import numpy as np
from monai.transforms import Compose, Resize, ScaleIntensityRangePercentiles

# --- 核心处理逻辑 ---

def load_nii_data(filename):
    """Read a NIfTI file and return ``(voxel_data, affine)``.

    The voxel data comes from ``get_fdata()`` and the affine from the loaded
    image's ``affine`` attribute.
    """
    # Imported lazily so the module can be loaded before nibabel is installed.
    import nibabel as nib

    nii = nib.load(filename)
    voxels = nii.get_fdata()
    return voxels, nii.affine

def cut_patch(image, patch_size):
    """Cut one randomly positioned patch out of a 3-D volume.

    Args:
        image: 3-D array; its axes are treated as (z, y, x).
        patch_size: ``(pz, py, px)`` extent of the patch along each axis.

    Returns:
        ``(patch, (z1, z2, y1, y2, x1, x2))`` where ``patch`` is
        ``image[z1:z2, y1:y2, x1:x2]`` and the second element holds the
        slice bounds, so the same region can be cut from other volumes.

    Raises:
        ValueError: if the patch is larger than the image along any axis.
    """
    z, y, x = image.shape
    pz, py, px = patch_size

    # Fail early with an explicit message instead of the opaque
    # "empty range" error random.randint raises for a negative upper bound.
    if pz > z or py > y or px > x:
        raise ValueError(
            f"patch size {tuple(patch_size)} exceeds image shape {tuple(image.shape)}"
        )

    # Offsets are drawn in z, y, x order; randint is inclusive on both ends,
    # so the patch may touch the far border of the volume.
    z1 = random.randint(0, z - pz)
    y1 = random.randint(0, y - py)
    x1 = random.randint(0, x - px)
    return image[z1:z1+pz, y1:y1+py, x1:x1+px], (z1, z1+pz, y1, y1+py, x1, x1+px)

def load_and_patch_transforms_series(img, tar_img_size):
    """Resize a patch and rescale its intensities into [0, 1].

    The input is resized (trilinear) to the first three entries of
    ``tar_img_size``; then the 1st..99.9th intensity percentiles are mapped
    to 0..1 with clipping. Callers in this file pass an array with a leading
    channel axis.
    """
    target_shape = (tar_img_size[0], tar_img_size[1], tar_img_size[2])
    resize_step = Resize(spatial_size=target_shape, mode='trilinear')
    rescale_step = ScaleIntensityRangePercentiles(
        lower=1.,
        upper=99.9,
        b_min=0.0,
        b_max=1.0,
        clip=True,
        relative=False,
        channel_wise=False,
    )
    pipeline = Compose([resize_step, rescale_step])
    return pipeline(img)

def process_one_case(image_file, patch_size_list, patch_num, save_root, tar_img_size):
    """Cut random multi-modal patches from one case and archive them.

    For the case identified by its T1n volume, the sibling t1c/t2w/t2f
    volumes are located by substituting the modality tag in the file name.
    ``patch_num`` random regions are cut at identical coordinates from all
    four modalities, transformed with ``load_and_patch_transforms_series``
    and saved as ``{ds_name}_{nii_id}_{i}.{modality}.npy`` in a local scratch
    directory. The directory is then packed into
    ``{save_root}/{ds_name}_{nii_id}.tar.gz``.

    Args:
        image_file: path of the ``...-t1n.nii`` / ``...-t1n.nii.gz`` volume.
        patch_size_list: candidate ``(z, y, x)`` patch sizes; one is picked
            at random for every patch.
        patch_num: number of patches to cut (each yields four .npy files).
        save_root: directory receiving the finished archive.
        tar_img_size: target size handed to the resize transform.

    Returns:
        True if the archive already existed or was created, False if
        processing failed (the error is printed, not raised).
    """
    modalities = ("t1n", "t1c", "t2w", "t2f")

    # Detect the extension so both .nii and .nii.gz inputs are handled.
    ext = ".nii.gz" if image_file.endswith(".nii.gz") else ".nii"

    # Derive the dataset name and case ID used for naming.
    path_parts = image_file.split('/')
    ds_name = path_parts[-3] if len(path_parts) > 3 else "BraTS"
    nii_id = os.path.basename(image_file).replace(f'-t1n{ext}', '')

    # Final archive path, plus a sibling temporary name: the archive is
    # written under the temporary name and renamed only when complete, so an
    # interrupted run can never leave a truncated file that the resume check
    # below would mistake for a finished case.
    tar_file_path = os.path.join(save_root, f"{ds_name}_{nii_id}.tar.gz")
    part_path = tar_file_path + ".part"

    # Resume support: skip cases whose archive is already present.
    if os.path.exists(tar_file_path):
        print(f"⏩ 跳过已存在的病例: {nii_id}")
        return True

    # Local scratch directory (Colab disk, not Drive); start from a clean one.
    local_tmp = "/content/temp_case"
    if os.path.exists(local_tmp):
        shutil.rmtree(local_tmp)
    os.makedirs(local_tmp)

    try:
        # Load the four modalities and reorder the axes to (z, y, x).
        volumes = {}
        for modality in modalities:
            data, _ = load_nii_data(image_file.replace(f't1n{ext}', f'{modality}{ext}'))
            volumes[modality] = data.transpose((2, 1, 0))

        # 1. Write patch_num x 4 .npy files into the scratch directory.
        for i in range(patch_num):
            patch_size = random.choice(patch_size_list)
            # Only the coordinates are needed; every modality is cut at the
            # same location so the patches stay spatially aligned.
            _, (z1, z2, y1, y2, x1, x2) = cut_patch(volumes["t1n"], patch_size)

            base_name = f"{ds_name}_{nii_id}_{i}"
            for modality, volume in volumes.items():
                # Transpose back to the on-disk axis order before resizing and
                # add the leading channel axis expected by the transforms.
                crop = volume[z1:z2, y1:y2, x1:x2].transpose((2, 1, 0))
                patch = load_and_patch_transforms_series(
                    np.expand_dims(crop, 0), tar_img_size
                ).numpy()[0]
                np.save(os.path.join(local_tmp, f"{base_name}.{modality}.npy"), patch)

        # 2. Pack the scratch directory; the folder inside the archive is
        #    named after the case ID.
        with tarfile.open(part_path, "w:gz") as tar:
            tar.add(local_tmp, arcname=nii_id)
        os.replace(part_path, tar_file_path)
        return True

    except Exception as e:
        # Best-effort batch processing: report the failure and let the
        # caller move on to the next case.
        print(f"❌ Error processing {image_file}: {e}")
        if os.path.exists(part_path):
            os.remove(part_path)
        return False

    finally:
        # 3. Always remove the local scratch files.
        shutil.rmtree(local_tmp, ignore_errors=True)

# --- Colab driver logic ---

# 1. Input root directory that is scanned for cases.
INPUT_ROOT = "/content/drive/MyDrive/dataset/pretrain/BraTS23_Data/Data/BraTS-GLI/ASNR-MICCAI-BraTS2023-GLI-Challenge-TrainingData/A-GLI-Part-03"

# 2. Output directory (a dedicated folder for the tar.gz archives is recommended).
SAVE_ROOT = "/content/drive/MyDrive/dataset/pretrain/BraTS23_Data/tar_data"
if not os.path.exists(SAVE_ROOT):
    os.makedirs(SAVE_ROOT)

# 3. Parameters.
patch_num = 100
tar_img_size = [128, 128, 32]
patch_size_list = [(32, 128, 128)]

# 4. Collect every T1n volume below the input root, in sorted order.
print(f"正在扫描目录: {INPUT_ROOT} ...")
all_t1n_files = sorted(
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(INPUT_ROOT)
    for name in names
    if name.endswith(("t1n.nii.gz", "t1n.nii"))
)
print(f"找到 {len(all_t1n_files)} 个病例。")

# 5. Process the cases one after another.
start_time = time.time()
success_count = 0
total_cases = len(all_t1n_files)

for i, file_path in enumerate(all_t1n_files):
    case_no = i + 1
    print(f"[{case_no}/{total_cases}] 正在处理并打包: {os.path.basename(file_path)}")
    case_ok = process_one_case(file_path, patch_size_list, patch_num, SAVE_ROOT, tar_img_size)
    if case_ok:
        success_count += 1

    # Flush filesystem buffers after every fifth case as a safety measure.
    if case_no % 5 == 0:
        os.sync()

end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print(f"\n✨ 处理完成！")
print(f"成功打包: {success_count} 个病例")
print(f"总耗时: {elapsed_minutes:.2f} 分钟")
print(f"所有压缩包已存至: {SAVE_ROOT}")

In [None]:
# 1. Install the required libraries (MONAI is central to the original code's processing)
!pip install monai nibabel

import os
import csv
import numpy as np
import nibabel as nib
from glob import glob
from google.colab import drive
from monai.transforms import Compose, Resize, ScaleIntensityRangePercentiles

# 2. Mount Google Drive
drive.mount('/content/drive')

# 3. Define the paths (adjust to your own Drive layout)
# NOTE: SAVE_ROOT is rebound here; if the BraTS cell ran earlier in the same
# session, its SAVE_ROOT value is replaced by this one.
BASE_PATH = '/content/drive/MyDrive/dataset/pretrain/DeepLesion/data'
DATA_PATH = os.path.join(BASE_PATH, 'Images_nifti')
ANNO_PATH = os.path.join(BASE_PATH, 'DL_info.csv')
SAVE_ROOT = os.path.join(BASE_PATH, 'npy')

# Create the output directory
if not os.path.exists(SAVE_ROOT):
    os.makedirs(SAVE_ROOT)
    print(f"创建文件夹: {SAVE_ROOT}")

# 4. 核心处理函数 (完全遵循原项目逻辑)
def readCSV(filename):
    """Read a CSV file and return all of its rows.

    Args:
        filename: path of the CSV file.

    Returns:
        A list with one entry per row (header included); each entry is the
        list of that row's field strings.
    """
    # newline="" hands line-ending handling to the csv module, which is
    # required for line breaks inside quoted fields to be preserved.
    with open(filename, "r", newline="") as f:
        return list(csv.reader(f))

def load_and_patch_transforms(img, tar_img_size):
    """Resize a patch to ``tar_img_size`` and rescale intensities to [0, 1].

    Applies a trilinear resize followed by percentile-based scaling: the
    1st..99.9th percentiles are mapped to 0..1 with clipping.
    """
    resize_step = Resize(spatial_size=tar_img_size, mode='trilinear')
    rescale_step = ScaleIntensityRangePercentiles(
        lower=1.,
        upper=99.9,
        b_min=0.0,
        b_max=1.0,
        clip=True,
        relative=False,
        channel_wise=False,
    )
    pipeline = Compose([resize_step, rescale_step])
    return pipeline(img)

def process_deeplesion():
    """Crop, resize and normalise one patch per labelled annotation row.

    Reads the annotation table at ``ANNO_PATH``. For every data row whose
    coarse label (column 9) is not ``'-1'`` it looks up the matching NIfTI
    volume under ``DATA_PATH``, crops a window of at most 128x128 around the
    bounding-box centre along the first two axes (the third axis is kept
    whole), resizes it to 128x128x32 with intensity normalisation and saves
    it under ``SAVE_ROOT`` as ``{image_name}_{row index}_{label}.npy``.

    A failure on a single row (malformed fields, unreadable volume,
    transform error) is printed and the run continues with the next row.
    Rows without a matching volume are skipped silently.
    """
    tar_img_size = [128, 128, 32]
    annos = readCSV(ANNO_PATH)

    print(f"开始处理，总标注行数: {len(annos)-1}")

    # Row 0 is the header; numbering data rows from 1 keeps the saved file
    # names aligned with the row positions in the CSV.
    for index, anno in enumerate(annos[1:], start=1):
        # Field parsing sits inside the try as well, so a malformed or blank
        # row is reported instead of aborting the whole run.
        try:
            # Rule 1: the row must carry a coarse label (anno[9] != '-1').
            if anno[9] == '-1':
                continue

            # The first three underscore-separated tokens identify the scan.
            image_name = '_'.join(anno[0].split('_')[:3])

            # Centre of the bounding box.
            bbox = [float(x.replace('[', '').replace(']', '').strip()) for x in anno[6].split(',')]
            x_center, y_center = int((bbox[0] + bbox[2]) // 2), int((bbox[1] + bbox[3]) // 2)

            # First and last slice numbers, used to match the file name.
            range_fields = anno[11].split(',')
            slice_range = [range_fields[0], range_fields[-1].split(' ')[-1].strip()]

            # Find the matching NIfTI file; sorting makes the choice
            # deterministic when several files match the pattern.
            search_pattern = os.path.join(DATA_PATH, f'{image_name}_*{slice_range[0]}-*{slice_range[1]}.nii.gz')
            match_files = sorted(glob(search_pattern))
            if not match_files:
                continue
            nii_name = match_files[0]

            # Load the volume.
            nii_img = nib.load(nii_name)
            nii_data = nii_img.get_fdata()
            dim0, dim1 = nii_data.shape[0], nii_data.shape[1]

            # Rule 2: crop a 128x128 window centred on the lesion, clamped to
            # the volume. As in the original logic, y_center indexes axis 0
            # and x_center indexes axis 1.
            image_patch = nii_data[
                max(0, y_center - 64): min(y_center + 64, dim0),
                max(0, x_center - 64): min(x_center + 64, dim1),
                :
            ]

            # Rule 3: resize to 128x128x32 and normalise the intensities.
            # A leading channel axis is added for the transforms and removed
            # again afterwards.
            image_patch_input = np.expand_dims(image_patch, 0)
            processed_patch = load_and_patch_transforms(image_patch_input, tar_img_size)
            final_data = processed_patch.numpy()[0, ...]

            # Rule 4: save as .npy, one file per lesion.
            # Naming: {image_name}_{row index}_{label}.npy
            save_name = os.path.join(SAVE_ROOT, f'{image_name}_{index}_{anno[9]}.npy')
            np.save(save_name, final_data)

            if index % 100 == 0:
                print(f"已处理 {index} 条记录...")

        except Exception as e:
            print(f"错误发生于索引 {index}: {str(e)}")

    print("处理完成！所有文件已保存至 Google Drive。")

# Run the DeepLesion preprocessing
process_deeplesion()