# exp002  
データセットの作成（プロト）  
コードはスクリプト化する。  
データセット名は`dataset001`とし、本リポジトリ`data`直下に配置する。  
[Notion](https://www.notion.so/exp002-bb2a83196eba4feea0ae273917540f54?pvs=4)

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydicom
import nibabel as nib
import warnings
from tqdm import tqdm

%matplotlib inline
warnings.filterwarnings("ignore")

# Move to the repository top so that the repo-relative paths used below resolve.
# Walk upwards from the current working directory until a directory named
# 'rsna-2023' is reached.
project_root_name = 'rsna-2023'
while os.path.basename(os.getcwd()) != project_root_name:
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    # At a filesystem root the parent of a directory is the directory itself.
    # This holds for '/' as well as for Windows drive roots, where comparing
    # against the literal '/' would never match and the loop would spin forever.
    if parent_dir == current_dir:
        raise Exception('Could not find project root directory.')
    os.chdir(parent_dir)
    
from src.data_io import load_dicom_series

In [2]:
# Input (competition data) and output (converted dataset) roots, given relative
# to the current working directory.
base_dir = "data/rsna-2023-abdominal-trauma-detection"
dataset_dir = "data/dataset001"

# Competition CSV tables, all read from base_dir.
df_train = pd.read_csv(f"{base_dir}/train.csv")
df_train_image_level = pd.read_csv(f"{base_dir}/image_level_labels.csv")
# NOTE: "serirs" is a typo for "series"; the name is kept because other cells
# reference it.
df_train_serirs_meta = pd.read_csv(f"{base_dir}/train_series_meta.csv")

In [3]:
# Number of rows in the series metadata table.
df_train_serirs_meta.shape[0]

4711

# 全dicom画像をHU値で.npy形式(int16)に変換・保存  

In [4]:
# Convert every series listed in the series metadata table into one .npy file
# per slice, mirroring the train_images/<patient_id>/<series_id>/ layout under
# dataset_dir.
# NOTE(review): the HU conversion and int16 dtype are presumably handled inside
# load_dicom_series; nothing in this loop casts or rescales - confirm.
# NOTE: base_dir must point at the competition root here; a later cell rebinds
# it to the segmentations directory.
series_keys = zip(df_train_serirs_meta["patient_id"], df_train_serirs_meta["series_id"])
# zip() has no length, so pass the total explicitly; without it tqdm can only
# print an iteration counter instead of a progress bar with an ETA.
for pid, sid in tqdm(series_keys, total=len(df_train_serirs_meta)):
    series_path = os.path.join(base_dir, "train_images", str(pid), str(sid))
    # The third return value (per-slice metadata) is not needed for the export.
    image_arr, path_list, _ = load_dicom_series(series_path)
    output_dir = os.path.join(dataset_dir, "train_images", str(pid), str(sid))
    os.makedirs(output_dir, exist_ok=True)
    # image_arr[i] is saved under the name derived from path_list[i].
    for slice_idx, dcm_name in enumerate(path_list):
        npy_name = dcm_name.replace(".dcm", ".npy")
        np.save(os.path.join(output_dir, npy_name), image_arr[slice_idx])

4711it [16:02:19, 12.26s/it]


# セグメンテーション画像をマルチラベル.npz(uint8)に変換・保存  
スライスごとに保存（ラベルが1つも存在しないスライスは保存しない）。配列は(C, H, W)のchannel firstで、Cは`labels_dict`のラベル数。

In [None]:
# Segmentation label id -> organ name. Ids are consecutive and start at 1
# (0 is presumably background - confirm against the segmentation files).
labels_dict = dict(
    enumerate(
        ("liver", "spleen", "kidney_left", "kidney_right", "bowel"),
        start=1,
    )
)

def match_orientation(img: np.ndarray) -> np.ndarray:
    """Reorient a NIfTI volume so that it lines up with the DICOM orientation.

    For an input of shape (A, B, C) the result has shape (C, B, A) and
    ``out[c, b, a] == img[a, B - 1 - b, C - 1 - c]``: the axis order is
    reversed, and the last two input axes are flipped.

    Args:
        img (numpy.ndarray): 3-D NIfTI image that was only loaded and
            converted to a numpy array.
    Returns:
        numpy.ndarray: The reoriented volume.
    Reference:
        https://www.kaggle.com/code/parhammostame/construct-3d-arrays-from-dcm-nii-3-view-angles
    """
    # Single-expression form of the reference recipe
    # (transpose -> rot90 over the last two axes -> flip axis 0 -> transpose).
    flipped = img[:, ::-1, ::-1]
    return flipped.transpose(2, 1, 0)

def get_any_labels_info(seg_arr: np.ndarray, labels: "dict | None" = None) -> dict:
    """Report which labels are present in a segmentation array.

    Works on an array of any shape, so it can be applied to a whole series
    as well as to a single slice.

    Args:
        seg_arr (numpy.ndarray): Label map whose values are label ids.
        labels (dict | None): Mapping of label id -> label name. Defaults to
            the module-level ``labels_dict`` when omitted.
    Returns:
        dict: One ``label_name -> 0/1`` entry per label, in the iteration
        order of ``labels``, followed by an ``"any"`` entry that is 1 when
        at least one of those labels is present and 0 otherwise.
    """
    if labels is None:
        labels = labels_dict
    labels_info = {
        label_name: int((seg_arr == label_id).any())
        for label_id, label_name in labels.items()
    }
    # Computed before the "any" key is inserted, so it only reflects the labels.
    labels_info["any"] = int(any(labels_info.values()))
    return labels_info

def make_multilabel_mask(seg_arr: np.ndarray, labels: "dict | None" = None) -> np.ndarray:
    """Expand a label map into a stack of binary masks, one channel per label.

    Args:
        seg_arr (numpy.ndarray): Label map whose values are label ids.
        labels (dict | None): Mapping of label id -> label name. Defaults to
            the module-level ``labels_dict`` when omitted. Only the ids and
            their iteration order are used.
    Returns:
        numpy.ndarray: uint8 array of shape ``(len(labels),) + seg_arr.shape``
        where channel ``i`` is 1 wherever ``seg_arr`` equals the ``i``-th
        label id and 0 elsewhere.
    """
    if labels is None:
        labels = labels_dict
    mask = np.zeros((len(labels), *seg_arr.shape), dtype=np.uint8)
    for channel, label_id in enumerate(labels):
        # The boolean comparison is cast to 0/1 on assignment into the uint8 mask.
        mask[channel] = seg_arr == label_id
    return mask

# Label-presence records, one dict per series and one per slice. They are plain
# lists here and are turned into DataFrames in a later cell.
df_seg_info_series = list()
df_seg_info_image = list()
# NOTE: this rebinds base_dir (previously the competition root) to the
# segmentations directory; reset it before re-running the DICOM conversion cell.
base_dir = "data/rsna-2023-abdominal-trauma-detection/segmentations/"
# Sorted so that the row order of the resulting tables does not depend on the
# filesystem's directory listing order.
for nii_name in sorted(os.listdir(base_dir)):
    # Skip stray entries (hidden files, checkpoints, ...) whose name would not
    # parse as "<series_id>.nii".
    if not nii_name.endswith(".nii"):
        continue
    sid = int(nii_name.replace(".nii", ""))
    pid = df_train_serirs_meta.loc[df_train_serirs_meta["series_id"] == sid, "patient_id"].values[0]
    print(sid)
    # Load via the actual file name rather than a name rebuilt from the parsed int.
    nii_img = nib.load(os.path.join(base_dir, nii_name))
    seg_arr = np.array(nii_img.get_fdata(), dtype=np.uint8)
    # After reorientation axis 0 is used as the slice axis.
    seg_arr = match_orientation(seg_arr)
    ct_images_dir = f"data/rsna-2023-abdominal-trauma-detection/train_images/{pid}/{sid}"
    # DICOM files are named "<image_id>.dcm"; order them numerically, not lexically.
    image_ids = sorted(int(name.replace(".dcm", "")) for name in os.listdir(ct_images_dir))
    if len(image_ids) != seg_arr.shape[0]:
        raise ValueError(
            f"series {sid}: {len(image_ids)} DICOM files but "
            f"{seg_arr.shape[0]} segmentation slices"
        )
    # NOTE(review): slice idx of the mask is paired with the idx-th smallest
    # DICOM file number. This assumes ascending file numbers follow the same
    # slice order as the reoriented mask - confirm for every series.
    seg_out_dir = f"data/dataset001/segmentations/{sid}"
    for idx, image_id in enumerate(image_ids):
        img = seg_arr[idx]
        image_level_dict = get_any_labels_info(img)
        image_level_dict.update(
            patient_id=pid,
            series_id=sid,
            image_id=image_id,
            image_index=idx,
        )
        df_seg_info_image.append(image_level_dict)
        # Only slices with at least one nonzero voxel are written to disk; the
        # output directory is created lazily so empty series leave no folder.
        if img.any():
            img_multilabel = make_multilabel_mask(img)
            os.makedirs(seg_out_dir, exist_ok=True)
            np.savez_compressed(f"{seg_out_dir}/{image_id}.npz", img_multilabel)

    series_level_dict = get_any_labels_info(seg_arr)
    series_level_dict.update(patient_id=pid, series_id=sid)
    df_seg_info_series.append(series_level_dict)

In [None]:
# Turn the collected record lists into tables and write them next to the
# converted dataset. The names are rebound from list to DataFrame.
# NOTE: to_csv is called with its default index=True, so each file starts with
# an unnamed index column.
df_seg_info_image = pd.DataFrame(data=df_seg_info_image)
df_seg_info_image.to_csv("data/dataset001/seg_info_image.csv")

df_seg_info_series = pd.DataFrame(data=df_seg_info_series)
df_seg_info_series.to_csv("data/dataset001/seg_info_series.csv")