In [19]:
# Make a copick project
import os
import shutil

# copick project configuration, kept as a JSON string so that it can be written
# to the config file below and parsed again later in the notebook.
# - "pickable_objects": six particle entries (labels 1-6, each with a radius
#   and a map_threshold) plus two non-particle entries, "membrane" (label 8)
#   and "background" (label 9), which carry no radius. Label 7 is not used and
#   "virus-like-particle" has no pdb_id.
# - "overlay_root" / "static_root": locations of the overlay tree (under
#   ./kaggle/working) and the static training data (under ./kaggle/input).
config_blob = """{
    "name": "czii_cryoet_mlchallenge_2024",
    "description": "2024 CZII CryoET ML Challenge training data.",
    "version": "1.0.0",

    "pickable_objects": [
        {
            "name": "apo-ferritin",
            "is_particle": true,
            "pdb_id": "4V1W",
            "label": 1,
            "color": [  0, 117, 220, 128],
            "radius": 60,
            "map_threshold": 0.0418
        },
        {
          "name" : "beta-amylase",
            "is_particle": true,
            "pdb_id": "8ZRZ",
            "label": 2,
            "color": [255, 255, 255, 128],
            "radius": 90,
            "map_threshold": 0.0578  
        },
        {
            "name": "beta-galactosidase",
            "is_particle": true,
            "pdb_id": "6X1Q",
            "label": 3,
            "color": [ 76,   0,  92, 128],
            "radius": 90,
            "map_threshold": 0.0578
        },
        {
            "name": "ribosome",
            "is_particle": true,
            "pdb_id": "6EK0",
            "label": 4,
            "color": [  0,  92,  49, 128],
            "radius": 150,
            "map_threshold": 0.0374
        },
        {
            "name": "thyroglobulin",
            "is_particle": true,
            "pdb_id": "6SCJ",
            "label": 5,
            "color": [ 43, 206,  72, 128],
            "radius": 130,
            "map_threshold": 0.0278
        },
        {
            "name": "virus-like-particle",
            "is_particle": true,
            "label": 6,
            "color": [255, 204, 153, 128],
            "radius": 135,
            "map_threshold": 0.201
        },
        {
            "name": "membrane",
            "is_particle": false,
            "label": 8,
            "color": [100, 100, 100, 128]
        },
        {
            "name": "background",
            "is_particle": false,
            "label": 9,
            "color": [10, 150, 200, 128]
        }
    ],

    "overlay_root": "./kaggle/working/overlay",

    "overlay_fs_args": {
        "auto_mkdir": true
    },

    "static_root": "./kaggle/input/czii-cryo-et-object-identification/train/static"
}"""

copick_config_path = "./kaggle/working/copick.config"
output_overlay = "./kaggle/working/overlay"

# Make sure the working directory exists before writing into it; on a fresh
# checkout "./kaggle/working" may not have been created yet.
os.makedirs(os.path.dirname(copick_config_path), exist_ok=True)

with open(copick_config_path, "w") as f:
    f.write(config_blob)

# Update the overlay: mirror the read-only input overlay into the working
# overlay, prefixing every file name with "curation_0_".
source_dir = './kaggle/input/czii-cryo-et-object-identification/train/overlay'
# Reuse the path defined above so the overlay location is spelled out only once.
destination_dir = output_overlay

# Walk through the source directory
for root, dirs, files in os.walk(source_dir):
    # Create the corresponding subdirectory in the destination
    relative_path = os.path.relpath(root, source_dir)
    target_dir = os.path.join(destination_dir, relative_path)
    os.makedirs(target_dir, exist_ok=True)

    # Copy and rename each file
    for file in files:
        # Files that already carry the prefix keep their name, so the prefix
        # is never applied twice.
        if file.startswith("curation_0_"):
            new_filename = file
        else:
            new_filename = f"curation_0_{file}"

        # Define full paths for the source and destination files
        source_file = os.path.join(root, file)
        destination_file = os.path.join(target_dir, new_filename)

        # Copy the file (contents and metadata) under the new name
        shutil.copy2(source_file, destination_file)
        print(f"Copied {source_file} to {destination_file}")

Copied ./kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/ribosome.json to ./kaggle/working/overlay/ExperimentRuns/TS_5_4/Picks/curation_0_ribosome.json
Copied ./kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/virus-like-particle.json to ./kaggle/working/overlay/ExperimentRuns/TS_5_4/Picks/curation_0_virus-like-particle.json
Copied ./kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/beta-galactosidase.json to ./kaggle/working/overlay/ExperimentRuns/TS_5_4/Picks/curation_0_beta-galactosidase.json
Copied ./kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/beta-amylase.json to ./kaggle/working/overlay/ExperimentRuns/TS_5_4/Picks/curation_0_beta-amylase.json
Copied ./kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/apo-ferritin.json to ./kaggle/working/overlay/ExperimentRuns/TS_5_4/Picks/c

In [20]:
import copick
import numpy as np
from tqdm import tqdm

# Settings shared by the following cells
copick_user_name = "copickUtils"           # user id used for the generated segmentations
copick_segmentation_name = "paintedPicks"  # name used for the generated segmentations
voxel_size = 10

# Open the copick project from the config file written earlier
root = copick.from_file(copick_config_path)

################# 1. Choose Type of Data #################
tomo_tpye_list = ["ctfdeconvolved", "denoised", "isonetcorrected", "wbp"]
tomo_type = tomo_tpye_list[1]  # index 1 selects "denoised"
print(f'Processing "{tomo_type}" data')

Processing "denoised" data


  root = copick.from_file(copick_config_path)


In [21]:
from copick_utils.segmentation import segmentation_from_picks
from copick_utils.writers import write
from collections import defaultdict

# Just do this once
generate_masks = True

# Painted sphere radius as a fraction of the object's configured radius
mask_radius_scale = 0.8

if generate_masks:
    # Collect label and radius for particle objects only; non-particle objects
    # (membrane, background) have no radius and cannot be painted as spheres.
    target_objects = defaultdict(dict)
    for pickable_object in root.pickable_objects:
        if pickable_object.is_particle:
            target_objects[pickable_object.name]['label'] = pickable_object.label
            target_objects[pickable_object.name]['radius'] = pickable_object.radius

    for run in tqdm(root.runs):
        # The tomogram is loaded only to obtain the shape of the mask volume.
        tomo = run.get_voxel_spacing(voxel_size)
        tomo = tomo.get_tomogram(tomo_type).numpy()
        target = np.zeros(tomo.shape, dtype=np.uint8)

        # Iterate over the particle objects collected above rather than over
        # every pickable object: a pick for a non-particle object would have
        # no 'radius'/'label' entry and raise KeyError.
        for object_name, object_params in target_objects.items():
            pick = run.get_picks(object_name=object_name, user_id="curation")
            if len(pick):
                target = segmentation_from_picks.from_picks(pick[0],
                                                            target,
                                                            object_params['radius'] * mask_radius_scale,  # 3D mask size
                                                            object_params['label']  # label_value
                                                            )
        write.segmentation(run, target, copick_user_name, name=copick_segmentation_name)

# Load every run's tomogram together with the painted segmentation
data_dicts = []
for run in tqdm(root.runs):
    tomogram = run.get_voxel_spacing(voxel_size).get_tomogram(tomo_type).numpy()
    segmentation = run.get_segmentations(name=copick_segmentation_name, user_id=copick_user_name, voxel_size=voxel_size, is_multilabel=True)[0].numpy()
    data_dicts.append({"image": tomogram, "label": segmentation})
    


100%|██████████| 7/7 [00:02<00:00,  3.31it/s]
100%|██████████| 7/7 [00:01<00:00,  3.89it/s]


In [None]:
import numpy as np
from pathlib import Path
from skimage import measure
import matplotlib.pyplot as plt

def convert_polygon_to_yolo(label):
    """
    Convert the contours of every slice of a 3D segmentation mask to YOLO
    polygon label lines.

    Args:
        label: 3D array indexed as (slice, row, column); the value 0 is
            treated as background and skipped.

    Returns:
        dict mapping slice index -> list of strings, one per contour, of the
        form "<class> x1 y1 x2 y2 ...", where <class> is the mask value
        minus 1 and the coordinates are divided by the slice width/height.
        Slices that produce no contour are omitted.
    """
    depth, height, width = label.shape
    all_slice_labels = {}

    for slice_idx in range(depth):
        plane = label[slice_idx]

        # Every value present in this slice except the background (0)
        foreground_values = [value for value in np.unique(plane) if value != 0]

        yolo_labels = []
        for label_val in foreground_values:
            # Trace the outlines of the binary mask of the current value
            for contour in measure.find_contours(plane == label_val, 0.5):
                # Contour columns are (row, col); normalize to x / y fractions
                xs = contour[:, 1] / width
                ys = contour[:, 0] / height

                # Interleave as x1 y1 x2 y2 ... with six decimals each
                coords = [f"{value:.6f}" for point in zip(xs, ys) for value in point]
                yolo_labels.append(f"{int(label_val)-1} {' '.join(coords)}")

        # Keep only slices that actually contain labels
        if yolo_labels:
            all_slice_labels[slice_idx] = yolo_labels

    return all_slice_labels

# Create the dataset directories
train_label_dir = Path('./datasets/labels/train')
train_image_dir = Path('./datasets/images/train')
test_label_dir = Path('./datasets/labels/val')
test_image_dir = Path('./datasets/images/val')

for dir_path in [train_label_dir, train_image_dir, test_label_dir, test_image_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

# Process each volume
for vol_idx, data in enumerate(data_dicts):
    img = data["image"]
    labels_dict = convert_polygon_to_yolo(data["label"])

    # The last volume is held out and written to the "val" directories
    is_test = (vol_idx == len(data_dicts) - 1)

    # Choose the output directories
    label_dir = test_label_dir if is_test else train_label_dir
    image_dir = test_image_dir if is_test else train_image_dir

    # Write one label file and one image per slice that has labels
    for slice_idx, labels in labels_dict.items():
        # Build the shared file stem
        base_filename = f"image_{vol_idx:01d}_{slice_idx:03d}"

        # Save the labels
        with open(label_dir / f"{base_filename}.txt", "w") as f:
            f.write("\n".join(labels))

        # Min-max normalize the slice to 0..255 and save it. A constant slice
        # has zero range; dividing by it would produce NaN, so it is written
        # as an all-zero image instead.
        slice_img = img[slice_idx]
        slice_min = slice_img.min()
        value_range = slice_img.max() - slice_min
        if value_range > 0:
            norm_img = ((slice_img - slice_min) / value_range * 255).astype(np.uint8)
        else:
            norm_img = np.zeros(slice_img.shape, dtype=np.uint8)
        plt.imsave(image_dir / f"{base_filename}.png", norm_img, cmap='gray')

    # Report where this volume went
    dataset_type = "test" if is_test else "train"
    print(f"Saved volume {vol_idx} to {dataset_type} dataset")

Saved volume 0 to train dataset
Saved volume 1 to train dataset
Saved volume 2 to train dataset
Saved volume 3 to train dataset
Saved volume 4 to train dataset
Saved volume 5 to train dataset
Saved volume 6 to test dataset


In [23]:
import json
from pathlib import Path

# Parse the copick configuration JSON
config = json.loads(config_blob)
pickable_objects = config["pickable_objects"]

# Only particle objects (is_particle = true) become YOLO classes. Sort them by
# label so that the class list follows the class IDs.
particle_objects = sorted(
    (obj for obj in pickable_objects if obj.get("is_particle", False)),
    key=lambda obj: obj["label"],
)
classes = [obj["name"] for obj in particle_objects]

# Class ID = copick label - 1, the same mapping the label files use, so the
# names stay aligned with the labels even if the config order changes.
class_lines = "\n".join(f"  {obj['label'] - 1}: {obj['name']}" for obj in particle_objects)

# Build the yaml file content
yaml_content = f"""
path: ./  # dataset root dir
train: images/train  # train images
val: images/val  # val images
test:  # test images (optional)

# Classes
names:
{class_lines}

# Download script/URL (optional)
download: False
"""

# Save the yaml file
dataset_dir = Path('./')
dataset_dir.mkdir(parents=True, exist_ok=True)
yaml_path = dataset_dir / 'data.yaml'
with open(yaml_path, 'w') as f:
    f.write(yaml_content)

# Report the path that was actually written
print(f"Created {yaml_path} with classes:", classes)

Created dataset/data.yaml with classes: ['apo-ferritin', 'beta-amylase', 'beta-galactosidase', 'ribosome', 'thyroglobulin', 'virus-like-particle']


In [None]:
import matplotlib.pyplot as plt
from skimage import measure
import numpy as np

def visualize_polygons(image, label, slice_idx=100):
    """
    Show one slice of a volume as three panels: the normalized tomogram, the
    segmentation mask, and the tomogram with the mask contours drawn on top.

    Args:
        image: 3D array; image[slice_idx] is the 2D slice to display.
        label: 3D array of label values indexed like `image`; 0 is treated as
            background when drawing contours.
        slice_idx: index of the slice to display (default 100).
    """
    image_slice = image[slice_idx]
    label_slice = label[slice_idx]

    # Min-max normalize the image; a constant slice has zero range, so show
    # it as all zeros instead of dividing by zero.
    image_min = image_slice.min()
    intensity_range = image_slice.max() - image_min
    if intensity_range > 0:
        img_norm = (image_slice - image_min) / intensity_range
    else:
        img_norm = np.zeros(image_slice.shape, dtype=float)

    # Visualization
    plt.figure(figsize=(15, 5))

    # Original image
    plt.subplot(1, 3, 1)
    plt.title('Normalized Tomogram')
    plt.imshow(img_norm, cmap='gray')
    plt.axis('off')

    # Segmentation mask, scaled by its largest label value. A slice that is
    # all background has max 0, so skip the division in that case.
    plt.subplot(1, 3, 2)
    plt.title('Segmentation Mask')
    max_label = label_slice.max()
    if max_label > 0:
        label_norm = label_slice.astype(float) / max_label
    else:
        label_norm = np.zeros(label_slice.shape, dtype=float)
    mask = plt.imshow(label_norm, cmap='viridis')
    plt.colorbar(mask, label='Normalized Label Values')
    plt.axis('off')

    # Polygon overlay
    plt.subplot(1, 3, 3)
    plt.title('Polygon Overlay')
    plt.imshow(img_norm, cmap='gray')

    # One colour per unique value in the slice; zip() stops at the shorter
    # sequence, so the surplus colour is unused when background is present.
    unique_labels = np.unique(label_slice)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))

    for label_val, color in zip(unique_labels[unique_labels > 0], colors):
        # Binary mask of the current label
        binary_mask = (label_slice == label_val)

        # Find the contours of that mask
        contours = measure.find_contours(binary_mask, 0.5)

        # Draw each contour (contour columns are row, col -> plot as x=col, y=row)
        for contour in contours:
            plt.plot(contour[:, 1], contour[:, 0], color=color, linewidth=2,
                    label=f'Label {label_val}')

    # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.axis('off')

    plt.tight_layout()
    plt.show()

# Visualize every sample in the dataset
for sample_idx, sample in enumerate(data_dicts):
    sample_label = sample['label']
    print(f"\nSample {sample_idx}")
    print(f"Unique labels: {np.unique(sample_label)}")
    visualize_polygons(sample['image'], sample_label)
