In [1]:
import os
import json
import shutil

In [2]:
# Notebook-wide configuration.

# Root directory of the raw dataset and of the preprocessed copy.
root_dir = "dataset/2DBB"
preprocessed_root_dir = "dataset/Preprocessed_2DBB"

# Dataset splits; each one is a sub-directory of the roots above.
sub_dirs = ["training", "validation", "test"]

# Counters used by the bounding-box sanity check.
data_num = 0
data_num_count = 0

# Class names in index order; this list is what gets written to data.yaml.
yaml_class = [
    "car",
    "bus",
    "truck",
    "special vehicle",
    "motorcycle",
    "bicycle",
    "personal mobility",
    "person",
    "Traffic_light",
    "Traffic_sign",
]

# Class name -> integer id, where the id is the name's position in yaml_class.
class_mapping = {name: index for index, name in enumerate(yaml_class)}

In [3]:
# Walk every JSON label file, read its 'Annotation' list, and check that each
# entry's 'data' field holds exactly four coordinates.

# Reset the counters in this cell so that re-running it does not add on top of
# totals left in the kernel by a previous run.
data_num = 0
data_num_count = 0

for sub_dir in sub_dirs:
    label_dir = os.path.join(root_dir, sub_dir, 'labels')

    for filename in os.listdir(label_dir):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(label_dir, filename)
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding, which differs between operating systems.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if 'Annotation' not in data:
            print(f"File: {file_path} does not contain 'Annotation' key.\n")
            continue

        for obj in data['Annotation']:
            data_num += 1
            # A missing 'data' key is reported as a malformed entry rather than
            # aborting the whole scan with a KeyError.
            coords = obj.get('data')
            if coords is None or len(coords) != 4:
                print(f"File: {file_path}")
                print(f"data: {coords}\n")
            else:
                data_num_count += 1

print(f"bbox 데이터 {data_num} 중 {data_num_count}개 정상(좌표 4개로 이루어짐)")
if data_num == data_num_count:
    print("bbox 데이터에 문제 없음")
else:
    print("bbox 데이터에 문제 있음")

bbox 데이터 532152 중 532152개 정상(좌표 4개로 이루어짐)
bbox 데이터에 문제 없음


In [4]:
# Convert every JSON label file to a YOLO-format .txt file and copy it, together
# with its image, into the preprocessed dataset tree.
#
# Each output line is "class_id x_center y_center width height", where the four
# box values are divided by the image width/height.
# NOTE(review): assumes 'image_size' is [width, height] and 'data' is
# [x_min, y_min, x_max, y_max] in pixels — confirm against the dataset spec.

# Class names found in the JSON files that are not in class_mapping, with counts.
unknown_class_counts = {}

for sub_dir in sub_dirs:
    label_dir = os.path.join(root_dir, sub_dir, 'labels')
    image_dir = os.path.join(root_dir, sub_dir, 'images')
    # Build the output paths from preprocessed_root_dir rather than with
    # str.replace('2DBB', ...): replace() rewrites every '2DBB' occurrence in the
    # path, and the later cells read from preprocessed_root_dir anyway.
    new_label_dir = os.path.join(preprocessed_root_dir, sub_dir, 'labels')
    new_image_dir = os.path.join(preprocessed_root_dir, sub_dir, 'images')

    os.makedirs(new_label_dir, exist_ok=True)
    os.makedirs(new_image_dir, exist_ok=True)

    for filename in os.listdir(label_dir):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(label_dir, filename)
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        img_width, img_height = data['image_size']
        yolo_annotations = []

        for annotation in data['Annotation']:
            class_name = annotation['class_name']
            class_id = class_mapping.get(class_name)
            if class_id is None:
                # Do not emit a label with class id -1: a negative class index is
                # not a valid YOLO label. Skip the box and report it afterwards.
                unknown_class_counts[class_name] = unknown_class_counts.get(class_name, 0) + 1
                continue

            x_min, y_min, x_max, y_max = annotation['data']

            # Corner coordinates -> centre/size, scaled by the image dimensions.
            x_center = (x_min + x_max) / 2 / img_width
            y_center = (y_min + y_max) / 2 / img_height
            width = (x_max - x_min) / img_width
            height = (y_max - y_min) / img_height

            yolo_annotations.append([class_id, x_center, y_center, width, height])

        # Save the converted annotations as a text file next to the other labels.
        base_name = os.path.splitext(filename)[0]
        output_file = base_name + '.txt'
        output_path = os.path.join(new_label_dir, output_file)

        with open(output_path, 'w', encoding='utf-8') as outfile:
            for annotation in yolo_annotations:
                outfile.write(' '.join(map(str, annotation)) + '\n')

        # Copy the matching image if it exists. A label whose image is missing is
        # still written here; such labels are left for a later clean-up step.
        image_file = base_name + '.jpg'
        source_image_path = os.path.join(image_dir, image_file)
        target_image_path = os.path.join(new_image_dir, image_file)

        if os.path.isfile(source_image_path):
            shutil.copyfile(source_image_path, target_image_path)

if unknown_class_counts:
    print(f"Skipped annotations with unknown class names: {unknown_class_counts}")

In [5]:
# Find label (.txt) files that have no matching .jpg in the images directory
# and delete those label files.

for sub_dir in sub_dirs:
    label_dir = os.path.join(preprocessed_root_dir, sub_dir, 'labels')
    image_dir = os.path.join(preprocessed_root_dir, sub_dir, 'images')

    label_filenames = os.listdir(label_dir)
    image_filenames = os.listdir(image_dir)
    # Membership tests against a list scan the whole list for every label;
    # a set keeps the check constant-time per label.
    image_name_set = set(image_filenames)

    # Despite the name, these are label filenames whose image is missing.
    missing_images = [
        filename
        for filename in label_filenames
        if os.path.splitext(filename)[0] + '.jpg' not in image_name_set
    ]

    print(f"디렉토리: {sub_dir}")
    print("삭제할 이미지:", missing_images)

    file_num = 0
    for filename in missing_images:
        file_path = os.path.join(label_dir, filename)
        os.remove(file_path)
        file_num += 1
    print(f"총 {file_num}개의 파일을 삭제했습니다.")
    print('')

디렉토리: training
삭제할 이미지: ['S_DRG_230629_008_FC_069.txt', 'S_DRG_230629_008_FR_014.txt', 'S_DRG_230629_008_RR_099.txt', 'S_DRG_230629_008_FL_079.txt', 'S_DRG_230629_009_RR_043.txt', 'S_DRG_230629_008_RL_079.txt', 'S_DRG_230629_009_FR_117.txt', 'S_DRG_230629_008_FR_010.txt', 'S_DRG_230629_009_RL_043.txt', 'S_DRG_230629_008_FC_078.txt', 'S_DRG_230629_009_FL_071.txt', 'S_DRG_230629_008_FL_052.txt', 'S_DRG_230629_009_RL_071.txt', 'S_DRG_230629_008_FC_067.txt', 'S_DRG_230629_009_RL_050.txt', 'S_DRG_230629_009_FC_026.txt', 'S_DRG_230629_009_RL_116.txt', 'S_DRG_230629_009_FR_010.txt', 'S_DRG_230629_009_FR_112.txt', 'S_DRG_230629_009_RR_105.txt', 'S_DRG_230629_009_FL_115.txt', 'S_DRG_230629_008_FL_031.txt', 'S_DRG_230629_009_RL_110.txt', 'S_DRG_230629_008_RL_098.txt', 'S_DRG_230629_008_FL_093.txt', 'S_DRG_230629_009_RR_046.txt', 'S_DRG_230629_008_FC_055.txt', 'S_DRG_230629_008_FL_020.txt', 'S_DRG_230629_009_FC_054.txt', 'S_DRG_230629_008_RR_106.txt', 'S_DRG_230629_009_RR_100.txt', 'S_DRG_230629_

In [6]:
# Report how many entries the images/ and labels/ directories of each split hold.

for sub_dir in sub_dirs:
    split_root = os.path.join(preprocessed_root_dir, sub_dir)
    images_dir = os.path.join(split_root, "images")
    labels_dir = os.path.join(split_root, "labels")

    images_count = len(os.listdir(images_dir))
    labels_count = len(os.listdir(labels_dir))

    report_lines = (
        f"디렉토리: {sub_dir}",
        f"images 개수: {images_count}",
        f"labels 개수: {labels_count}",
        '',
    )
    for line in report_lines:
        print(line)

디렉토리: training
images 개수: 79739
labels 개수: 79739

디렉토리: validation
images 개수: 9965
labels 개수: 9965

디렉토리: test
images 개수: 9969
labels 개수: 9969



In [10]:
# Create the data.yaml file inside the preprocessed dataset root.

file_path = os.path.join(preprocessed_root_dir, 'data.yaml')

# Image directory of each split, exactly as written into the YAML file.
train_path = '../training/images'
val_path = '../validation/images'
test_path = '../test/images'

# Assemble the file line by line; the empty entry is the blank separator line,
# and the text ends with a single trailing newline.
yaml_lines = [
    f"train: {train_path}",
    f"val: {val_path}",
    f"test: {test_path}",
    "",
    f"nc: {len(yaml_class)}",
    f"names: {yaml_class}",
]
data = '\n'.join(yaml_lines) + '\n'

with open(file_path, 'w') as file:
    file.write(data)

print(f"{file_path}가 생성되었습니다.")

dataset/Preprocessed_2DBB/data.yaml가 생성되었습니다.
