In [1]:
import json
import os
import random
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

def split_dataset(annotation='/opt/ml/input/data/train_all.json',
                  output_dir='/opt/ml/input/data',
                  n_splits=5,
                  random_state=42):
    """Split a COCO-style annotation file into stratified, image-grouped folds.

    Each annotation is treated as one sample: its ``category_id`` is the
    stratification label and its ``image_id`` is the group passed to
    ``StratifiedGroupKFold``. For every fold ``k`` two files are written into
    ``output_dir``: ``fold_{k}_train.json`` and ``fold_{k}_val.json``, each a
    dict with ``images``, ``annotations`` and ``categories`` keys.

    Images that have no annotation never show up in ``groups`` and are
    therefore written to neither the train nor the val file.

    Args:
        annotation: Path of the JSON file to split. It must contain the
            ``images``, ``annotations`` and ``categories`` keys.
        output_dir: Directory that receives the per-fold JSON files.
        n_splits: Number of folds.
        random_state: Seed forwarded to ``StratifiedGroupKFold``.

    Returns:
        None. The path of every written file is printed.
    """
    with open(annotation) as f:
        dataset = json.load(f)

    images = dataset['images']
    annotations = dataset['annotations']
    categories = dataset['categories']

    # Optional: include the prefix directory in file_name
    # (needed when the CocoDataset class is used).
    # for image in images:
    #     image['file_name'] = '{}/{}'.format(image['file_name'][0], image['file_name'])

    # The splitter only needs labels and groups; X is a placeholder of the right length.
    X = np.ones((len(annotations), 1))
    y = np.array([ann['category_id'] for ann in annotations])
    groups = np.array([ann['image_id'] for ann in annotations])

    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
        # Sets give O(1) membership tests; `value in ndarray` rescans the whole
        # array for every image and every annotation.
        split_image_ids = (
            ('train', set(groups[train_idx].tolist())),
            ('val', set(groups[val_idx].tolist())),
        )

        for split_name, image_ids in split_image_ids:
            split_data = {
                'images': [img for img in images if img.get('id') in image_ids],
                'annotations': [ann for ann in annotations if ann.get('image_id') in image_ids],
                'categories': categories,
            }

            output_json = os.path.join(output_dir, f'fold_{fold_idx}_{split_name}.json')
            with open(output_json, 'w') as writer:
                json.dump(split_data, writer)
            print(f'write {output_json}')

In [2]:
# Write fold_{k}_train.json / fold_{k}_val.json for each of the 5 folds;
# the path of every file written is printed below.
split_dataset()

write /opt/ml/input/data/fold_0_train.json
write /opt/ml/input/data/fold_0_val.json
write /opt/ml/input/data/fold_1_train.json
write /opt/ml/input/data/fold_1_val.json
write /opt/ml/input/data/fold_2_train.json
write /opt/ml/input/data/fold_2_val.json
write /opt/ml/input/data/fold_3_train.json
write /opt/ml/input/data/fold_3_val.json
write /opt/ml/input/data/fold_4_train.json
write /opt/ml/input/data/fold_4_val.json


In [6]:
import pandas as pd
from collections import Counter

def get_distribution(y, num_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of non-negative integer class ids.
        num_classes: Length of the returned list. When ``None`` it is inferred
            as ``max(y) + 1`` (``0`` for empty input), so two label sets with
            different largest ids give lists of different lengths. Pass an
            explicit value to get rows of equal length.

    Returns:
        A list whose ``i``-th entry is the fraction of ``y`` equal to ``i``,
        formatted with ``.2%`` (for example ``'12.50%'``). Labels that are
        ``>= num_classes`` still count towards the total but get no entry.
        Empty input gives ``'0.00%'`` for every class instead of raising.
    """
    labels = np.asarray(y).ravel()
    total = labels.size

    if num_classes is None:
        num_classes = int(labels.max()) + 1 if total else 0

    if total == 0:
        # Nothing to count: avoid dividing by zero.
        return [f'{0:.2%}'] * num_classes

    counts = Counter(labels.tolist())
    return [f'{counts[i] / total:.2%}' for i in range(num_classes)]

def _load_category_ids(path):
    """Load the JSON file at ``path``.

    Returns the parsed dataset together with a NumPy array holding the
    ``category_id`` of every entry in its ``annotations`` list.
    """
    with open(path) as f:
        dataset = json.load(f)
    labels = np.array([ann['category_id'] for ann in dataset['annotations']])
    return dataset, labels


def _add_row(label, path):
    """Append the class distribution of the file at ``path`` as row ``label``."""
    dataset, labels = _load_category_ids(path)
    distrs.append(get_distribution(labels))
    index.append(label)
    return dataset, labels


data_dir = '/opt/ml/input/data'
categories = ['Background', 'General trash', 'Paper', 'Paper pack', 'Metal',
              'Glass', 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 'Clothing']

distrs, index = [], []

# One train row and one val row per fold file.
for fold in range(5):
    train, val = f'{data_dir}/fold_{fold}_train.json', f'{data_dir}/fold_{fold}_val.json'
    train_dataset, train_y = _add_row(f'train - fold{fold}', train)
    val_dataset, val_y = _add_row(f'val - fold{fold}', val)

# The train.json / val.json pair, for comparison with the folds.
train, val = f'{data_dir}/train.json', f'{data_dir}/val.json'
train_dataset, train_y = _add_row('train', train)
val_dataset, val_y = _add_row('val', val)

# The full annotation file; its largest category id decides the column count.
annotation = f'{data_dir}/train_all.json'
data, data_y = _add_row('total', annotation)
y = data_y

num_classes = int(np.max(y)) + 1
columns = [categories[i] for i in range(num_classes)]

# A split that lacks the highest category ids yields a shorter row; those
# trailing classes have no annotations there, so fill them with 0.00%.
padded_distrs = [row + [f'{0:.2%}'] * (num_classes - len(row)) for row in distrs]

pd.DataFrame(padded_distrs, index=index, columns=columns)

Unnamed: 0,Background,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
train - fold0,0.00%,10.81%,35.57%,2.49%,2.05%,2.26%,11.75%,5.12%,29.04%,0.24%,0.68%
val - fold0,0.00%,9.80%,35.15%,2.60%,2.52%,2.60%,11.87%,5.12%,29.47%,0.23%,0.66%
train,0.00%,10.60%,35.49%,2.51%,2.14%,2.33%,11.78%,5.12%,29.13%,0.24%,0.67%
val,0.00%,10.61%,35.47%,2.51%,2.15%,2.32%,11.77%,5.12%,29.11%,0.25%,0.69%
total,0.00%,10.60%,35.48%,2.51%,2.14%,2.32%,11.78%,5.12%,29.13%,0.24%,0.67%
