# Reduce dataset size

In [27]:
import json 
import pandas as pd
import os
import shutil

## Annotations folder

In [28]:
# Maximum number of images kept per split when the annotation files are reduced.
TRAIN_SIZE = 10_000
VAL_SIZE = 1_000
TEST_SIZE = 1_000

In [29]:
# Root directory under which the reduced dataset's folder tree is created.
reduced_data_path = 'dataset/reduced_data'

# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(reduced_data_path, exist_ok=True)

In [30]:
# Folder names used to lay out the reduced dataset:
#   <reduced_data_path>/<main folder>/<chart type>/annotations
#   <reduced_data_path>/<main folder>/<chart type>/images/<split folder>
main_folders = ['bardata(1031)', 'clsdata(1031)', 'linedata(1028)', 'piedata(1008)']
sub_folders = ['annotations', 'images']
image_sub_folders = ['train2019', 'val2019', 'test2019']

# Main folder -> name of the single chart-type folder nested inside it.
sub_dir_map = dict(zip(main_folders, ["bar", "cls", "line", "pie"]))

for main, chart_dir in sub_dir_map.items():
    parent_dir = os.path.join(reduced_data_path, main, chart_dir)

    for sub in sub_folders:
        sub_dir_path = os.path.join(parent_dir, sub)

        # Only the images folder is split further into one folder per split.
        targets = [sub_dir_path]
        if sub == "images":
            targets.extend(os.path.join(sub_dir_path, image_sub) for image_sub in image_sub_folders)

        for target in targets:
            os.makedirs(target, exist_ok=True)

In [31]:
def reduced_annotation(data, size):
    """Build a smaller annotation dict that keeps only the first `size` images.

    Args:
        data: dict with the keys 'licenses', 'images', 'annotations' and
            'categories'. Each image needs an "id" and each annotation an
            'image_id'.
        size: number of leading entries of data['images'] to keep
            (applied as a plain slice, ``data['images'][:size]``).

    Returns:
        A new dict with the keys 'licenses', 'images', 'annotations' and
        'categories' (in that order). 'licenses' and 'categories' are the same
        objects as in `data`; 'annotations' holds, in their original order,
        only the annotations whose 'image_id' belongs to a kept image.

    Raises:
        KeyError: if `data` lacks one of the four keys listed above.
    """
    # Read the pass-through sections first so a missing key surfaces in the
    # same order as before.
    licenses = data['licenses']
    categories = data['categories']

    kept_images = data['images'][:size]
    # Set lookup keeps the annotation filter linear in the number of annotations.
    kept_ids = {image["id"] for image in kept_images}
    kept_annotations = [ann for ann in data['annotations'] if ann['image_id'] in kept_ids]

    return {
        'licenses': licenses,
        'images': kept_images,
        'annotations': kept_annotations,
        'categories': categories,
    }

In [32]:
# Root of the full dataset; the annotation-reduction loop lists its sub-folders.
data_path = 'dataset/data'
# Root the reduced annotation files are written under
# (same value as `reduced_data_path` defined earlier in the notebook).
output_path = 'dataset/reduced_data'

In [33]:
%%time

# Reduce every annotation JSON found under
#   <data_path>/<main folder>/<chart folder>/annotations
# and write the result to the same relative location under <output_path>.

# Keyword looked for in the annotation file name -> number of images to keep.
# Checked in this order, i.e. "train" wins over "val", which wins over "test".
split_sizes = (("train", TRAIN_SIZE), ("val", VAL_SIZE), ("test", TEST_SIZE))

# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for folder in sorted(os.listdir(data_path)):
    main_folder_path = os.path.join(data_path, folder)
    if not os.path.isdir(main_folder_path):
        # Stray files next to the dataset folders (e.g. .DS_Store) are ignored.
        continue

    # Instead of blindly taking the first directory entry, process every
    # sub-folder that actually contains an "annotations" directory.
    for sub_folder in sorted(os.listdir(main_folder_path)):
        annotations_path = os.path.join(main_folder_path, sub_folder, "annotations")
        if not os.path.isdir(annotations_path):
            continue

        output_annotations_path = os.path.join(output_path, folder, sub_folder, "annotations")
        # Do not rely on an earlier cell having created the output tree.
        os.makedirs(output_annotations_path, exist_ok=True)

        for file_name in sorted(os.listdir(annotations_path)):
            if not file_name.endswith(".json"):
                continue

            # Resolve the size before loading, and never reuse the size of a
            # previously processed file when the name matches no split.
            size = next((limit for keyword, limit in split_sizes if keyword in file_name), None)
            if size is None:
                print(f"Skipping {file_name}: file name contains none of train/val/test")
                continue

            input_file = os.path.join(annotations_path, file_name)
            output_file = os.path.join(output_annotations_path, file_name)

            with open(input_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            reduced_json = reduced_annotation(data, size)

            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(reduced_json, f, ensure_ascii=False)


CPU times: total: 44.9 s
Wall time: 48.3 s


## Images folder

In [34]:
# Chart type -> dataset folder that contains it, e.g. 'bardata(1031)/bar/images/...'.
_CHART_DATA_FOLDERS = {
    'bar': 'bardata(1031)',
    'pie': 'piedata(1008)',
    'line': 'linedata(1028)',
    'cls': 'clsdata(1031)',
}
# Split names; the image folder of a split is '<split>2019'.
_IMAGE_SPLITS = ('train', 'val', 'test')


def copy_images(data, type, split, src_root='dataset/data', dst_root='dataset/reduced_data'):
    """Copy the image files listed in an annotation dict into the reduced dataset.

    Images are read from ``<src_root>/<data folder>/<type>/images/<split>2019``
    and copied to the same relative location under ``dst_root``. A summary line
    is printed at the end; every listed image missing from the source folder is
    reported and skipped.

    Args:
        data: annotation dict; the "file_name" of every entry in
            ``data["images"]`` is copied. A missing "images" key is treated as
            an empty list.
        type: chart type, one of 'bar', 'pie', 'line', 'cls'.
        split: dataset split, one of 'train', 'val', 'test'.
        src_root: root directory of the full dataset.
        dst_root: root directory of the reduced dataset.

    Raises:
        ValueError: if `type` or `split` is not one of the supported values.
    """
    if type not in _CHART_DATA_FOLDERS:
        raise ValueError(
            f"Unknown chart type {type!r}; expected one of {sorted(_CHART_DATA_FOLDERS)}"
        )
    if split not in _IMAGE_SPLITS:
        raise ValueError(f"Unknown split {split!r}; expected one of {list(_IMAGE_SPLITS)}")

    # Joined with '/' (not os.path.join) so the paths, and therefore the
    # printed messages, are the same strings on every platform.
    relative_dir = f"{_CHART_DATA_FOLDERS[type]}/{type}/images/{split}2019"
    src_path = f"{src_root}/{relative_dir}"
    copy_path = f"{dst_root}/{relative_dir}"

    # Make the function usable even when the folder-creation cell was not run.
    os.makedirs(copy_path, exist_ok=True)

    # A set, so a file name listed twice is copied (and counted) only once.
    image_filenames = {img["file_name"] for img in data.get("images", [])}
    copied_count = 0

    for image_name in image_filenames:
        src_image_path = os.path.join(src_path, image_name)
        copy_image_path = os.path.join(copy_path, image_name)

        if os.path.exists(src_image_path):
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(src_image_path, copy_image_path)
            copied_count += 1
        else:
            print(f"Cannot found any images in: {src_image_path}")

    if copied_count > 0:
        print(f"{copied_count}/{len(image_filenames)} images copied to {copy_path}\n")
    else:
        print(f"No images are copied to {copy_path}\n")

In [35]:
def _load_split(data_dir, chart, stem, split):
    """Load and return the reduced annotation JSON of one chart type and split."""
    path = f'dataset/reduced_data/{data_dir}/{chart}/annotations/{stem}_{split}2019.json'
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Bar
bar_train, bar_val, bar_test = (
    _load_split('bardata(1031)', 'bar', 'instancesBar(1031)', split)
    for split in ('train', 'val', 'test')
)

# Pie
pie_train, pie_val, pie_test = (
    _load_split('piedata(1008)', 'pie', 'instancesPie(1008)', split)
    for split in ('train', 'val', 'test')
)

# Line
# NOTE(review): the file names carry "(1023)" while the folder is "(1028)";
# kept exactly as in the original paths - confirm against the files on disk.
line_train, line_val, line_test = (
    _load_split('linedata(1028)', 'line', 'instancesLine(1023)', split)
    for split in ('train', 'val', 'test')
)

# Cls
cls_train, cls_val, cls_test = (
    _load_split('clsdata(1031)', 'cls', 'instancesCls(1031)', split)
    for split in ('train', 'val', 'test')
)

In [36]:
%%time

# Copy the images referenced by every reduced annotation set:
# chart types in the order bar, pie, line, cls; within each, train -> val -> test.
for chart, per_split in (
    ('bar', (bar_train, bar_val, bar_test)),
    ('pie', (pie_train, pie_val, pie_test)),
    ('line', (line_train, line_val, line_test)),
    ('cls', (cls_train, cls_val, cls_test)),
):
    for split_name, annotation_set in zip(('train', 'val', 'test'), per_split):
        copy_images(annotation_set, chart, split_name)

10000/10000 images copied to dataset/reduced_data/bardata(1031)/bar/images/train2019

1000/1000 images copied to dataset/reduced_data/bardata(1031)/bar/images/val2019

1000/1000 images copied to dataset/reduced_data/bardata(1031)/bar/images/test2019

10000/10000 images copied to dataset/reduced_data/piedata(1008)/pie/images/train2019

1000/1000 images copied to dataset/reduced_data/piedata(1008)/pie/images/val2019

1000/1000 images copied to dataset/reduced_data/piedata(1008)/pie/images/test2019

10000/10000 images copied to dataset/reduced_data/linedata(1028)/line/images/train2019

1000/1000 images copied to dataset/reduced_data/linedata(1028)/line/images/val2019

1000/1000 images copied to dataset/reduced_data/linedata(1028)/line/images/test2019

10000/10000 images copied to dataset/reduced_data/clsdata(1031)/cls/images/train2019

1000/1000 images copied to dataset/reduced_data/clsdata(1031)/cls/images/val2019

1000/1000 images copied to dataset/reduced_data/clsdata(1031)/cls/images/