## Data preparation - sketch to pixel art (Model A3)

### Create pairs of sketches and pixel art images

- TinyHero dataset source: https://www.kaggle.com/datasets/calmness/retro-pixel-characters-generator
- Data for transfer learning:
    - Anime Faces dataset source: https://www.kaggle.com/datasets/soumikrakshit/anime-faces
    - Pokemon data source: https://www.kaggle.com/datasets/zackseliger/pokemon-images-includes-fakemon
    - Animal Pack game assets source: https://kenney.nl/assets/animal-pack-redux
- Data preparation for model A3 uses data from the `datasets/unprepared_data/input_a3` folder.
- Prepared data will be saved in `datasets/model_a3_data`

In [1]:
import os, glob, random
import common_functions as core

# Get path for files needed to prepare data for transfer learning (base model A3)
TRANSFER_LEARNING_DATA = core.get_path(core.A3_BASE.raw_data_dir)
# Get path for files needed to prepare data for model A3
CHARACTER_DATA = core.get_path(core.A3.raw_data_dir)

# File paths to save prepared transfer learning data to train base model A3
TL_INPUT_DATA = core.get_path(core.A3_BASE.prepared_data_dir, f"{core.A3_BASE.dataset_path_prefix}input")
TL_OUTPUT_DATA = core.get_path(core.A3_BASE.prepared_data_dir, f"{core.A3_BASE.dataset_path_prefix}output")
TL_COMBINED_DATA = core.get_path(core.A3_BASE.prepared_data_dir, f"{core.A3_BASE.dataset_path_prefix}combined")

# File paths to save prepared data for training model A3
PIX2PIX_INPUT_DATA = core.get_path(core.A3.prepared_data_dir, f"{core.A3.dataset_path_prefix}input")
PIX2PIX_OUTPUT_DATA = core.get_path(core.A3.prepared_data_dir, f"{core.A3.dataset_path_prefix}output")
PIX2PIX_COMBINED_DATA = core.get_path(core.A3.prepared_data_dir, f"{core.A3.dataset_path_prefix}combined")

random.seed(187)

# Sorted list of images needed to prepare transfer learning data (data for base model A3)
transfer_learn_images = sorted(list(glob.glob(TRANSFER_LEARNING_DATA + "/*.png")))
# Sorted list of TinyHero images to prepare data for model A3
character_images = sorted(list(glob.glob(CHARACTER_DATA + "/*.png")))

# Shuffle lists of images
random.shuffle(transfer_learn_images)
random.shuffle(character_images)

# Folder names to store train, test, validation data
split_folders = ("train", "test", "val")

## Folder structure for model A3 data folder

```
model_a3_data
    # Prepared data for Pix2Pix model
    -- pix2pix_input (input images)
        -- train
        -- test
        -- val
    -- pix2pix_output (target images)
        -- train
        -- test
        -- val
    -- pix2pix_combined (combined input, target image pairs to use for Pix2Pix training)
        -- train
        -- test
        -- val

    # Prepared data for transfer learning
    -- tl_input (input images)
            -- train
            -- test
            -- val
    -- tl_output (target images)
            -- train
            -- test
            -- val
    -- tl_combined (combined input, target image pairs to use for Pix2Pix training)
            -- train
            -- test
            -- val
```

In [2]:
def prepare_data(input_images, model_inp_dir, model_out_dir, split = (0.8, 0.9, 1.0)):
    counter = 0
    train_end = int(len(input_images) * split[0])
    test_end = int(len(input_images) * split[1])
    sketch_per_img = 3
    for image in input_images:
        magenta_bg_image = core.pil_to_opencv(core.load_64x64_with_magenta_bg(image))
        for sketch in range(1, sketch_per_img + 1):
            sketch_image = core.create_sketch(magenta_bg_image, levels=sketch, magenta_bg=True)
            filename = f"{counter:05}.png"
            current_split_folder = split_folders[0]
            if train_end < counter < test_end:
                current_split_folder = split_folders[1]
            elif counter >= test_end:
                current_split_folder = split_folders[2]
            # ----------- Write to model_input and output directories -----------
            model_input_path = os.path.join(model_inp_dir, current_split_folder)
            os.makedirs(model_input_path, exist_ok=True)
            core.write_image(sketch_image, os.path.join(model_input_path, filename))
            
            model_out_path = os.path.join(model_out_dir, current_split_folder)
            os.makedirs(model_out_path, exist_ok=True)
            core.write_image(magenta_bg_image, os.path.join(model_out_path, filename))
            
            counter += 1

In [3]:
prepare_data(transfer_learn_images, TL_INPUT_DATA, TL_OUTPUT_DATA)

In [4]:
prepare_data(character_images, PIX2PIX_INPUT_DATA, PIX2PIX_OUTPUT_DATA)

In [5]:
os.makedirs(TL_COMBINED_DATA, exist_ok=True)
core.create_combined_images(f"--fold_A {TL_INPUT_DATA} --fold_B {TL_OUTPUT_DATA} --fold_AB {TL_COMBINED_DATA} --no_multiprocessing")

----------------------------------------------------------
python /Users/tashvit/Documents/GitHub/mmpixagen/thirdparty/pix2pix/datasets/combine_A_and_B.py --fold_A /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_input/ --fold_B /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_output/ --fold_AB /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_combined/ --no_multiprocessing
[fold_A] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_input/
[fold_B] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_output/
[fold_AB] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/tl_combined/
[num_imgs] =  1000000
[use_AB] =  False
[no_multiprocessing] =  True
split = test, use 338/338 images
split = test, number of images = 338
split = train, use 2709/2709 images
split = train, number of images = 2709
split = val, use 7111/7111 images
split = val, number of images = 7111


In [6]:
os.makedirs(PIX2PIX_COMBINED_DATA, exist_ok=True)
core.create_combined_images(f"--fold_A {PIX2PIX_INPUT_DATA} --fold_B {PIX2PIX_OUTPUT_DATA} --fold_AB {PIX2PIX_COMBINED_DATA} --no_multiprocessing")

----------------------------------------------------------
python /Users/tashvit/Documents/GitHub/mmpixagen/thirdparty/pix2pix/datasets/combine_A_and_B.py --fold_A /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_input/ --fold_B /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_output/ --fold_AB /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_combined/ --no_multiprocessing
[fold_A] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_input/
[fold_B] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_output/
[fold_AB] =  /Users/tashvit/Documents/GitHub/mmpixagen/datasets/model_a3_data/pix2pix_combined/
[num_imgs] =  1000000
[use_AB] =  False
[no_multiprocessing] =  True
split = test, use 90/90 images
split = test, number of images = 90
split = train, use 730/730 images
split = train, number of images = 730
split = val, use 1916/1916 images
split = val, number of images