## Import Packages

In [2]:
import pandas as pd
import os

## Config

In [12]:
LOAD_DIR = "D:\\datasets"
SAVE_DIR = "..\\..\\..\\config\\data_config"
CONFIG_DIR = "..\\..\\..\\config\\exp_config"
TRAINER_DIR = "..\\..\\..\\trainer"
DRIVER_DIR = "..\\..\\..\\driver"

In [13]:
config = {
    "dataset_name": "basic_shapes",
    "exp_name": "first_try",
    "import_encoding": False,
    "existing_encoding_path": "",
    "delimiter": "_",
    "index": 0,
    "split": [70, 20, 10],
    "shuffle": True
}

## Create Directories

In [17]:
dataset_load_dir = os.path.join(LOAD_DIR, config["dataset_name"])
if not os.path.exists(dataset_load_dir):
    os.mkdir(dataset_load_dir)
dataset_save_dir = os.path.join(SAVE_DIR, config["dataset_name"])
if not os.path.exists(dataset_save_dir):
    os.mkdir(dataset_save_dir)
exp_save_dir = os.path.join(dataset_save_dir, config["exp_name"])
if not os.path.exists(exp_save_dir):
    os.mkdir(exp_save_dir)

## Directories

In [None]:
exp_config_dir = os.path.join(CONFIG_DIR, config["dataset_name"])
exp_trainer_dir = os.path.join(TRAINER_DIR, config["dataset_name"])
exp_driver_dir = os.path.join(DRIVER_DIR, config["dataset_name"])
if not os.path.exists(exp_config_dir):
    os.mkdir(exp_config_dir)
    os.system("cp " + os.path.join(CONFIG_DIR, "template_config.yml") + os.path.join(exp_config_dir, config["dataset_name"] + "_config.yml"))
if not os.path.exists(exp_trainer_dir):
    os.mkdir(exp_trainer_dir)
if not os.path.exists(exp_driver_dir):
    os.mkdir(exp_driver_dir)

## Load and Process Images

In [18]:
images = os.listdir(dataset_load_dir)
images = {os.path.join(dataset_load_dir, image): image.split(config["delimiter"])[config["index"]] for image in images}
labels = list(set(images.values()))
labels = {label: i for i, label in enumerate(labels)}
images = {path: labels[label] for path, label in images.items()}

## Generate Encodings

In [22]:
encoding_dir = os.path.join(exp_save_dir, "label_to_encoding.csv")
if config["import_encoding"]:
    label_df = pd.read_csv(encoding_settings["existing_encoding_path"])
else:
    label_df = pd.DataFrame.from_dict(labels, orient="index")
    label_df.to_csv(encoding_dir)
    label_df = pd.read_csv(encoding_dir)
    label_df.rename(columns={"Unnamed: 0": "class", "0": "code"}, inplace=True)
label_df.to_csv(encoding_dir, index=False)
print(label_df)

      class  code
0  Triangle     0
1      Star     1
2   Octagon     2
3   Hexagon     3
4  Heptagon     4
5    Square     5
6    Circle     6
7  Pentagon     7
8   Nonagon     8


## Generate DataFrame

In [23]:
path_to_label = pd.DataFrame.from_dict(images, orient="index")
path_to_label_dir = os.path.join(exp_save_dir, "path_to_label_ordered.csv")
path_to_label.to_csv(path_to_label_dir)

## Generate Ordered Version

In [24]:
path_to_label = pd.read_csv(path_to_label_dir)
path_to_label.rename(columns={"Unnamed: 0": "path", "0": "label"}, inplace=True)
path_to_label.to_csv(path_to_label_dir, index = False)
path_to_label.head(10)

Unnamed: 0,path,label
0,D:\datasets\basic_shapes\Circle_000dfc5c-2a92-...,6
1,D:\datasets\basic_shapes\Circle_000ed2d8-2a8a-...,6
2,D:\datasets\basic_shapes\Circle_0013f29e-2a9a-...,6
3,D:\datasets\basic_shapes\Circle_001d7284-2a85-...,6
4,D:\datasets\basic_shapes\Circle_001de166-2a89-...,6
5,D:\datasets\basic_shapes\Circle_001f0442-2a8e-...,6
6,D:\datasets\basic_shapes\Circle_002bf074-2a95-...,6
7,D:\datasets\basic_shapes\Circle_0035a8ac-2a91-...,6
8,D:\datasets\basic_shapes\Circle_003fd9dc-2a99-...,6
9,D:\datasets\basic_shapes\Circle_004542ae-2a86-...,6


In [25]:
path_to_label = path_to_label.sample(frac=1).reset_index(drop=True)
path_to_label.to_csv(os.path.join(exp_save_dir, "path_to_label_shuffled.csv"), index=False)
path_to_label.head(10)

Unnamed: 0,path,label
0,D:\datasets\basic_shapes\Heptagon_a5577b6e-2a8...,4
1,D:\datasets\basic_shapes\Square_0e7774d0-2a97-...,5
2,D:\datasets\basic_shapes\Heptagon_ee647918-2a8...,4
3,D:\datasets\basic_shapes\Circle_cb5596fc-2a8a-...,6
4,D:\datasets\basic_shapes\Star_3f4b6682-2a94-11...,1
5,D:\datasets\basic_shapes\Heptagon_a01a5640-2a9...,4
6,D:\datasets\basic_shapes\Circle_140d7f4a-2a95-...,6
7,D:\datasets\basic_shapes\Hexagon_575963e8-2a97...,3
8,D:\datasets\basic_shapes\Heptagon_2e05562e-2a9...,4
9,D:\datasets\basic_shapes\Pentagon_87e33b1c-2a8...,7


In [26]:
row, _ = path_to_label.shape
split = config["split"]
train_size = int(row * split[0] / 100)
eval_size = int(row * split[1] / 100)
test_size = int(row * split[2] / 100) # In practice, we will just take the rest for test
print("Total Number of Images: {row_num}".format(row_num=row))
print("Train/Eval/Test Split: {Train}, {Eval}, {Test}".format(Train=train_size, Eval=eval_size, Test=test_size))
train_df = path_to_label.loc[0:train_size-1,:]
eval_df = path_to_label.loc[train_size:train_size+eval_size-1, :]
test_df = path_to_label.loc[train_size+eval_size:,:]
print("Train DataFrame Size: {train_row_size}".format(train_row_size=train_df.shape[0]))
print("Eval DataFrame Size: {eval_row_size}".format(eval_row_size=eval_df.shape[0]))
print("Test DataFrame Size: {test_row_size}".format(test_row_size=test_df.shape[0]))

Total Number of Images: 90000
Train/Eval/Test Split: 63000, 18000, 9000
Train DataFrame Size: 63000
Eval DataFrame Size: 18000
Test DataFrame Size: 9000


## Save Splited DataFrame

In [27]:
train_df.to_csv(os.path.join(exp_save_dir, "train_path_to_labels.csv"), index=False)
eval_df.to_csv(os.path.join(exp_save_dir, "eval_path_to_labels.csv"), index=False)
test_df.to_csv(os.path.join(exp_save_dir, "test_path_to_labels.csv"), index=False)