# 0. Setup

In [None]:
import os

In [None]:
# Folder layout of the image workspace, keyed by the role each folder plays
# in the pipeline (raw images, annotated images, and the three dataset splits).
paths = dict(
    IMAGES_PATH=os.path.join('workspace', 'images', 'all'),
    IMAGES_ANNOTATED_PATH=os.path.join('workspace', 'images', 'all', 'annotated'),
    TRAIN_PATH=os.path.join('workspace', 'images', 'train'),
    EVAL_PATH=os.path.join('workspace', 'images', 'eval'),
    TEST_PATH=os.path.join('workspace', 'images', 'test'),
)

In [None]:
# Create every folder listed in `paths`, including any missing parent folders.
# os.makedirs behaves the same on every operating system and copes with paths
# containing spaces; exist_ok=True makes the cell safe to re-run.
for path in paths.values():
    os.makedirs(path, exist_ok=True)

# 1. Move or download the images into the correct folder

Transfer the images to be used into the "workspace/raw_data" folder.

# 2. Label images using LabelImg (optional)

In [None]:
! pip install labelImg

In [None]:
# Show the command to run from a terminal opened in the 'Object Detection'
# folder; it starts LabelImg on the folder holding the images to label.
print(f"labelImg {paths['IMAGES_PATH']}")

# 3. Move the annotated images into a predefined folder for splitting

In [None]:
# Project helper imported from utils/annotations.py and called with no arguments.
# NOTE(review): presumably it gathers the labelled images together with their
# annotation files into paths['IMAGES_ANNOTATED_PATH'] — confirm in the helper's
# source, which is not part of this notebook.
from utils.annotations import moveAnnotatedImages
moveAnnotatedImages()

# 3.a. Split dataset using the Hold-out method (see KFoldCV files to use the KFold CV approach)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import shutil
import os

# File extensions treated as images. Every image is expected to have an
# annotation file with the same stem and a ".xml" extension next to it.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Remove the files left over from a previous hold-out run so that the split
# folders end up containing only the images selected below.
for split_key in ("TRAIN_PATH", "TEST_PATH", "EVAL_PATH"):
    for file in os.listdir(paths[split_key]):
        stale_path = os.path.join(paths[split_key], file)
        # Keep the macOS metadata file and skip anything that is not a regular
        # file (os.remove raises on directories).
        if file != ".DS_Store" and os.path.isfile(stale_path):
            os.remove(stale_path)

# Collect the annotated images only: the folder also holds the ".xml"
# annotations (and possibly ".DS_Store"), which must not be split as if they
# were images. Sorting gives a stable order before the shuffled splits.
image_names = sorted(
    name
    for name in os.listdir(paths["IMAGES_ANNOTATED_PATH"])
    if name.lower().endswith(IMAGE_EXTENSIONS)
)
# Build the frame in one go (DataFrame.append was removed in pandas 2.0).
df = pd.DataFrame({'Path': image_names})

# Hold out 10% of the images as the validation set; 90% remain for training.
X_train, X_eval = train_test_split(df, test_size=0.1, shuffle=True)

# Split the remaining images into the final training set (80%) and the
# test set (20%).
X_train, X_test = train_test_split(X_train, test_size=0.2, shuffle=True)

# Report the real size of the validation set.
eval_size = len(X_eval)

print("Train size: " + str(len(X_train)))
print("Test size: " + str(len(X_test)))
print("Eval size: " + str(eval_size))

# Copy every image together with its annotation into the folder of its split.
for split_df, split_key in ((X_train, "TRAIN_PATH"),
                            (X_eval, "EVAL_PATH"),
                            (X_test, "TEST_PATH")):
    for image_name in split_df['Path']:
        annotation_name = os.path.splitext(image_name)[0] + ".xml"
        for file_name in (image_name, annotation_name):
            shutil.copy(os.path.join(paths["IMAGES_ANNOTATED_PATH"], file_name),
                        os.path.join(paths[split_key], file_name))

# 4. Compress the dataset splits to move them to Colab for training (optional)

In [None]:
# Location of the compressed dataset archive, inside the images workspace.
ARCHIVE_PATH = os.path.join(
    'workspace',
    'images',
    'archive.tar.gz',
)

In [None]:
# Bundle the train, eval and test folders into a single gzip-compressed tarball
# written to ARCHIVE_PATH (tar flags: -c create, -z gzip, -f output file).
# The folders are passed as the relative paths stored in `paths`, so the archive
# members keep that relative folder layout.
!tar -czf {ARCHIVE_PATH} {paths["TRAIN_PATH"]} {paths["EVAL_PATH"]} {paths["TEST_PATH"]}