In [None]:
import os
import glob
import pandas as pd
import shutil
import wandb

from sklearn.model_selection import train_test_split

In [None]:
def read_label_file(label_file_path, column_names=('class', 'x_center', 'y_center', 'width', 'height')):
    """Load one annotation file into a DataFrame with one row per line of the file.

    Parameters
    ----------
    label_file_path : str
        Path of a space-separated annotation file without a header row.
    column_names : sequence of str, optional
        Names assigned, in order, to the columns read from the file.

    Returns
    -------
    pandas.DataFrame
        The columns ``label_path`` and ``image_path`` followed by ``column_names``.
        ``label_path`` is the input path with backslashes replaced by forward
        slashes; ``image_path`` is derived from it with ``get_image_path``.
    """
    # Work on a private list so the (immutable) default is never shared or
    # mutated and callers may pass any sequence of names.
    column_names = list(column_names)
    label_data = pd.read_csv(label_file_path, names=column_names, sep=' ')

    # Both paths are constant for the whole file, so compute them once and
    # broadcast them to every row instead of re-deriving them per row.
    normalized_label_path = label_file_path.replace('\\', '/')
    label_data['label_path'] = normalized_label_path
    label_data['image_path'] = get_image_path(normalized_label_path)

    # Put the two path columns in front of the annotation columns.
    return label_data[['label_path', 'image_path'] + column_names]

def get_image_path(label_file_path):
    """Return the path of the image that belongs to a label file.

    The image is looked up in ``./data/images`` under the label file's base
    name with a ``.jpg`` extension, e.g. ``./data/labels/0000001.txt`` maps to
    ``./data/images/0000001.jpg``.

    Parameters
    ----------
    label_file_path : str
        Path of the label file; forward and backward slashes are both accepted.

    Returns
    -------
    str
        ``./data/images/<stem>.jpg`` where ``<stem>`` is the label file name
        without its directory and extension.
    """
    # Take the base name from the path itself rather than from a fixed
    # character slice, so stems of any length map to the right image.
    file_name = label_file_path.replace('\\', '/').rsplit('/', 1)[-1]
    stem = os.path.splitext(file_name)[0]
    return f'./data/images/{stem}.jpg'

def clear_folder(path):
    """Leave ``path`` as an existing, empty directory.

    An existing directory is removed together with everything inside it and
    then recreated. A missing directory is simply created, including any
    missing parent directories, so the function also works on a first run.

    Parameters
    ----------
    path : str
        Directory to empty or create.
    """
    # Only delete when there is something to delete; rmtree raises on a
    # path that does not exist.
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

def clear_train_validation_subfolders():
    """Run ``clear_folder`` on the image and label folders of both splits."""
    split_folders = (
        './data/train/images/',
        './data/train/labels/',
        './data/validation/images/',
        './data/validation/labels/',
    )
    for split_folder in split_folders:
        clear_folder(split_folder)

def copy_from_source_path(source_path, is_train=True):
    """Copy one image or label file into the train or validation folder.

    The destination is ``source_path`` with its first ``./data`` replaced by
    ``./data/train`` (``is_train=True``) or ``./data/validation``
    (``is_train=False``), e.g. ``./data/images/a.jpg`` is copied to
    ``./data/train/images/a.jpg``.

    Parameters
    ----------
    source_path : str
        Path of the file to copy.
    is_train : bool, optional
        Selects the ``train`` folder when true, ``validation`` otherwise.
    """
    folder = 'train' if is_train else 'validation'
    # Rewrite only the first './data' so a later occurrence inside the path
    # is left untouched.
    destination_path = source_path.replace('./data', f'./data/{folder}', 1)

    # Make sure the target folder exists so the copy does not depend on the
    # split folders having been prepared beforehand.
    destination_dir = os.path.dirname(destination_path)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    shutil.copy(source_path, destination_path)
    
def copy_labels_and_images(data, is_train=True):
    """Copy every label file and every image file listed in ``data``.

    Each value of the ``label_path`` column, and afterwards each value of the
    ``image_path`` column, is handed to ``copy_from_source_path`` together
    with ``is_train``.
    """
    for path_column in ('label_path', 'image_path'):
        for source_path in data[path_column]:
            copy_from_source_path(source_path, is_train=is_train)

In [None]:
# Start the upload run and register the original (pre-split) image and label
# folders in the dataset artifact
run = wandb.init(project='spiders-and-hounds', job_type='upload')
data_artifact = wandb.Artifact('mob_data', type='dataset')
for source_dir, artifact_dir in (('./data/images', 'images'), ('./data/labels', 'labels')):
    data_artifact.add_dir(source_dir, name=artifact_dir)

In [None]:
# Collect the annotation files and skip the first file, which contains the classes.
# glob returns its matches in arbitrary order, so sort first to make "the first
# file" the same file on every run and platform.
# NOTE(review): this still assumes the class-list file sorts ahead of every
# annotation file - confirm, or exclude it by name instead.
label_file_paths = sorted(glob.glob('./data/labels/*.txt'))[1:]

In [None]:
# Parse every annotation file into its own dataframe, then stack them into one
# table holding the annotation information for all images
label_data_list = list(map(read_label_file, label_file_paths))
label_data = pd.concat(label_data_list)

In [None]:
# Count the annotations per (file, class) and pivot the class ids into columns,
# giving one row per file with 0 where a class does not occur in that file
counts_wide = (
    label_data
    .groupby(['label_path', 'image_path', 'class'])
    .aggregate({'class': 'count'})
    .unstack(fill_value=0)
)
# The aggregated column name forms the outer column level; keep only the class ids
counts_wide.columns = counts_wide.columns.droplevel()
counts_wide.columns.name = None
class_counts_by_file = counts_wide.reset_index()

In [None]:
# Flag (1/0) whether each file contains at least one annotation of class 0 / class 1
class_counts_by_file['contains_class_0'] = class_counts_by_file[0].gt(0).astype(int)
class_counts_by_file['contains_class_1'] = class_counts_by_file[1].gt(0).astype(int)

In [None]:
# Hold out 25% of the files for validation, stratified on which classes each
# file contains; the fixed seed keeps the split reproducible
stratify_flags = class_counts_by_file[['contains_class_0', 'contains_class_1']]
train_data, validation_data = train_test_split(
    class_counts_by_file,
    test_size=0.25,
    random_state=42,
    stratify=stratify_flags,
)

In [None]:
# Number of spiders (0) vs hounds (1), shown in this order for:
# the full dataset, the training set and the validation set
display(
    class_counts_by_file[[0, 1]].sum(axis=0),
    train_data[[0, 1]].sum(axis=0),
    validation_data[[0, 1]].sum(axis=0),
)

In [None]:
# Empty the train/validation image and label folders before copying, so that a
# new split made after additional data is added cannot leak files left over
# from an earlier split
clear_train_validation_subfolders()

for split_frame, goes_to_train in ((train_data, True), (validation_data, False)):
    copy_labels_and_images(split_frame, is_train=goes_to_train)

In [None]:
# Register the train and validation split folders in the artifact, then log the
# artifact and close the run
for split_dir, split_name in (('./data/train', 'train'), ('./data/validation', 'validation')):
    data_artifact.add_dir(split_dir, name=split_name)

run.log_artifact(data_artifact)
run.finish()