In [1]:
import pandas as pd
from pathlib import Path
import os
import random
# Root of the course workspace. Defaults to the original author's Windows
# location; set the MIDS_W207_DIR environment variable to run the notebook on
# another machine without editing this cell.
mids_dir = Path(os.environ.get("MIDS_W207_DIR", "D:\\MIDS-W207"))
# SoccerTrack dataset root (labels are read from here, split lists are written here by default).
data = mids_dir/"datasets/soccertrack"
# Project repository and its analysis sub-folder.
project = mids_dir/"MIDS-W207-Spring24-Soccer-Detection"
analysis = project/"analysis"

# Author: Timothy Majidzadeh
# Date Created: March 12, 2024
# Date Updated: April 13, 2024
# Description: From the list of labels, randomly select a subset & split into train/val/test.
# Notes: [v1] Created program.
#        [v2-v4] Updated baselines for 80/10/10 split. Changed sample size. Re-ran after fixing an issue with the image extraction program.
# Inputs: Frame-by-frame labels data.
# Outputs: Text files listing the images included in the train/val/test splits for YOLO.

In [2]:

def sample_and_split(input_df, batch_size, desired_batches, train_pct=0.8, val_pct=0.1, test_pct=0.1, seed=39305):
    """
    Randomly sample batch_size * desired_batches rows and split them into train/val/test.

    Inputs:
        input_df: The images & their labels to sample from, a pandas DataFrame.
        batch_size: The number of images intended to process in each batch of the model training, an integer.
        desired_batches: The number of batches to sample, an integer.
        train_pct, val_pct, test_pct: The train, val & test percentages, as non-negative floats adding to 1.
            The train and val sizes are rounded from their percentages; the test set receives the remaining rows.
        seed: The random_state passed to DataFrame.sample, so the same inputs give the same split.
    Returns:
        train_df, val_df, test_df: The train, val & test data labels, in sampled (shuffled) order.
            The original index is preserved as a column by reset_index().
    Raises:
        ValueError: If a percentage is negative, the percentages do not add to 1,
            or more rows are requested than input_df contains.
    """
    fractions = (train_pct, val_pct, test_pct)
    # test_pct is implied by the other two (the test set is the remainder), so
    # reject inconsistent fractions instead of silently ignoring test_pct.
    if any(pct < 0 for pct in fractions) or abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError(
            "train_pct, val_pct and test_pct must be non-negative and add to 1; got {}.".format(fractions)
        )

    to_sample = batch_size * desired_batches
    # Sampling is without replacement, so the request cannot exceed the population.
    if to_sample > len(input_df):
        raise ValueError(
            "Cannot sample {} rows (batch_size * desired_batches) from {} available rows.".format(
                to_sample, len(input_df)
            )
        )

    train_val_index = round(to_sample * train_pct)
    val_test_index = train_val_index + round(to_sample * val_pct)
    # reset_index() already returns a new frame, so no explicit copy is needed.
    sampled_df = input_df.reset_index().sample(n=to_sample, replace=False, random_state=seed)
    return (
        sampled_df.iloc[:train_val_index],
        sampled_df.iloc[train_val_index:val_test_index],
        sampled_df.iloc[val_test_index:],
    )


def train_val_test_paths(image_paths, set_type, vrsn_num, output_dir=data):
    """
    Create the text file which gives Ultralytics the image paths for one of the train, val, or test sets.
    Inputs:
        image_paths: A pandas Series, Numpy array or list holding the absolute filepaths of the images.
            Each entry is converted with str() before being written.
        set_type: 'train', 'val', or 'test', based on the input type.
        vrsn_num: The version number used in the output file name.
        output_dir: A PathLib Path object (or path string) for the folder the file is saved in.
    Outputs:
        Saves "<set_type>_v<vrsn_num>.txt" in output_dir with one image path per line and no trailing newline.
        Backslashes are converted to forward slashes, and the output_dir prefix of each path is
        replaced with "." so the listed paths are relative to the folder containing the text file.
        Paths which are not under output_dir are written unchanged (apart from the slash conversion).
    """
    output_dir = Path(output_dir)
    output_dir_str = str(output_dir).replace("\\", "/")
    # Plain Python string replacement is always literal, so the result does not
    # depend on the pandas version's default for Series.str.replace(regex=...).
    relative_paths = [
        str(image_path).replace("\\", "/").replace(output_dir_str, ".")
        for image_path in image_paths
    ]
    # The with-block closes the file; join avoids a trailing newline after the last path.
    with open(output_dir/"{}_v{}.txt".format(set_type, str(vrsn_num)), 'w') as f:
        f.write("\n".join(relative_paths))

In [3]:
# Update to use full dataset when available.
# Load the stacked per-view label tables, combine them, and keep only the
# rows whose frame image was actually saved.
top_view_labels = pd.read_pickle(data/"labels"/"top_view_labels_stacked"/"top_view_labels.pkl")
wide_view_labels = pd.read_pickle(data/"labels"/"wide_view_labels_stacked"/"wide_view_labels.pkl")
stacked_labels = pd.concat((top_view_labels, wide_view_labels))
labels_filtered = stacked_labels[stacked_labels['frame_saved'].eq(True)]

In [4]:
# 32 images per batch x 325 batches are sampled, then split with the function's default percentages and seed.
train_df, val_df, test_df = sample_and_split(
    input_df=labels_filtered,
    batch_size=32,
    desired_batches=325,
)

In [5]:
# Version suffix used in the names of the generated split list files.
vrsn_num = 4

# Pull the image path column out of each split.
train_paths = train_df['frame_imgpath']
val_paths = val_df['frame_imgpath']
test_paths = test_df['frame_imgpath']

# Write one list file per split, in train / val / test order.
for set_type, path_list in (('train', train_paths), ('val', val_paths), ('test', test_paths)):
    train_val_test_paths(path_list, set_type, vrsn_num)