In [1]:
import pandas as pd
from pathlib import Path
import os
import random
# Root folder for the course files. Defaults to the original Windows location;
# set the MIDS_DIR environment variable to run the notebook from another machine
# without editing this cell.
mids_dir = Path(os.environ.get("MIDS_DIR", "D:\\MIDS-W207"))
# Dataset folder: the pickled label tables are read from here and the
# split .txt files are written beneath it.
data = mids_dir/"datasets"/"soccertrack_square"
# Project checkout and its analysis subfolder.
project = mids_dir/"MIDS-W207-Spring24-Soccer-Detection"
analysis = project/"analysis"

# Author: Timothy Majidzadeh
# Date Created: March 12, 2024
# Date Updated: April 7, 2024
# Description: From the list of labels, randomly select a subset & split into train/val/test.
# Notes: [v1] Created program.
# Inputs: Frame-by-frame image data & labels.
# Outputs: The data prepared for use in YOLO using Ultralytics.

In [2]:

def sample_and_split(input_df, to_sample, seed, train_pct=0.8, val_pct=0.1, test_pct=0.1):
    """
    Randomly sample rows without replacement and split them into train, val & test sets.
    Inputs:
        input_df: The list of images & their labels to sample from, a pandas DataFrame.
        to_sample: The number of images intended to sample, an integer.
        seed: The random state passed to DataFrame.sample, so the same draw can be repeated.
        train_pct, val_pct, test_pct: The train, val & test percentages, as floats from 0 to 1 adding to 1.
    Returns:
        train_df, val_df, test_df: The train, val & test data labels. The train and val sizes
            are round(to_sample * pct); the test set is every remaining sampled row, so the
            three sets always total to_sample rows. The original index is kept as a column
            because reset_index() is called without drop=True.
    Raises:
        ValueError: If a percentage is outside [0, 1] or the three do not add to 1.
            pandas also raises ValueError when to_sample exceeds the number of rows.
    """
    percentages = (train_pct, val_pct, test_pct)
    if any(pct < 0 or pct > 1 for pct in percentages):
        raise ValueError(
            "train_pct, val_pct and test_pct must each be between 0 and 1, got {}".format(percentages)
        )
    # test_pct is not used to slice (the test set is the remainder), so check it here;
    # otherwise an inconsistent value would be silently ignored.
    # The tolerance absorbs float error: 0.8 + 0.1 + 0.1 is not exactly 1.0.
    if abs(sum(percentages) - 1.0) > 1e-9:
        raise ValueError(
            "train_pct, val_pct and test_pct must add to 1, got {}".format(percentages)
        )

    train_val_index = round(to_sample * train_pct)
    val_test_index = train_val_index + round(to_sample * val_pct)
    # reset_index() already returns a new DataFrame, so input_df is left untouched.
    sampled_df = input_df.reset_index().sample(n=to_sample, replace=False, random_state=seed)
    return (
        sampled_df.iloc[:train_val_index],
        sampled_df.iloc[train_val_index:val_test_index],
        sampled_df.iloc[val_test_index:],
    )

def train_val_test_paths(image_paths, set_type, vrsn_num, output_dir=data):
    """
    Create a text file which gives Ultralytics the image paths for one set (train, val, or test).
    Inputs:
        image_paths: A pandas Series, Numpy array or list holding the absolute filepaths.
        set_type: 'train', 'val', or 'test', based on the input type; used in the output filename.
        vrsn_num: A version number in string format, for saving multiple splits if needed.
        output_dir: A PathLib Path object; the file is written to its "splits" subfolder.
    Outputs:
        Saves "splits/{set_type}_v{vrsn_num}.txt" under output_dir, with one path per line,
        backslashes converted to forward slashes, and no trailing newline.
    """
    # Built-in str.replace is always a literal replacement. Series.str.replace treats a
    # lone backslash as a regex on pandas versions where regex defaults to True, and it
    # is unavailable on the lists / arrays this function is documented to accept.
    normalized_paths = [str(image_path).replace('\\', '/') for image_path in image_paths]

    # Create the splits folder on first use instead of failing the write.
    splits_dir = output_dir/"splits"
    splits_dir.mkdir(parents=True, exist_ok=True)

    # The with-block closes the file; join() puts newlines between paths only.
    with open(splits_dir/"{}_v{}.txt".format(set_type, vrsn_num), 'w') as f:
        f.write("\n".join(normalized_paths))

In [3]:
# Version tag embedded in the names of the split files written by later cells.
vrsn_num = "1"

# Load the two pickled tables that the sampling cells draw from.
srs = pd.read_pickle(data/"objects_per_image_srs.pkl")
oversampled = pd.read_pickle(data/"objects_per_image_oversampled.pkl")

In [4]:
# Sample 10,400 rows from `srs` (random_state 368750) and split them with the default percentages.
srs_train, srs_val, srs_test = sample_and_split(srs, to_sample=10400, seed=368750)

# Build one image path per sampled row: <data>/images/<image_name>.
train_paths = pd.Series([f'{data!s}/images/{name}' for name in srs_train['image_name']])
val_paths = pd.Series([f'{data!s}/images/{name}' for name in srs_val['image_name']])
test_paths = pd.Series([f'{data!s}/images/{name}' for name in srs_test['image_name']])

# Write one path list per set into the splits folder under `data`.
train_val_test_paths(train_paths, set_type='srs_train', vrsn_num=vrsn_num, output_dir=data)
train_val_test_paths(val_paths, set_type='srs_val', vrsn_num=vrsn_num, output_dir=data)
train_val_test_paths(test_paths, set_type='srs_test', vrsn_num=vrsn_num, output_dir=data)

In [5]:
# Sample 10,400 rows from `oversampled` (random_state 62491) and split them with the default percentages.
oversampled_train, oversampled_val, oversampled_test = sample_and_split(
    oversampled, to_sample=10400, seed=62491
)

# Build one image path per sampled row: <data>/images/<image_name>.
train_paths = pd.Series([f'{data!s}/images/{name}' for name in oversampled_train['image_name']])
val_paths = pd.Series([f'{data!s}/images/{name}' for name in oversampled_val['image_name']])
test_paths = pd.Series([f'{data!s}/images/{name}' for name in oversampled_test['image_name']])

# Write one path list per set into the splits folder under `data`.
train_val_test_paths(train_paths, set_type='oversampled_train', vrsn_num=vrsn_num, output_dir=data)
train_val_test_paths(val_paths, set_type='oversampled_val', vrsn_num=vrsn_num, output_dir=data)
train_val_test_paths(test_paths, set_type='oversampled_test', vrsn_num=vrsn_num, output_dir=data)