In [1]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import random

mids_dir = Path("D:\\MIDS-W207")
data = mids_dir/"datasets/soccertrack_square"
project = mids_dir/"MIDS-W207-Spring24-Soccer-Detection"
analysis = project/"analysis"

# Author: Timothy Majidzadeh
# Date Created: April 6, 2024
# Date Updated: April 6, 2024
# Description: Over-sample the soccertrack_square dataset so that the ball is represented at a higher rate.
# Notes: [v1] Created program. Using fractions: 0.5 with ball, 0.3 with players & no ball, 0.2 with no objects, 50-50 top view / wide view split.
# Inputs: A table listing the soccertrack_square images and the number of objects in each image.
# Outputs: An appended DataFrame which randomly selects and shuffles input images for use in machine learning models, with the ball included at a higher rate.

In [2]:
objects_per_image = pd.read_pickle(data/"count_objects_per_image.pkl")
stacked_labels = pd.read_pickle(data/"stacked_labels.pkl")
ball_fraction, empty_fraction, sample_size = 0.5, 0.2, 10400
players_fraction = 1 - ball_fraction - empty_fraction

In [3]:
top_view_ball = objects_per_image.query('img_ball_count > 0 & image_name.str.contains("top_view")')
wide_view_ball = objects_per_image.query('img_ball_count > 0 & image_name.str.contains("wide_view")')
top_view_players = objects_per_image.query('(img_team_0_count > 0 | img_team_1_count > 0) & img_ball_count == 0 & image_name.str.contains("top_view")')
wide_view_players = objects_per_image.query('(img_team_0_count > 0 | img_team_1_count > 0) & img_ball_count == 0 & image_name.str.contains("wide_view")')
top_view_empty = objects_per_image.query('img_team_0_count == 0 & img_team_1_count == 0 & img_ball_count == 0 & image_name.str.contains("top_view")')
wide_view_empty = objects_per_image.query('img_team_0_count == 0 & img_team_1_count == 0 & img_ball_count == 0 & image_name.str.contains("wide_view")')

In [4]:
top_view_ball_subsample = top_view_ball.sample(int(sample_size * ball_fraction * 0.5), random_state = 79641)
wide_view_ball_subsample = wide_view_ball.sample(int(sample_size * ball_fraction * 0.5), random_state = 358645)
top_view_players_subsample = top_view_players.sample(int(sample_size * players_fraction * 0.5), random_state = 691901)
wide_view_players_subsample = wide_view_players.sample(int(sample_size * players_fraction * 0.5), random_state = 165333)
top_view_empty_subsample = top_view_empty.sample(int(sample_size * empty_fraction * 0.5), random_state = 344226)
wide_view_empty_subsample = wide_view_empty.sample(int(sample_size * empty_fraction * 0.5), random_state = 928911)
objects_per_image_oversampled = pd.concat(
    [top_view_ball_subsample, wide_view_ball_subsample, top_view_players_subsample, wide_view_players_subsample, top_view_empty_subsample, wide_view_empty_subsample],
).sample(frac=1, replace = False)
objects_per_image_oversampled

class,image_name,img_ball_count,img_team_0_count,img_team_1_count
2632,top_view_12366.png,0,3,1
50828,top_view_7846.png,0,1,1
71332,wide_view_26299.png,1,9,9
98211,wide_view_7900.png,0,0,0
84552,wide_view_38197.png,0,1,3
...,...,...,...,...
44026,top_view_49621.png,0,0,0
44455,top_view_50006.png,1,9,5
86315,wide_view_39784.png,0,4,2
41307,top_view_47174.png,0,0,1


In [6]:
objects_per_image_oversampled[['img_ball_count', 'img_team_0_count', 'img_team_1_count']].agg(sum)

  objects_per_image_oversampled[['img_ball_count', 'img_team_0_count', 'img_team_1_count']].agg(sum)


class
img_ball_count       5200
img_team_0_count    31347
img_team_1_count    31441
dtype: int64

In [7]:
objects_per_image[['img_ball_count', 'img_team_0_count', 'img_team_1_count']].agg(sum)

  objects_per_image[['img_ball_count', 'img_team_0_count', 'img_team_1_count']].agg(sum)


class
img_ball_count       11823
img_team_0_count    153204
img_team_1_count    155079
dtype: int64

In [8]:
oversampled_images = list(objects_per_image_oversampled['image_name'].unique())
stacked_labels_oversampled = stacked_labels[stacked_labels['image_name'].isin(oversampled_images)]

In [9]:
stacked_labels_oversampled.to_csv(data/"stacked_labels_oversampled.csv")
stacked_labels_oversampled.to_pickle(data/"stacked_labels_oversampled.pkl")

objects_per_image_oversampled.to_csv(data/"objects_per_image_oversampled.csv")
objects_per_image_oversampled.to_pickle(data/"objects_per_image_oversampled.pkl")