# Dublin AFL Preprocessing

This notebook contains the code to prepare the AFL data for training: it turns full videos hosted on S3 in Sydney into frames that contain only the ball, to be hosted on S3 in Dublin.

We assume that an `.xml` file with the annotations has already been uploaded to S3.

In [None]:
# Import libraries
import boto3
import cv2
import os
import subprocess

from typing import List, Dict
from sagemaker_utils import *

In [None]:
# Define constants
# Source bucket holding the full videos. It is passed to boto3 as `Bucket=...`,
# so it must be the bare bucket name: S3 bucket names cannot contain "/".
AUS_AFL_BUCKET: str = "australia-fov"
# NOTE(review): not used in the cells shown here, so the value is left as-is in
# case it is consumed as a path prefix elsewhere. Drop the trailing "/" before
# passing it to boto3 as a bucket name - TODO confirm usage.
DUB_AFL_BUCKET: str = "dublin-afl-preprocessed/"

# Local scratch directory that receives the clipped videos
TMP_DIR: str = "tmp"

# S3 keys (inside AUS_AFL_BUCKET) of the videos to process
VIDEOS_TO_PROCESS: List[str] = [
    "marvel/marvel-fov-3/20_08_2023/time_04_09_06_date_20_08_2023_.avi",
    # "marvel-fov-1\\20_08_2023\\marvel_1_time_04_09_04_date_20_08_2023_.avi",
    # "marvel-fov-8\\26_08_2023\\marvel_8_time_09_09_04_date_27_08_2023_.avi"
]

First, let's check that every video we want to process exists in the source bucket.

In [None]:
# Ensure that each video path exists on s3
s3 = boto3.client('s3')

print("Checking video paths exist on S3...")
for video_path in VIDEOS_TO_PROCESS:
    try:
        s3.head_object(Bucket=AUS_AFL_BUCKET, Key=video_path)
    except s3.exceptions.ClientError as error:
        # head_object reports a missing key as a ClientError with code "404";
        # anything else (e.g. "403") means the check itself failed
        error_code = error.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey", "NotFound"):
            print(f"Video {video_path} does not exist on S3!")
        else:
            print(f"Could not check video {video_path} on S3: {error}")
    except Exception as error:
        # Invalid bucket name, missing credentials, network problems, ...
        # Report the real cause instead of claiming the video is missing.
        print(f"Could not check video {video_path} on S3: {error}")
    else:
        print(f"Video {video_path} exists on S3")

## Utility functions

Next, we define a handful of helper functions.

Next, we'll define functions that we'll use to process the videos. We will only operate on one video at a time so we don't need too much storage space locally.

In [None]:

def clip_video(video: str, output_dir: str, clip_length: int = 60) -> None:
    """
    Split a video into consecutive clips of ``clip_length`` seconds each.

    The last clip is shorter when the video duration is not an exact multiple
    of ``clip_length``. FFmpeg is called directly with ``-c:v copy`` so the
    video stream is copied instead of re-encoded; audio is dropped (``-an``).

    Parameters:
    - video: Path of the local video file to clip.
    - output_dir: Directory that receives the clips, named 0.mp4, 1.mp4, ...
    - clip_length: Length of each clip in seconds. Must be positive.

    Raises:
    - ValueError: If ``clip_length`` is not positive, or if no valid frame rate
      can be read from ``video`` (e.g. the file is missing or unreadable).
    """
    if clip_length <= 0:
        raise ValueError(f"clip_length must be positive, got {clip_length}")

    # Check if the output_directory exists, and if not, make it
    ensure_directory_exists(output_dir)

    # Read frame count and frame rate, always releasing the capture handle
    capture = cv2.VideoCapture(video)
    try:
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would
        # skew the computed duration
        fps = float(capture.get(cv2.CAP_PROP_FPS))
    finally:
        capture.release()

    if fps <= 0:
        raise ValueError(f"Could not read a valid frame rate from {video}")

    # Work in seconds from here on: clip_length is in seconds, not frames
    duration_seconds = frame_count / fps

    # Full-length clips, plus one shorter clip for whatever is left over
    num_clips = int(duration_seconds // clip_length)
    if duration_seconds > num_clips * clip_length:
        num_clips += 1

    for clip_index in range(num_clips):
        start_time = clip_index * clip_length
        end_time = start_time + clip_length
        output_name = os.path.join(output_dir, f"{clip_index}.mp4")

        print(f"Clipping video {video} from {start_time} to {end_time}")

        # Craft the FFmpeg command for lossless clipping
        cmd = [
            'ffmpeg',
            '-i', video,
            '-ss', str(start_time),
            '-t', str(clip_length),
            '-c:v', 'copy',
            '-an',  # This excludes the audio. If you want to include audio, you can remove this.
            output_name
        ]

        result = subprocess.run(cmd)

        # Do not report success when ffmpeg failed
        if result.returncode != 0:
            print(f"ffmpeg exited with code {result.returncode} while clipping video {video} from {start_time} to {end_time}")
            continue

        print(f"Finished clipping video {video} from {start_time} to {end_time}")

In [None]:
def extract_frames_from_video(file_path: str, fps: int = 30) -> None:
    """
    Extract the frames of a video clip as PNG images using ffmpeg.

    The frames are written as frame_0000000.png, frame_0000001.png, ... into a
    directory created next to the clip and named
    ``<parent directory name><clip name without extension>``. Nothing is
    extracted when that directory already contains PNG files.

    Parameters:
    - file_path: Path of the video clip to extract frames from.
    - fps: Number of frames to extract per second of video.
    """
    # Derive the names with os.path so both "/" and the platform separator work
    normalized_path = os.path.normpath(file_path)
    file_name_without_extension = os.path.splitext(os.path.basename(normalized_path))[0]
    file_dir = os.path.dirname(file_path)

    # Name of the directory that directly contains the clip
    parent_video_name = os.path.basename(os.path.dirname(normalized_path))

    # NOTE(review): create_directory is assumed to return the path of the
    # directory it creates inside file_dir - confirm in sagemaker_utils
    frames_dir = create_directory(file_dir, directory_name=f'{parent_video_name}{file_name_without_extension}')

    if png_files_exist(frames_dir):
        print(f'Frames already exist in {frames_dir}')
        return

    # Construct the ffmpeg command
    cmd = [
        'ffmpeg',
        '-i', file_path,
        '-vf', f'fps={fps}/1',
        '-start_number', '0',
        os.path.join(frames_dir, 'frame_%07d.png')
    ]

    # Execute the command via subprocess
    print(cmd)
    result = subprocess.run(cmd)

    # Surface ffmpeg failures instead of ignoring them silently
    if result.returncode != 0:
        print(f'ffmpeg exited with code {result.returncode} while extracting frames from {file_path}')

In [None]:
import os
import boto3

# Bucket name exactly as boto3 expects it: the bare name, without any "/"
AUS_AFL_BUCKET = "australia-fov"

def download_video_from_s3(video_path: str, min_valid_size_bytes: int = 10 * 1024) -> None:
    """
    Download a video from the S3 bucket to local storage.

    The video is stored locally under its S3 key, with backslashes replaced by
    forward slashes. If the local file already exists and is larger than
    ``min_valid_size_bytes`` the download is skipped; a smaller file is treated
    as a broken download and fetched again. Download errors are printed, not
    raised, and any partially written file is removed.

    Parameters:
    - video_path: S3 key of the video to download.
    - min_valid_size_bytes: An existing local file must be larger than this
      many bytes to be considered valid.
    """
    local_path = video_path.replace("\\", "/")

    # Ensure the directory structure exists for the local file. A key without
    # any directory part has nothing to create (os.makedirs("") would raise).
    local_dir = os.path.dirname(local_path)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    # Check if the file already exists in local storage
    if os.path.exists(local_path):
        if os.path.getsize(local_path) > min_valid_size_bytes:
            print(f"{video_path} already exists in local storage and has a valid size!")
            return

        print(f"{video_path} exists but is too small. Re-downloading...")
        os.remove(local_path)

    print(f"Downloading {video_path}...")
    try:
        # Create the S3 client only when a download is actually needed
        s3 = boto3.client('s3')
        with open(local_path, 'wb') as f:
            s3.download_fileobj(AUS_AFL_BUCKET, video_path, f)
    except Exception as e:
        print(f"Error downloading {video_path}: {e}")
        # Do not leave a partial file behind: it could pass the size check later
        if os.path.exists(local_path):
            os.remove(local_path)
        return

    # Reported outside the try block so a reporting problem can never trigger
    # the cleanup above and delete a good download
    print(f"{video_path} downloaded successfully!")
    print(f"File size in bytes: {os.path.getsize(local_path)}")


## Processing time

In [None]:
for video_path in VIDEOS_TO_PROCESS:
    # Download the video to local storage, but check if it exists already first
    download_video_from_s3(video_path)

    # The download stores the file under the key with backslashes replaced by
    # forward slashes, so read it back from that same local path
    local_video_path = video_path.replace("\\", "/")

    # Give every video its own clip directory: clips are named 0.mp4, 1.mp4, ...
    # so a shared directory would mix up the clips of different videos
    video_name = os.path.splitext(os.path.basename(local_video_path))[0]
    clip_dir = os.path.join(TMP_DIR, video_name)
    os.makedirs(clip_dir, exist_ok=True)

    # Clip video into 1 minute clips
    clip_video(local_video_path, output_dir=clip_dir, clip_length=60)

    # Extract frames from this video's clips only
    for video in find_files_with_ending(clip_dir, '.mp4'):
        extract_frames_from_video(video)

    # Remove images that don't contain the ball
    # TODO: to be implemented
