# Dublin AFL Preprocessing

This notebook contains the code to prepare the AFL data for training, processing it from full videos hosted on S3 in Sydney, to frames that only contain the ball hosted in Dublin.

We assume that a `.xml` file with the annotations has already been uploaded to S3.

In [None]:
# Install libraries
!pip install imageio

In [None]:
# Import libraries
import boto3
import cv2
import imageio
import os
import subprocess

from typing import List, Dict
from sagemaker_utils import *

In [None]:
# Define constants
# S3 bucket names must be bare names: boto3 rejects a `Bucket=` argument that
# contains a slash, so the source bucket is written without a trailing "/".
AUS_AFL_BUCKET: str = "australia-fov"
# NOTE(review): this value still carries a trailing slash; it is not used in the
# visible cells. Strip the slash if it is ever passed as `Bucket=` — confirm usage.
DUB_AFL_BUCKET: str = "dublin-afl-preprocessed/"

# Local scratch directory for clips and extracted frames
TMP_DIR: str = "tmp"

# S3 keys (relative to AUS_AFL_BUCKET) of the videos to process
VIDEOS_TO_PROCESS: List[str] = [
    "marvel/marvel-fov-3/20_08_2023/time_04_09_06_date_20_08_2023_.avi",
    # "marvel-fov-1\\20_08_2023\\marvel_1_time_04_09_04_date_20_08_2023_.avi",
    # "marvel-fov-8\\26_08_2023\\marvel_8_time_09_09_04_date_27_08_2023_.avi"
]

First, let's check that the videos we want to process exist in the source bucket.

In [None]:
# Ensure that each video path exists on s3
s3 = boto3.client('s3')

print("Checking video paths exist on S3...")
for video_path in VIDEOS_TO_PROCESS:
    try:
        s3.head_object(Bucket=AUS_AFL_BUCKET, Key=video_path)
    except s3.exceptions.ClientError as error:
        # Only a "not found" response means the video is missing; anything else
        # (permissions, throttling, ...) is reported with its real cause.
        error_code = error.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey", "NotFound"):
            print(f"Video {video_path} does not exist on S3!")
        else:
            print(f"Could not check video {video_path} on S3: {error}")
    except Exception as error:
        # E.g. an invalid bucket name or missing credentials: keep checking the
        # remaining videos, but do not misreport the failure as a missing file.
        print(f"Could not check video {video_path} on S3: {error}")
    else:
        print(f"Video {video_path} exists on S3")

## Utility functions

Next, we define a bunch of helper functions.

Next, we'll define functions that we'll use to process the videos. We will only operate on one video at a time so we don't need too much storage space locally.

In [None]:
def clip_video_imageio(video_path: str, output_dir: str, clip_length: int = 60) -> None:
    """
        Clip a video into multiple segments of the given length using imageio.

        Clips are written to `output_dir` as `clip_<index>.mp4`. A trailing
        remainder shorter than half of `clip_length` is dropped.

        Parameters:
        - video_path: Path to the video file.
        - output_dir: Directory to store the clipped videos.
        - clip_length: Duration of each clip in seconds (default is 60 seconds).

        Raises:
        - ValueError: If `clip_length` and the video's fps yield no frames per clip.
        """
    reader = imageio.get_reader(video_path)
    try:
        fps = reader.get_meta_data()['fps']

        # Calculate the number of frames needed for the specified clip length
        frames_per_clip = int(clip_length * fps)
        if frames_per_clip <= 0:
            raise ValueError(
                f"clip_length={clip_length} at {fps} fps gives {frames_per_clip} frames per clip; "
                "it must be at least 1"
            )

        # Create the output directory if it doesn't exist
        ensure_directory_exists(output_dir)

        # Some imageio readers report an unknown (infinite) length, which makes
        # len() fail; fall back to an explicit frame count in that case.
        try:
            num_frames = len(reader)
        except (TypeError, OverflowError):
            num_frames = reader.count_frames()

        for start_frame in range(0, num_frames, frames_per_clip):
            end_frame = min(start_frame + frames_per_clip, num_frames)

            # Check if this clip is too short (ignores the remainder)
            if end_frame - start_frame < clip_length * 0.5 * fps:
                break

            # Define the output file name
            output_file = os.path.join(output_dir, f"clip_{start_frame // frames_per_clip}.mp4")

            print(f"Clipping video {video_path} from frame {start_frame} to {end_frame}")

            # Write the segment to a new file; always close the writer so the
            # file is finalized (or at least released) even if a frame read fails.
            writer = imageio.get_writer(output_file, fps=fps, macro_block_size=1)
            try:
                for frame in range(start_frame, end_frame):
                    writer.append_data(reader.get_data(frame))
            finally:
                writer.close()

            print(f"Finished clipping video segment {output_file}")
    finally:
        # Release the underlying video handle / decoder process
        reader.close()

In [None]:
def extract_frames_from_video_cv2(file_path: str, fps: int = 30) -> None:
    """
    Given a video file path, extract frames using OpenCV (imageio is too slow for this).

    Frames are written as `frame_<index>.png` into a directory created next to the
    video and named `<parent directory name><video name without extension>`.
    If that directory already contains PNG files, nothing is extracted.

    Parameters:
    - file_path: Path to the video file.
    - fps: Desired frame rate of the extracted frames. If the video's own frame
      rate is lower than (or about equal to) this, every frame is saved.

    Raises:
    - ValueError: If `fps` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    # Create a directory called "<parent><clip name>" next to the video
    file_name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    file_dir = os.path.dirname(file_path)

    # Name of the directory that contains the video ("" for a bare file name)
    parent_video_name = os.path.basename(file_dir)
    frames_dir = create_directory(file_dir, directory_name=f'{parent_video_name}{file_name_without_extension}')

    if png_files_exist(frames_dir):
        print(f'Frames already exist in {frames_dir}')
        return

    # Use OpenCV to read the video and save frames
    cap = cv2.VideoCapture(file_path)
    try:
        if not cap.isOpened():
            print(f"Could not open video {file_path}")
            return

        video_fps = cap.get(cv2.CAP_PROP_FPS)

        # We'll only save every nth frame to match the desired FPS (fps parameter).
        # Clamp to 1 so that sources at or below the target rate (e.g. 29.97 or
        # 25 fps with fps=30) keep every frame instead of dividing by zero.
        n = max(1, round(video_fps / fps))

        frame_num = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_num % n == 0:
                frame_file_path = os.path.join(frames_dir, f"frame_{frame_num // n:07d}.png")
                cv2.imwrite(frame_file_path, frame)
            frame_num += 1
    finally:
        cap.release()

    print(f"Frames extracted to {frames_dir}")


In [None]:
import os
import boto3

# Bare S3 bucket name (no trailing slash), passed to `download_fileobj` below.
# NOTE: this assignment rebinds any AUS_AFL_BUCKET set earlier in the notebook.
AUS_AFL_BUCKET = "australia-fov"

def download_video_from_s3(video_path: str):
    """
    Download the specified video from the S3 bucket to local storage.
    If it already exists locally, ensure it has a reasonable size (> 10 KB);
    otherwise it is deleted and downloaded again.

    The local path is `video_path` with backslashes replaced by forward slashes.
    Download errors are printed rather than raised, and no file is left at the
    local path when the download fails.

    Parameters:
    - video_path: Path (S3 key) of video to process.
    """
    # Create a S3 client
    s3 = boto3.client('s3')

    local_path = video_path.replace("\\", "/")

    # Ensure the directory structure exists for the local file.
    # A key without a directory part has an empty dirname, which makedirs rejects.
    local_dir = os.path.dirname(local_path)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    # Check if the file already exists in local storage
    if os.path.exists(local_path):
        # Check if file size is greater than 10 KB
        if os.path.getsize(local_path) > 10 * 1024:
            print(f"{video_path} already exists in local storage and has a valid size!")
            return
        else:
            print(f"{video_path} exists but is too small. Re-downloading...")
            os.remove(local_path)

    # Download into a temporary sibling file and rename it once complete, so an
    # interrupted download can never be mistaken for a valid video later on.
    partial_path = local_path + ".part"

    print(f"Downloading {video_path}...")
    try:
        # Download the file
        with open(partial_path, 'wb') as f:
            s3.download_fileobj(AUS_AFL_BUCKET, video_path, f)
        os.replace(partial_path, local_path)
        print(f"{video_path} downloaded successfully!")
        print(f"File size in bytes: {get_file_size_in_bytes(local_path)}")
    except Exception as e:
        print(f"Error downloading {video_path}: {e}")
        if os.path.exists(local_path):
            os.remove(local_path)
    finally:
        # Clean up the partial file on any failure, including interruption
        if os.path.exists(partial_path):
            os.remove(partial_path)


## Processing time

In [None]:
for video_path in VIDEOS_TO_PROCESS:
    # Download the videos to local storage, but check if they exist already first
    download_video_from_s3(video_path)

    # The download helper stores the file under the key with forward slashes
    local_video_path = video_path.replace("\\", "/")
    if not os.path.exists(local_video_path):
        print(f"Skipping {video_path}: no local copy available")
        continue

    # Give every video its own clip directory: clips are always named clip_<n>.mp4,
    # so sharing one directory would overwrite clips and make frame extraction
    # skip later videos because frames "already exist".
    video_id = os.path.splitext(local_video_path)[0].replace("/", "_")
    clips_dir = os.path.join(TMP_DIR, video_id)
    os.makedirs(clips_dir, exist_ok=True)

    # Clip video into 1 minute clips
    clip_video_imageio(local_video_path, output_dir=clips_dir, clip_length=60)

    # Extract frames from clipped videos
    for video in find_files_with_ending(clips_dir, '.mp4'):
        extract_frames_from_video_cv2(video)

    # Remove images that don't contain the ball
    # TODO: to be implemented
