# Dublin AFL Preprocessing

This notebook contains the code to prepare the AFL data for training, processing it from full videos hosted on S3 in Sydney, to frames that only contain the ball hosted in Dublin.

We assume that we have a `.xml` file with the annotations already uploaded to S3

In [9]:
print("Let's begin:)")##

Let's begin:)


In [None]:
# Install libraries
!pip install imageio[pyav]
!pip install imageio[ffmpeg]
!pip install opencv-python

In [10]:
# Import libraries
import boto3
import cv2
import imageio
import os
import subprocess

from typing import List, Dict
from sagemaker_utils import *

In [11]:
# Define constants
AUS_AFL_BUCKET: str = "australia-fov/"
DUB_AFL_BUCKET: str = "dublin-afl-preprocessed/"

if os.name == 'nt':
    TMP_DIR: str = "test-marvel"
else:
    TMP_DIR: str = "tmp"

VIDEOS_TO_PROCESS: List[str] = [
    "marvel/marvel-fov-3/20_08_2023/time_04_09_06_date_20_08_2023_.avi",
    # "marvel-fov-1\\20_08_2023\\marvel_1_time_04_09_04_date_20_08_2023_.avi",
    # "marvel-fov-8\\26_08_2023\\marvel_8_time_09_09_04_date_27_08_2023_.avi"
]

First, let's inspect our buckets

In [None]:
# Ensure that each video path exists on s3
s3 = boto3.client('s3')

print("Checking video paths exist on S3...")
for video_path in VIDEOS_TO_PROCESS:
    try:
        s3.head_object(Bucket=AUS_AFL_BUCKET, Key=video_path)
        print(f"Video {video_path} exists on S3")
    except:
        print(f"Video {video_path} does not exist on S3!")

## Utility functions

Next, we define a bunch of helper function

Next, we'll define functions that we'll use to process the videos. We will only operate on one video at a time so we don't need too much storage space locally.

In [11]:
def clip_video_opencv(video_path: str, output_dir: str, clip_length: int = 60) -> None:
    """
    Clip a video into multiple segments of the given length using OpenCV.

    Parameters:
    - video_path: Path to the video file.
    - output_dir: Directory to store the clipped videos.
    - clip_length: Duration of each clip in seconds (default is 60 seconds).
    """

    # Open the video using OpenCV
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Calculate the number of frames needed for the specified clip length
    frames_per_clip = clip_length * fps

    # Create the output directory if it doesn't exist
    ensure_directory_exists(output_dir)

    # Get the video file name without the extension
    file_name = os.path.splitext(os.path.basename(video_path))[0]

    # Get the marvel caemra number
    camera_number = [string[-1] for string in os.path.split(video_path) if 'marvel-fov' in string][0]

    clip_num = 0
    while True:
        output_file = os.path.join(output_dir, f"marvel_{camera_number}_{file_name}{clip_num}.mp4")

        # Define video writer for output
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_file, fourcc, fps, (int(cap.get(3)), int(cap.get(4))))

        # Extract frames and write to new video
        for _ in range(frames_per_clip):
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)

        # Release the current writer
        out.release()

        # If there were fewer frames left than half the desired clip length, we stop
        if ret and (total_frames - clip_num * frames_per_clip) < (0.5 * clip_length * fps):
            os.remove(output_file)
            break

        clip_num += 1

        # If we've reached the end of the video
        if not ret:
            break

    # Release video capture
    cap.release()


In [7]:
def extract_frames_from_video_cv2(file_path: str, fps: int = 30) -> None:
    """Given a video file path, extract frames using OpenCV (imageio is too slow for this)."""

    # Create a directory called "frames"
    file_name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    file_dir = os.path.dirname(file_path)

    frames_dir = create_directory(file_dir, directory_name=f'{file_name_without_extension}')

    if count_files_with_extension(frames_dir, '.png') > 1780:
        print(f'Frames already exist in {frames_dir}')
        return

    # Ensure the video path exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f'Video file not found at {file_path}')

    # Use OpenCV to read the video and save frames
    cap = cv2.VideoCapture(file_path)
    video_fps = int(cap.get(cv2.CAP_PROP_FPS))

    # We'll only save every nth frame to match the desired FPS (fps parameter)
    n = int(video_fps / fps)

    frame_num = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_num % n == 0:
            frame_file_path = os.path.join(frames_dir, f"frame_{frame_num // n:07d}.png")
            cv2.imwrite(frame_file_path, frame)
        frame_num += 1

    cap.release()
    print(f"Frames extracted to {frames_dir}")

In [6]:
import os
import boto3

AUS_AFL_BUCKET = "australia-fov"

def download_video_from_s3(video_path: str):
    """
    Download specified videos from the S3 bucket to local storage.
    If they already exist, ensure they have a reasonable size.

    Parameters:
    - video_path: Path of video to process.
    """
    # Create a S3 client
    s3 = boto3.client('s3')

    local_path = video_path.replace("\\", "/")

    # Ensure the directory structure exists for the local file
    if not os.path.exists(os.path.dirname(local_path)):
        os.makedirs(os.path.dirname(local_path))

    # Check if the file already exists in local storage
    if os.path.exists(local_path):
        # Check if file size is greater than 10 KB
        if os.path.getsize(local_path) > 10 * 1024:
            print(f"{video_path} already exists in local storage and has a valid size!")
            return
        else:
            print(f"{video_path} exists but is too small. Re-downloading...")
            os.remove(local_path)

    print(f"Downloading {video_path}...")
    try:
        # Download the file
        with open(local_path, 'wb') as f:
            s3.download_fileobj(AUS_AFL_BUCKET, video_path, f)
        print(f"{video_path} downloaded successfully!")
        print(f"File size in bytes: {get_file_size_in_bytes(local_path)}")
    except Exception as e:
        print(f"Error downloading {video_path}: {e}")
        if os.path.exists(local_path):
            os.remove(local_path)


In [7]:
# def remove_frames_without_ball(frame_dir: str)

## Processing time

In [8]:
for video_path in VIDEOS_TO_PROCESS:
    # Download the videos to local storage, but check if they exist already first
    # download_video_from_s3(video_path)

    # Clip video into 1 minute clips
    # clip_video_imageio(video_path, output_dir=TMP_DIR, clip_length=60)

    # Extract frames from clipped videos
    for video in find_files_with_ending(TMP_DIR, '.mp4'):
#         extract_frames_from_video_cv2(video)

        # frame_nums: List[int] = get_frames_containing_ball(XML_FILE)
        # file_names: List[str] = [get_frame_name_from_frame_number(frame_num) for frame_num in frame_nums]

    # Remove images that don't contain the ball
    # TODO: to be implemented


Found 1 files with ending .mp4 in test-marvel
Creating directory at test-marvel\1


KeyboardInterrupt: 