Split the data into sections based on the class : NOTHING, PASS_L, MAUL ETC (easy based on the frame counts in the RLE classification) and save each section as a separate video file: PASS_L_01,avi, PASS_L_02.mp3 etc etc.

THIS Needs a code review as any off-by-one error in calculating clips start/ends will accumulate of clips, and at 1000's of clips it could be massively off by the end!

The clip extarction method is block-sequential i.e. Clip 1 might be frames 1-35, clip 2 frames 36-70 etc etc.

We could make it a running block:
Clip 1: 1-35
Clip 2: 2-36 etc etc

Any clips short than the given lentgh are dumped - for long desired clip lentghs this can easily dump whole clips - particulalrly clips from classes that are inherant short (e.g. PASS).

Any residual frames from clips lomger than the desired clip length are similarly dumped.


In [1]:
import cv2
import numpy as np
# from google.colab.patches import cv2_imshow
import re
import os
import random
import shutil

In [2]:
def getVideoFrames(filepath,start,length):
  '''
    Function to return a set length extract of a video file starting from a given frame
    Parameters:
    filepath (str): path and file name of the video from which to extract a clip
    start (int): frame number to start the clip at
    length (int): length in frames of the desired clip

    Returns:
    list(np,array): a list of frames from the video as np.arrays
    tuple(int,int) : (width, height) of the captured clip
    fps (float): fps of the captured clip
  '''
  cap = cv2.VideoCapture(filepath)
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH ))
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT ))
  fps =  int(cap.get(cv2.CAP_PROP_FPS))
  #print(f"Width : {width} Height: {height} FPS: {fps}")
  cap.set(cv2.CAP_PROP_POS_FRAMES, start-1)
  frames_list=list()
  for frame_count in range(start,start+length):
    ret, frame = cap.read()
    frames_list.append(frame)
  cap.release()
  return frames_list, (width,height), fps

In [3]:
def getClipDetails(filepath):
  '''
    Function to Parse the label file into list of labels and run lengths
    Parameters:
    filepath (str): path and file name of the label file
    Returns:
    list(tuple(str,int)): a list of tuples of (label,frame length)
  '''
  with open(filepath ) as f:
      lines = f.readlines()

  label_tuples = list()
  for line in lines:
    #print(line)
    label_match = re.search(r'(?<=Label.)(.*)(?=:)',line)
    #print(label_match.group())
    frame_match = re.search(r'(?<=:)(.*)(?=\n)',line)
    if frame_match is None:
      frame_match = re.search(r'(?<=:)(.*)',line)
    #print(frame_match.group())
    label_tuples.append((label_match.group(),int(frame_match.group())))
  return label_tuples

In [4]:
# Define the location of the label file
labelpath = "/dcs/large/u1901447/labels/gallconcat.lbl"
# Define the location of the video we are processing
videopath = "/dcs/large/u1901447/videos/220611galleivnor_2_movie-001.mov"
# Define the folder we want to save the extracted fragments into
save_path_root = '/dcs/large/u1901447/videos/clips'
label_tuples = getClipDetails(labelpath)

In [5]:
# Create folders to store the extracted clips

all_labels = set(l[0] for l in label_tuples)
for label in all_labels:
  if not os.path.exists(f"{save_path_root}/all_35_frames/{label}"):
      # if the demo_folder directory is not presenth 
      # then create it.
      os.makedirs(f"{save_path_root}/all_35_frames/{label}")

Here we cut the long video into smaller clips based on the classification made in in the labels file. They are given filenames incrementally as the segentation progresses. Clips with less than 15 frames are dropped.

Note that if this program is run on different files the filenames WILL be overwritten!

In [6]:
# Setup a dict to store the counts of files of each label so we can have a unique file name for each type of label
# label_indexer = dict()
# Get an iterable of unique labels
all_labels = set(l[0] for l in label_tuples)
for label in all_labels:
  if not os.path.exists(f"{save_path_root}/all_35_frames/{label}"):
      # if the demo_folder directory is not present 
      # then create it.
      os.makedirs(f"{save_path_root}/all_35_frames/{label}")
# To incrementaly number the saved clips with unique file names
file_counter = 1
# To point to which frame we are currently at for the extraction function
frame_counter = 1
clip_length = 35

# Loop through the RLE data calling the frame extractor function, then saving the extracted clips to a new MP4 file
for label,frames_count in label_tuples:
    print(f"Processing a run of {frames_count} frames with label {label}")
    frames_to_process = frames_count
    # if the clip length is greater than required, split it into multiple clips
    while frames_to_process > clip_length:
        #  Get 'clip_length' frames from the given video
        frames, videodims, fps = getVideoFrames(videopath,frame_counter,clip_length)
        #print(f"Video Dims : {videodims} FPS: {fps}")
        videocap_name = f"{save_path_root}/all_{clip_length}_frames/{label}/{label}_{file_counter}.mp4"
        # increment the counter fot the number of clips processed so far (so each gets a un ique name)
        file_counter+=1
        out = cv2.VideoWriter(videocap_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, videodims)
        for frame in frames:
            out.write(frame)
        out.release()
        print(f"Saved filename {videocap_name} with label {label} and {clip_length} frames from {frame_counter} to {frame_counter+clip_length}")
        # update the count of remaining frames to process in this clip
        frames_to_process -= clip_length
        print(f"There are {frames_to_process} frames left in this clip")
        # move the start position on by 'clip_length' positions
        frame_counter += clip_length

    print(f"Dropped label a run of {label} with {frames_to_process} frames from {frame_counter} to {frame_counter+frames_to_process}")
    # Step the frame counter on to be ready for the next read
    frame_counter += frames_to_process


Processing a run of 249 frames with label NOTHING
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_1.mp4 with label NOTHING and 35 frames from 1 to 36
There are 214 frames left in this clip
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_2.mp4 with label NOTHING and 35 frames from 36 to 71
There are 179 frames left in this clip
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_3.mp4 with label NOTHING and 35 frames from 71 to 106
There are 144 frames left in this clip
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_4.mp4 with label NOTHING and 35 frames from 106 to 141
There are 109 frames left in this clip
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_5.mp4 with label NOTHING and 35 frames from 141 to 176
There are 74 frames left in this clip
Saved filename /dcs/large/u1901447/videos/clips/all_35_frames/NOTHING/NOTHING_6.mp4 with label NO

KeyboardInterrupt: 

This is a temporary solution that should be embedded in the code above.
It copied files generated above into separate train/test/validation folders - and subfolders for each class inside those.
This seems to be the defacto standard for tf/keras

In [7]:
clips_dir = r'/dcs/large/u1901447/videos/clips/all_35_frames'
dest_dir = r'/dcs/large/u1901447/videos/clips/data_35_frames'
# Make the folders if they dont exist
if not os.path.isdir(f"{dest_dir}"):
    os.mkdir(f"{dest_dir}")
if not os.path.isdir(f"{dest_dir}/val"):
    os.mkdir(f"{dest_dir}/val")
if not os.path.isdir(f"{dest_dir}/train"):
  os.mkdir(f"{dest_dir}/train")
if not os.path.isdir(f"{dest_dir}/test/{dir}"):
  os.mkdir(f"{dest_dir}/test")
# Define the proportions required in each collection
val_prop = 0.2
test_prop = 0.2
train_prop = 0.6
# Iterate the main directory folder
for root, dirs, files in os.walk(clips_dir):
    for dir in dirs:
      # Ignore any folders hidden with leading'.'
      if dir[0] == '.':
        continue
      else:
        # Iterate each Class folder, extracting a random set of file names from the folder
        # Val first, then test, and all that are left go into train
        for root2, dirs2, files2 in os.walk(f"{clips_dir}/{dir}"):
          full_files_set = set(files2)
          print(full_files_set)
          val_files = set(random.sample(full_files_set, int(len(full_files_set)*val_prop)))
          
          print(val_files)
          if not os.path.isdir(f"{dest_dir}/val/{dir}"):
            os.mkdir(f"{dest_dir}/val/{dir}")
          for file in val_files:
            # Source path
            source = f"{clips_dir}/{dir}/{file}"
            # Destination path
            destination = f"{dest_dir}/val/{dir}/{file}"
            # Move the content of source to destination
            dest = shutil.move(source, destination)
          rem_files = full_files_set - val_files
          test_files = set(random.sample(rem_files, int(len(full_files_set)*test_prop)))
          print(test_files)
          if not os.path.isdir(f"{dest_dir}/test/{dir}"):
            os.mkdir(f"{dest_dir}/test/{dir}")
          for file in test_files:
            # Source path
            source = f"{clips_dir}/{dir}/{file}"
            # Destination path
            destination = f"{dest_dir}/test/{dir}/{file}"
            # Move the content of source to destination
            dest = shutil.move(source, destination)
          train_files = rem_files - test_files
          print(train_files)
          if not os.path.isdir(f"{dest_dir}/train/{dir}"):
            os.mkdir(f"{dest_dir}/train/{dir}")
          for file in train_files:
            # Source path
            source = f"{clips_dir}/{dir}/{file}"
            # Destination path
            destination = f"{dest_dir}/train/{dir}/{file}"
            # Move the content of source to destination
            dest = shutil.move(source, destination)


{'TACKLE_S_D_2803.mp4', 'TACKLE_S_D_2802.mp4', 'TACKLE_S_D_3832.mp4', 'TACKLE_S_D_1667.mp4', 'TACKLE_S_D_633.mp4'}
{'TACKLE_S_D_2803.mp4'}
{'TACKLE_S_D_3832.mp4'}
{'TACKLE_S_D_633.mp4', 'TACKLE_S_D_2802.mp4', 'TACKLE_S_D_1667.mp4'}
{'PASS_L_1230.mp4', 'PASS_L_3031.mp4', 'PASS_L_1839.mp4', 'PASS_L_3829.mp4', 'PASS_L_1849.mp4', 'PASS_L_820.mp4'}
{'PASS_L_3031.mp4'}
{'PASS_L_3829.mp4'}
{'PASS_L_1230.mp4', 'PASS_L_820.mp4', 'PASS_L_1839.mp4', 'PASS_L_1849.mp4'}
{'CARRY_1795.mp4', 'CARRY_129.mp4', 'CARRY_1853.mp4', 'CARRY_3671.mp4', 'CARRY_3747.mp4', 'CARRY_2960.mp4', 'CARRY_2991.mp4', 'CARRY_638.mp4', 'CARRY_719.mp4', 'CARRY_1089.mp4', 'CARRY_1662.mp4', 'CARRY_465.mp4', 'CARRY_3354.mp4', 'CARRY_718.mp4', 'CARRY_2891.mp4', 'CARRY_796.mp4', 'CARRY_3356.mp4', 'CARRY_1649.mp4', 'CARRY_3571.mp4', 'CARRY_2861.mp4', 'CARRY_4257.mp4', 'CARRY_406.mp4', 'CARRY_3243.mp4', 'CARRY_21.mp4', 'CARRY_1095.mp4', 'CARRY_2862.mp4', 'CARRY_474.mp4', 'CARRY_717.mp4', 'CARRY_3257.mp4', 'CARRY_3039.mp4', 'CARRY_2

since Python 3.9 and will be removed in a subsequent version.
  val_files = set(random.sample(full_files_set, int(len(full_files_set)*val_prop)))
since Python 3.9 and will be removed in a subsequent version.
  test_files = set(random.sample(rem_files, int(len(full_files_set)*test_prop)))


{'TACKLE_S_3026.mp4', 'TACKLE_S_3445.mp4', 'TACKLE_S_2815.mp4', 'TACKLE_S_4088.mp4', 'TACKLE_S_229.mp4', 'TACKLE_S_810.mp4', 'TACKLE_S_4377.mp4', 'TACKLE_S_3129.mp4', 'TACKLE_S_2896.mp4', 'TACKLE_S_4398.mp4', 'TACKLE_S_987.mp4', 'TACKLE_S_1094.mp4', 'TACKLE_S_22.mp4', 'TACKLE_S_4325.mp4', 'TACKLE_S_3247.mp4', 'TACKLE_S_1096.mp4', 'TACKLE_S_976.mp4', 'TACKLE_S_2993.mp4', 'TACKLE_S_635.mp4', 'TACKLE_S_998.mp4', 'TACKLE_S_4330.mp4', 'TACKLE_S_804.mp4', 'TACKLE_S_798.mp4', 'TACKLE_S_797.mp4', 'TACKLE_S_1854.mp4', 'TACKLE_S_3969.mp4', 'TACKLE_S_561.mp4', 'TACKLE_S_1970.mp4', 'TACKLE_S_1358.mp4', 'TACKLE_S_10.mp4', 'TACKLE_S_611.mp4', 'TACKLE_S_1840.mp4', 'TACKLE_S_608.mp4', 'TACKLE_S_1843.mp4', 'TACKLE_S_4085.mp4', 'TACKLE_S_2889.mp4', 'TACKLE_S_1090.mp4', 'TACKLE_S_3966.mp4', 'TACKLE_S_2869.mp4', 'TACKLE_S_4395.mp4', 'TACKLE_S_29.mp4', 'TACKLE_S_46.mp4', 'TACKLE_S_30.mp4', 'TACKLE_S_2876.mp4', 'TACKLE_S_33.mp4', 'TACKLE_S_1099.mp4', 'TACKLE_S_4084.mp4', 'TACKLE_S_1574.mp4', 'TACKLE_S_799.m