**PIPELINE:**

In this notebook:
1. Raw video -> labelled keypoints (YOLO)
2. Labelled keypoints video -> directory of images (split into frames)
3. populate labelling table with start/end frames of k=10-frame sequences with l=5 overlap, split by target person

Not in this notebook:
    4. [Manual labelling]
    5. feature generation
    6. punch classification model (RNN/LSTM)
    

keypoints format:

video id | sequence number | internal frame number(0 at seqstart) | target person | keypoints (as list)

-> order keypoints by the left-to-right order of their appearance in the first frame of the sequence

-> target person numbering resets per sequence

labelling format (punch):

video id | sequence number | start frame | end frame (=start+k) | target person | punch type (0-3) | include? (0-1)

-> 0 = no punch; 1 = straight; 2 = hook; 3 = uppercut



training data format:

from labelling table, take rows with include==1 and access their keypoints by video id, start frame and target person

video id | sequence number | start frame | end frame | target person | punch type | keypoints (as list)

In [1]:
import numpy as np
from ultralytics import YOLO
import cv2
import os

In [2]:
FRAMES_PER_SEQUENCE = 10
OVERLAP = 5

In [3]:
#Step 1: Extract keypoints from video
def extractKeypoints(video_fp,outputsDirectory,model):
    """
    Extract and return keypoints tensor from video_fp using YOLOv8l 2D pose estimation model
    """
    
    model = YOLO(f"{model}-pose.pt")
    results = model(video_fp, project=output_dir, name=output_name, stream=False, save=True,max_det=5, save_conf=True, vid_stride=2, conf=0.4)
    return results

In [4]:
#Step 2: split labelled keypoints video into a folder of individual frames at dir frames_fp
def splitVideoToFrames(labelledvideo_fp,frames_fp):
    """
    Split video of labelled keypoints at labelledvideo_fp into individual frames
    Save in directory frames_fp as jpg files
    """
    if not os.path.exists(frames_fp): os.makedirs(frames_fp)
    video = cv2.VideoCapture(labelledvideo_fp)
    if not video.isOpened():
        print(f"Error opening video {labelledvideo_fp}")
    i = 1
    while True:
        ret, frame = video.read()
        if not ret: break
        frame_filename = os.path.join(frames_fp, f"frame_{i:04d}.jpg")
        cv2.imwrite(frame_filename, frame)
        i += 1
    video.release()

In [162]:
#Step 3: Assign person IDs to each character by sequence

def assignIDNumbers(framesDir):
    """
    Assign person IDs to each character in each sequence
    Ordering: 
    
    Return value: dictionary of [1-indexed sequence number]-> dictionary of 
    [1-indexed character number]->[index in keypoints for each frame in sequence]
    
    Assignment order:
     - Left-to-right in first frame of sequence
     - If a person doesn't appear in the first frame of a sequence don't classify him.
     - One-indexed
    """
    
    def modified_cossim(v1,v2):
        """
        Helper function
        Modified cosine similarity: only nonzero elements are considered 
        """
        v1p = np.where((v1 != 0) & (v2 != 0), v1, 0)
        v2p = np.where((v1 != 0) & (v2 != 0), v2, 0)
        if np.sum(v1p)==0 or np.sum(v2p)==0:
            return 0
        return (v1p @ v2p)/(np.linalg.norm(v1p)*np.linalg.norm(v2p))
        
    def sortfn(x):
        """
        Helper function
        We will order people left-to-right by the minimum x coordinate of their nonzero 
        keypoints in the first frame of the sequence
        """
        arr = np.array(sequence[0].keypoints.xy[x-1].tolist())[:,0]
        if len(arr[arr>0])>0:
            return arr[arr > 0].min()
        return 0

    def findCameraChanges(framesDir, threshold=0.8):
        """
        Helper function
        Detect camera angle changes in a directory of frames. Return a list of zero-indexed frame numbers of such changes        
        """
        angle_change_frames = []
        prev_hist = None
    
        files = sorted(os.listdir(framesDir))
        
        for i,file in enumerate(files):
            if not file.endswith('.jpg'): continue  # Skip non-JPG files
            
            frame_path = os.path.join(framesDir, file)
            frame = cv2.imread(frame_path)
    
            if frame is None: 
                print(f'Found unreadable frame: {i}')
                continue #skip unreadable frames
            
            curr_hist = cv2.calcHist([frame], [0], None, [256], [0, 256])
            
            if prev_hist is not None:
                # Compare curr frame's histogram with prev frame's histogram
                similarity = cv2.compareHist(prev_hist, curr_hist, cv2.HISTCMP_CORREL)
                
                if similarity < threshold:
                    angle_change_frames.append(int(file[-8:-4])-1)
            
            prev_hist = curr_hist
    
        return angle_change_frames
    
    ID_assignments = dict()
    num_frames = len(results)

    angleChangeFrames = findCameraChanges(framesDir)    
    for s in range(0,num_frames,OVERLAP):
        print(f"Processing sequence {int(s/OVERLAP) + 1}")
        sequence = results[s:s+FRAMES_PER_SEQUENCE]
        if len(sequence) != FRAMES_PER_SEQUENCE: break #means we're done processing

        #ensure that the sequence doesn't contain any frames with camera angle changes
        skip = False
        for i in range(s,s+FRAMES_PER_SEQUENCE):
            if i in angleChangeFrames:
                skip = True
                break
        if skip: continue

        #ensure that the sequence doesn't contain any frames without ID'd characters
        skip = False
        for i in range(FRAMES_PER_SEQUENCE):
            if len(sequence[i].keypoints.xy[0])==0:
                skip = True     
                break
        if skip: continue
    
        #characters dictionary
        #character with key i will contain a list which is his tensor array indexes for each frame
        characters = dict() 
    
        #name pts left to rights from first frame in seq
        order = sorted([r+1 for r in range(len(sequence[0].keypoints.xy))], key =sortfn) 
        for r in range(len(sequence[0].keypoints.xy)): characters[order[r]] = [r]
    
        sims = [] #keep track of avg similarity at each new layer
        bestsim = 0
        
        #match indexes at curr "layer" to those from prev layer "layer-1"
        for layer in range(1,FRAMES_PER_SEQUENCE):                
            
            indexes = [r for r in range(len(sequence[layer].keypoints.xy))]
            seq_num = int(s/OVERLAP) + 1
                
            #match every character in "layer" one at a time
            for key in characters.keys():
                #if character doesnt exist in prev layer then skip it
                if characters[key][-1] >= len(sequence[layer-1].keypoints.xy): continue
    
                #vector of character[key] in prev layer
                v1 = np.array(sequence[layer-1].keypoints.xy[characters[key][-1]].tolist()).flatten()
                bestindex = -1; maxsim = 0
    
                #if we've used up all of the indexes just break
                if not indexes:
                    characters[key].append(-1)
                    continue
    
                #figure out the best index to match to character[key] in layer
                for index in indexes:
                    v2 = np.array(sequence[layer].keypoints.xy[index].tolist()).flatten()
                    sim = modified_cossim(v1,v2)
                    if sim >= maxsim:
                        maxsim = sim
                        bestindex = index
                
                characters[key].append(bestindex)
                if bestindex != -1: indexes.remove(bestindex)
    
        ID_assignments[int(s/OVERLAP)+1] = characters #key: 1-indexed seq number, value: character assignments dict
    return ID_assignments

**INSTRUCTIONS FOR USE:**

To run the preprocessing script, you need:
 - raw videos in file folder "raw", mp4 file format
 - created output directory "labelled"
Note that file paths are all relative, so keep this notebook in the same directory that contains directories "raw" and "labelled"

Additionally, the following libraries must be installed on your system: ultralytics, numpy, opencv-python (cv2)


In [6]:
rawvideos_dir = "raw"
video_fp = "raw/short10.mp4" #video path
output_dir = "labelled" #make sure it actually exists
model = "yolov8l"

output_name = f"{video_fp[len(rawvideos_dir)+1:][:-4]}"

labelledvideo_fp = f"{output_dir}/{output_name}/{video_fp[len(rawvideos_dir)+1:]}"
frames_fp = f"{output_dir}/{output_name}/frames"
keypoints_fp = f"{output_dir}/{output_name}/keypoints.csv"
table_fp = f"{output_dir}/{output_name}/table.csv"

In [7]:
results = extractKeypoints(video_fp,output_dir,model)
splitVideoToFrames(labelledvideo_fp,frames_fp)

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8l-pose.pt to 'yolov8l-pose.pt'...


100%|██████████| 85.3M/85.3M [00:13<00:00, 6.48MB/s]




errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs



[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


video 1/1 (1/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 4 persons, 3165.3ms
video 1/1 (2/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 3 persons, 2110.5ms
video 1/1 (3/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 3 persons, 1876.4ms
video 1/1 (4/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 4 persons, 1705.6ms
video 1/1 (5/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 5 persons, 1686.1ms
video 1/1 (6/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 4 persons, 1675.8ms
video 1/1 (7/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 5 persons, 1672.4ms
video 1/1 (8/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 4 persons, 1815.2ms
video 1/1 (9/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 4 persons, 2024.5ms
video 1/1 (10/125) /Users/yuchenli/Documents/CS229/raw/short10.mp4: 384x640 3 persons, 1827.5ms
video 1/1 (11/125) /Users/yuchenli/Documents/CS22

In [163]:
IDs = assignIDNumbers(frames_fp)

Processing sequence 1
Processing sequence 2
Processing sequence 3
Processing sequence 4
Processing sequence 5
Processing sequence 6
Processing sequence 7
Processing sequence 8
Processing sequence 9
Processing sequence 10
Processing sequence 11
Processing sequence 12
Processing sequence 13
Processing sequence 14
Processing sequence 15
Processing sequence 16
Processing sequence 17
Processing sequence 18
Processing sequence 19
Processing sequence 20
Processing sequence 21
Processing sequence 22
Processing sequence 23
Processing sequence 24
Processing sequence 25


In [9]:
IDs

{1: {3: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
  2: [2, 2, 2, 2, 2, 1, 2, 2, 2, 2],
  4: [3, -1, -1, 3, 4, 3, 4, 3, 3, -1]},
 2: {2: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 2, 2, 2, 2, 2, 2, 2, 2, 2],
  3: [2, 1, 1, 1, 1, 1, 1, 1, 1, 4],
  4: [3, 4, 3, 3, -1, -1, 3, 3, 3, 1]},
 3: {3: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 1, 1, 1, 4, 3, 4, 1, 1, 1],
  2: [2, 2, 2, 2, 2, 2, 2, 2, 2, 3]},
 4: {3: [0, 0, 0, 0, 0, 2, 2, 2, 3, 0],
  1: [1, 4, 1, 1, 1, 0, 1, 0, 2, 1],
  4: [2, 2, 2, 2, 3, 1, 0, 1, 0, 2],
  2: [3, 3, 3, 3, 4, -1, 4, 4, 1, -1],
  5: [4, 1, -1, -1, 2, 3, 3, 4]},
 5: {3: [0, 1, 0, 2, 1, 2, 2, 2, 2, 1],
  2: [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  1: [2, 2, 2, 3, 2, 1, 1, 1, 1, 2]},
 6: {2: [0, 2, 2, 2, 1, 0, 1, 1, 1, 0],
  1: [1, 1, 1, 1, 2, 2, 2, 2, 3, 1],
  3: [2, 0, 0, 0, 0, 1, 0, 0, 0, 3],
  4: [3, 4, 3, 4, 3, 3, 3, 3, 4, 2],
  5: [4, 3, 4, 3, -1, 4, -1, 4, 2, -1]},
 7: {4: [0, 1, 1, 1, 0, 0, 0, 1, 0, 0],
  3: [1, 0, 0, 0, 1, 1, 1, 0, 1, 1],
  2: 

# Write keypoints to CSV

In [75]:
#write keypoints data to csv
#after this, another fn needs to populate the labelling table. 
lines = []
import csv 

for sequence in IDs.keys():
    seqstart = (sequence-1)*OVERLAP   
    for internal_frame in range(FRAMES_PER_SEQUENCE):
        external_frame = seqstart + internal_frame
        for person in IDs[sequence].keys():
            if len(IDs[sequence][person]) > internal_frame and IDs[sequence][person][internal_frame]!=-1:
                index = IDs[sequence][person][internal_frame]
                #print(index)
                #print(results[external_frame].keypoints.xy)
                keypoints = np.array(results[external_frame].keypoints.xy[index].tolist()).flatten().tolist()
                line = [output_name, sequence, internal_frame, person] + keypoints
                lines.append(line)
                
with open(keypoints_fp, 'w', newline='') as file:
    writer = csv.writer(file)
    # Write all lines to the CSV file
    writer.writerows(lines)            

## Debugging code for fixing a bug (now fixed)

In [40]:
for i in IDs.keys():
    for j,pos_index in enumerate(IDs[i].values()):
        if not len(pos_index) == 10:
            frame_start = i * OVERLAP + 1
            print(f'Sequence {i}, frame_start {frame_start} found {pos_index}')

Sequence 4, frame_start 21 found [4, 1, -1, -1, 2, 3, 3, 4]
Sequence 7, frame_start 36 found [4, -1, 4, 2, -1, -1, 2]
Sequence 9, frame_start 46 found [4, 4, 3, 2, -1, 3, 2, 3]
Sequence 10, frame_start 51 found [4, 3, 3, 2, 3, 4, 4]
Sequence 15, frame_start 76 found [4, -1, -1, 2, 1, 1, 3, 4, 3]
Sequence 16, frame_start 81 found [4, 4, 3, 3, 2, 2, 3, 3]
Sequence 20, frame_start 101 found [4, 4, 4, 4, 2, 3, 3, 2]
Sequence 21, frame_start 106 found [2, 2, 2, 2, 3, 4, 1, 2, -1]
Sequence 21, frame_start 106 found [3, 3, -1, 3, 4, 0, 2, 3]
Sequence 21, frame_start 106 found [4, 4, 2, 3, -1, 4]
Sequence 22, frame_start 111 found [2, 3, 1, -1, -1]
Sequence 22, frame_start 111 found [3, 2, 3]
Sequence 22, frame_start 111 found [4, -1, 4]


In [61]:
ID_test = assignIDNumbers(f"{output_dir}/{output_name}/frames")
ID_test

[93]
Processing sequence 1
Processing sequence 2
Processing sequence 3
Processing sequence 4
Processing sequence 5
Processing sequence 6
Processing sequence 7
Processing sequence 8
Processing sequence 9
Processing sequence 10
Processing sequence 11
Processing sequence 12
Processing sequence 13
Processing sequence 14
Processing sequence 15
Processing sequence 16
Processing sequence 17
Processing sequence 18
Processing sequence 19
Processing sequence 20
Processing sequence 21
Processing sequence 22
Local frame 1. Indexes = [0, 1, 2, 3] 
Characters = {3: [0], 2: [1], 4: [2], 5: [3], 1: [4]}
Checking character 3
For 3, best positional index based on cossim is 0
For 3, best positional index based on cossim is 1
Checking character 2
For 2, best positional index based on cossim is 0
Checking character 4
For 4, best positional index based on cossim is 2
For 4, best positional index based on cossim is 3
Checking character 5
For 5, best positional index based on cossim is 2
Checking character 1


{1: {3: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
  2: [2, 2, 2, 2, 2, 1, 2, 2, 2, 2],
  4: [3, -1, -1, 3, 4, 3, 4, 3, 3, -1]},
 2: {2: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 2, 2, 2, 2, 2, 2, 2, 2, 2],
  3: [2, 1, 1, 1, 1, 1, 1, 1, 1, 4],
  4: [3, 4, 3, 3, -1, -1, 3, 3, 3, 1]},
 3: {3: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  1: [1, 1, 1, 1, 4, 3, 4, 1, 1, 1],
  2: [2, 2, 2, 2, 2, 2, 2, 2, 2, 3]},
 4: {3: [0, 0, 0, 0, 0, 2, 2, 2, 3, 0],
  1: [1, 4, 1, 1, 1, 0, 1, 0, 2, 1],
  4: [2, 2, 2, 2, 3, 1, 0, 1, 0, 2],
  2: [3, 3, 3, 3, 4, -1, 4, 4, 1, -1],
  5: [4, 1, -1, -1, 2, -1, 3, 3, 4, -1]},
 5: {3: [0, 1, 0, 2, 1, 2, 2, 2, 2, 1],
  2: [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  1: [2, 2, 2, 3, 2, 1, 1, 1, 1, 2]},
 6: {2: [0, 2, 2, 2, 1, 0, 1, 1, 1, 0],
  1: [1, 1, 1, 1, 2, 2, 2, 2, 3, 1],
  3: [2, 0, 0, 0, 0, 1, 0, 0, 0, 3],
  4: [3, 4, 3, 4, 3, 3, 3, 3, 4, 2],
  5: [4, 3, 4, 3, -1, 4, -1, 4, 2, -1]},
 7: {4: [0, 1, 1, 1, 0, 0, 0, 1, 0, 0],
  3: [1, 0, 0, 0, 1, 1, 1, 0, 1, 1

## output frames with character labels

In [167]:
# save bounding boxes
lines = []
boxes_fp = f"{output_dir}/{output_name}/boxes.csv"

for i in range(len(results)):
    lines.append(results[i].boxes)

with open(boxes_fp, 'w', newline='') as file:
    writer = csv.writer(file)
    # Write all lines to the CSV file
    writer.writerows(lines)            

In [74]:
results[0].boxes.xyxy

tensor([[ 566.,   93.,  756.,  685.],
        [ 690.,   87.,  824.,  683.],
        [ 506.,   35.,  664.,  679.],
        [1166.,  289., 1274.,  385.]])

In [135]:
'''
Params: 

frame_path: filepath of the frame (image file) 
person_ids: 
'''

def draw_boxes_with_ids(frame_path, person_ids, positional_indexes, boxes):
    frame = cv2.imread(f'{frame_path}.jpg')
    
    for person in person_ids:
        x1, y1, x2, y2 = boxes.xyxy[positional_indexes[person]].numpy()

        # Display person ID
        cv2.putText(frame, str(person), (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 255, 255), 3)
    
    cv2.imwrite(f'{frame_path}_labels.jpg', frame)


In [136]:
for i_seq in sequences:
    person_ids = IDs[i_seq].keys()
    for i_frame in range(FRAMES_PER_SEQUENCE):
        external_frame = (i_seq-1) * OVERLAP + i_frame
        positional_indexes = {}
        for person in person_ids:
            positional_indexes[person] = IDs[i_seq][person][i_frame]
        
        frame_path = os.path.join(frames_fp, f"frame_{external_frame+1:04d}")
        
        draw_boxes_with_ids(frame_path, person_ids, positional_indexes, results[external_frame].boxes)

In [169]:
# Save IDs
import pickle
IDs_fp = f"{output_dir}/{output_name}/IDs.pkl"

with open(IDs_fp, 'wb') as file:
    pickle.dump(IDs, file)

### If a person disappears during a sequence i.e., (-1) positional index, remove them from the sequence

In [153]:
for frame in sorted(os.listdir(frames_fp)): 
    frame_path = os.path.join(frames_fp,frame)
    # true frame number (of vdeo)
    external_frame = int(frame_path.split('_')[1][:4])
    
    # sequence index
    seq = int(external_frame/OVERLAP) + 1
    
    # check if the sequence is actually kept (may have been deleted during preprocess)
    
    # frame within the sequence 
    internal_frame = external_frame % OVERLAP
    
    positional_indexes = {}
    person_ids = IDs[seq].keys()
    for person in person_ids:
        positional_indexes[person] = IDs[seq][person][internal_frame]

    frame_path = os.path.join(frames_fp, f"frame_{external_frame+1:04d}")

    draw_boxes_with_ids(frame_path, person_ids, positional_indexes, results[external_frame].boxes)

KeyError: 18