In [1]:
from PIL import Image
import matplotlib.pyplot as plt
import torch
import numpy as np
import cv2
import mss
import time
import os
import warnings
import logging
from concurrent.futures import ThreadPoolExecutor
import pathlib

In [2]:
# CONSTANTS

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directory that recorded videos are written to.
VID_OUTPUT_DIR = 'video_out'

# True: run detection on each captured frame and print the notes (no video is written).
# False: only record the captured frames to a video file.
DETECT = True

# Capture rate (frames per second) and length of the capture (seconds).
FPS = 30
SECONDS_TO_CAPTURE = 60

In [3]:
# Report which GPU is in use; nothing is printed when running on the CPU.
if DEVICE == torch.device('cuda'):
    gpu_name = torch.cuda.get_device_name()
    # mem_get_info() returns (free, total) in bytes; index 0 is the free amount.
    free_bytes = torch.cuda.mem_get_info()[0]
    print(f'Using {DEVICE} {gpu_name} with {free_bytes/(1024**2)} MB of VRAM')

Using cuda NVIDIA GeForce RTX 3050 Ti Laptop GPU with 3302.4500007629395 MB of VRAM


In [4]:
if not os.path.exists('yolov5'):
  !git clone https://github.com/ultralytics/yolov5
  !pip install -r yolov5/requirements.txt
  

if not os.path.exists(VID_OUTPUT_DIR):
  os.makedirs(VID_OUTPUT_DIR)
  
warnings.simplefilter("ignore", FutureWarning)
logging.getLogger('ultralytics').setLevel(logging.ERROR)

In [5]:
# Workaround for loading the checkpoint on Windows (presumably best.pt holds
# pickled PosixPath objects -- see the linked issue). The alias is applied only
# on Windows: on other platforms WindowsPath cannot be instantiated, so patching
# there would break the load instead of fixing it.
if os.name == 'nt':
  pathlib.PosixPath = pathlib.WindowsPath # https://github.com/ultralytics/yolov5/issues/10240#issuecomment-1662573188

# Load the custom-trained YOLOv5 weights through torch.hub.
# force_reload=True refreshes the cached hub repository on every run.
model = torch.hub.load('ultralytics/yolov5', 'custom', path='./models/best.pt', force_reload=True)

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to C:\Users\bohui/.cache\torch\hub\master.zip
YOLOv5  2024-11-14 Python-3.11.5 torch-2.5.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3050 Ti Laptop GPU, 4096MiB)

Fusing layers... 
Model summary: 157 layers, 7018216 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


In [6]:
# Default lane layout: lane index -> (start_x, end_x), inclusive, in pixels of
# the captured frame. Neighbouring lanes overlap slightly; the first match wins.
_DEFAULT_LANES = {
  0 : (10, 180),
  1 : (150, 320),
  2 : (300, 470),
  3 : (440, 610)
}


def _lane_for(x_center, lanes):
  """Return the lane containing x_center, or the nearest lane if none contains it."""
  for lane, (start, end) in lanes.items():
    if start <= x_center <= end:
      return lane

  # x_center lies outside every lane: snap to the lane whose edge is closest
  # instead of silently reporting whichever lane was iterated last.
  return min(
    lanes,
    key=lambda lane: min(abs(x_center - lanes[lane][0]), abs(x_center - lanes[lane][1]))
  )


def detect(img, model, conf_threshold=0.50, lanes=None):
  """Run the model on one frame and list the detected notes, lowest on screen first.

  Args:
    img: Frame handed to ``model`` unchanged.
    model: Callable whose result exposes ``xyxy[0]``, an iterable of rows
      indexed as ``[x1, y1, x2, y2, confidence, class_id]``.
    conf_threshold: Detections with a confidence below this value are dropped.
    lanes: Optional mapping ``lane -> (start_x, end_x)`` with inclusive bounds.
      Defaults to the built-in four-lane layout.

  Returns:
    A list of ``[class_id, lane, y_center]`` entries sorted by ``y_center`` in
    descending order. Classes are 0: end_hold, 1: note, 2: start_hold.
  """
  if lanes is None:
    lanes = _DEFAULT_LANES

  notes = []
  res = model(img)

  for box in res.xyxy[0]:
    # Skip detections the model is not confident enough about
    if box[4] < conf_threshold:
      continue

    x_center = int((box[0] + box[2]) / 2)
    y_center = int((box[1] + box[3]) / 2)
    class_id = int(box[5])

    notes.append([class_id, _lane_for(x_center, lanes), y_center])

  # Largest y first; the sort is stable, so ties keep detection order.
  notes.sort(key=lambda note: note[2], reverse=True)
  return notes

def capture(region):
  """Grab a single screenshot of ``region`` and return it.

  A new mss instance is opened for the grab and closed again before returning.
  ``region`` is forwarded unchanged to ``grab``.
  """
  with mss.mss() as grabber:
    screenshot = grabber.grab(region)
  return screenshot

In [7]:
frames_to_capture = FPS * SECONDS_TO_CAPTURE
frame_interval = 1 / FPS  # Time budget per frame, in seconds
frames = []  # Not used in this cell; kept in case another cell refers to it
output_path = f'{VID_OUTPUT_DIR}/output_1.mp4'

# A single mss instance serves the monitor lookup and every grab, and the
# `with` block closes it even when the capture is interrupted.
with mss.mss() as sct:
  monitor_1 = sct.monitors[1]
  print(f'Monitor: {monitor_1}')
  top, left = monitor_1['top'], monitor_1['left']
  width, height = monitor_1['width'], monitor_1['height']
  region_1 = {'left': left+int(width * 0.338), 'top': top, 'width': width-int(width * 0.673), 'height': height} # Gameplay region
  # region_2 = {'left': left+int(width * 0.67), 'top': top, 'width': width-int(width * 0.98), 'height': height-int(height * 0.33)} # Judgement counter region
  # region_3 = {'left': left+int(left * 0.859), 'top': top, 'width': width-int(width * 0.820), 'height': height-int(height * 0.952)} # Score region
  # NOTE(review): region_3 scales `left` by itself; possibly `width` was intended -- confirm before enabling.

  print(f'Capturing {region_1}')

  # Open the video file only when recording, so a detection run does not
  # overwrite an earlier recording with an empty file.
  out_1 = None
  if not DETECT:
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_1 = cv2.VideoWriter(output_path, fourcc, FPS, (region_1['width'], region_1['height']))

  try:
    for _ in range(frames_to_capture):
      frame_start = time.perf_counter()
      # The original code converted this array with COLOR_BGRA2BGR, so the grab is treated as BGRA.
      frame = np.array(sct.grab(region_1))

      if DETECT:
        # NOTE(review): the frame is passed in the grab's channel order, as before;
        # confirm this matches what the model expects.
        notes = detect(frame, model) # Outputs notes in [[class_id, lane_num, y_center_pos]...]
        print(notes)
      else:
        out_1.write(cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR))

      # If this iteration finished faster than the frame budget, wait out the rest.
      # The remainder is computed once so time.sleep() never gets a negative value.
      remaining = frame_interval - (time.perf_counter() - frame_start)
      if remaining > 0:
        time.sleep(remaining)
  finally:
    # Finalize the video even if the loop is interrupted (e.g. KeyboardInterrupt).
    if out_1 is not None:
      out_1.release()
      print(f'Capture region saved to {VID_OUTPUT_DIR}/output_1.mp4')

Monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}
Capturing {'left': 648, 'top': 0, 'width': 628, 'height': 1080}
[[2, 0, 736], [0, 0, 424], [1, 1, 155]]
[[0, 0, 646], [1, 1, 377], [1, 1, 81], [1, 2, 80]]
[[0, 0, 667], [1, 1, 398], [1, 2, 103], [1, 1, 103]]
[[0, 0, 687], [1, 1, 421], [1, 2, 125], [1, 1, 125]]
[[0, 0, 710], [1, 1, 441], [1, 1, 148], [1, 2, 147]]
[[0, 0, 732], [1, 1, 467], [1, 2, 171], [1, 1, 171]]
[[0, 0, 757], [1, 1, 489], [1, 1, 195], [1, 2, 194]]
[[0, 0, 778], [1, 1, 511], [1, 2, 216], [1, 1, 216]]
[[0, 0, 802], [1, 1, 534], [1, 2, 241], [1, 1, 240]]
[[0, 0, 824], [1, 1, 558], [1, 2, 263], [1, 1, 263], [2, 2, 20]]
[[0, 0, 846], [1, 1, 580], [1, 1, 285], [1, 2, 284], [2, 2, 30]]
[[0, 0, 862], [1, 1, 599], [1, 1, 306], [1, 2, 305], [2, 2, 41]]
[[0, 0, 874], [1, 1, 622], [1, 2, 327], [1, 1, 327], [2, 2, 53]]
[[1, 1, 647], [1, 2, 353], [1, 1, 353], [2, 2, 65]]
[[1, 1, 668], [1, 2, 373], [1, 1, 373], [2, 2, 75]]
[[1, 1, 692], [1, 2, 397], [1, 1, 397], [2, 2, 9

KeyboardInterrupt: 