In [None]:
import cv2
import numpy as np

# Network input resolution. Smaller values run faster but tend to miss small
# objects (adjust as needed).
input_width = 320
input_height = 320

# Minimum class score for a candidate box to be kept, and the overlap threshold
# passed to non-maximum suppression.
conf_threshold = 0.5
nms_threshold = 0.4

# Load YOLO (the tiny variant is used here for speed) and its class names.
net = cv2.dnn.readNet('cfg/yolov3-tiny.weights', 'cfg/yolov3-tiny.cfg')
with open('cfg/coco.names', 'r') as f:
    classes = f.read().splitlines()

# The output layer names do not change between frames, so query them once.
output_layers_names = net.getUnconnectedOutLayersNames()

# One color per class, generated once so a class keeps the same color on
# every frame instead of flickering.
colors = np.random.uniform(0, 255, size=(max(len(classes), 1), 3))
font = cv2.FONT_HERSHEY_PLAIN


def decode_detections(layer_outputs, frame_width, frame_height, threshold=0.5):
    """Convert raw network outputs into pixel boxes, scores and class ids.

    Each element of ``layer_outputs`` is read as a 2-D array whose rows are
    ``[center_x, center_y, width, height, <unused>, class scores...]``.
    The four box values are treated as fractions of the image size and are
    scaled by ``frame_width`` / ``frame_height``, so passing the dimensions
    of the frame that will be drawn on yields boxes in that frame's pixels.

    Args:
        layer_outputs: Iterable of per-layer detection arrays.
        frame_width: Width in pixels of the target frame.
        frame_height: Height in pixels of the target frame.
        threshold: Rows whose best class score is not above this are dropped.

    Returns:
        A ``(boxes, confidences, class_ids)`` tuple of parallel lists, where
        each box is ``[x, y, w, h]`` with ``(x, y)`` the top-left corner.
    """
    boxes = []
    confidences = []
    class_ids = []
    for output in layer_outputs:
        output = np.asarray(output)
        if output.size == 0:
            continue
        scores = output[:, 5:]
        best_class = scores.argmax(axis=1)
        best_score = scores[np.arange(len(output)), best_class]
        # Only the rows that pass the threshold need Python-level handling.
        for row in np.flatnonzero(best_score > threshold):
            center_x = int(output[row, 0] * frame_width)
            center_y = int(output[row, 1] * frame_height)
            w = int(output[row, 2] * frame_width)
            h = int(output[row, 3] * frame_height)
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)

            boxes.append([x, y, w, h])
            confidences.append(float(best_score[row]))
            class_ids.append(int(best_class[row]))
    return boxes, confidences, class_ids


# Open video capture
cap = cv2.VideoCapture('videos/sst_foyer_1.mp4')
if not cap.isOpened():
    print("Could not open video source. Exiting...")

try:
    # Loop through video frames
    while cap.isOpened():
        ret, frame = cap.read()

        # Stop when no frame could be read
        if not ret:
            print("Can't receive frame (stream end?). Exiting...")
            break

        frame_height, frame_width = frame.shape[:2]

        # blobFromImage resizes to the network input size itself, so the frame
        # is passed in directly (it also normalizes and swaps the R/B channels).
        blob = cv2.dnn.blobFromImage(
            frame, 1 / 255.0, (input_width, input_height), swapRB=True, crop=False
        )
        net.setInput(blob)
        layer_outputs = net.forward(output_layers_names)

        # Scale boxes to the ORIGINAL frame size, because that is the image
        # they are drawn on below.
        boxes, confidences, class_ids = decode_detections(
            layer_outputs, frame_width, frame_height, conf_threshold
        )

        # Non-maximum suppression removes overlapping duplicates. Skip the call
        # when there is nothing to suppress.
        if boxes:
            indexes = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
        else:
            indexes = ()

        # np.asarray handles every shape NMSBoxes may return, including the
        # empty tuple it gives when no box survives.
        for i in np.asarray(indexes, dtype=int).reshape(-1):
            x, y, w, h = boxes[i]
            class_id = class_ids[i]
            # Fall back to the numeric id if the names file is shorter than
            # the number of classes the model predicts.
            label = classes[class_id] if class_id < len(classes) else str(class_id)
            confidence = str(round(confidences[i], 2))
            color = tuple(colors[class_id % len(colors)].tolist())
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label + " " + confidence, (x, y + 20), font, 2, (255, 255, 255), 2)

        # Display the resulting frame
        cv2.imshow('SST Foyer Video', frame)

        # Exit if 'q' key is pressed (mask to the low byte of the key code)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close windows even if a frame raised an error
    cap.release()
    cv2.destroyAllWindows()
