In [2]:
# Annotate a saved video every other second
import cv2
from tqdm.notebook import tqdm
import time
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import random
from IPython.display import Video
from yolo_model import YoloS
from transformers import AutoFeatureExtractor
from PIL import ImageDraw
from utils import rescale_bboxes
import torch
import wandb


# Release unused memory held by PyTorch's CUDA caching allocator
torch.cuda.empty_cache()

experiment_name = 'yolo-tiny-deer-non-deer'
device = 'cpu'
checkpoint_path = f"checkpoints/{experiment_name}.ckpt"

# Restore the fine-tuned detector from the checkpoint named after the experiment
model = YoloS.load_from_checkpoint(checkpoint_path, num_labels=2)

# NOTE(review): the load log reports a hustvl/yolos-tiny backbone while this
# extractor comes from hustvl/yolos-small -- confirm it matches the
# preprocessing that was used during training.
feature_extractor = AutoFeatureExtractor.from_pretrained("hustvl/yolos-small")

# Switch to evaluation mode, then place the model on the chosen device
# (as the last expression of the cell, this also displays the module repr)
model.eval()
model.to(device)

Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-tiny and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([3]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 192]) in the checkpoint and torch.Size([3, 192]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


YoloS(
  (model): YolosForObjectDetection(
    (vit): YolosModel(
      (embeddings): YolosEmbeddings(
        (patch_embeddings): YolosPatchEmbeddings(
          (projection): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (interpolation): InterpolateInitialPositionEmbeddings()
      )
      (encoder): YolosEncoder(
        (layer): ModuleList(
          (0-11): 12 x YolosLayer(
            (attention): YolosAttention(
              (attention): YolosSelfAttention(
                (query): Linear(in_features=192, out_features=192, bias=True)
                (key): Linear(in_features=192, out_features=192, bias=True)
                (value): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): YolosSelfOutput(
                (dense): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Drop

In [3]:
# Authenticate with Weights & Biases, then open the run for this experiment
wandb.login()
run = wandb.init(name=experiment_name, project='deer_detection')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwhitew1994[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [29]:
def get_frame_bbox(image, model, feature_extractor, image_shape, threshold=0.5, device='cpu'):
    """
    Run the detector on one PIL image and return the confident detections.

    Args:
        image: PIL image to run through the model.
        model: Detection model; called as ``model(pixel_values=...)`` and expected
            to return an object exposing ``logits`` and ``pred_boxes``.
        feature_extractor: Preprocessor called as
            ``feature_extractor(images=image, return_tensors="pt")``.
        image_shape: Target size forwarded unchanged to ``rescale_bboxes``.
        threshold: Minimum best-class probability a query must exceed to be kept.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        Tuple ``(bboxes_scaled, probas)``: the boxes of the kept queries after
        ``rescale_bboxes`` and the matching per-class probabilities, both on CPU.
    """
    # Preprocess the image; with return_tensors="pt" a single image already
    # comes back with a leading batch dimension, so no reshaping is needed.
    preprocessed_image = feature_extractor(images=image, return_tensors="pt")
    image_tensor = preprocessed_image['pixel_values'].to(device)

    # Pure inference: disable autograd so no graph is built and kept in memory
    # for every processed frame.
    with torch.no_grad():
        outputs = model(pixel_values=image_tensor)

    # Class probabilities of the first batch item, with the last logit column
    # dropped (presumably the "no object" class -- TODO confirm).
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > threshold

    bboxes_scaled = rescale_bboxes(outputs.pred_boxes[0, keep].cpu(), image_shape)
    # Return the scores on CPU as well so both results live on the same device.
    return bboxes_scaled, probas[keep].cpu()

def annotate_frame(image, bboxes_scaled, probas, id2label, scaled_font):
    """
    Draw every detection onto ``image`` (modified in place) and return it.

    Each box gets a red outline plus a "<label>: <score>" caption placed at
    its top-left corner, where the label is the highest-scoring class.
    """
    red = (255, 0, 0)
    canvas = ImageDraw.Draw(image)
    boxes = bboxes_scaled.tolist()

    for scores, box in zip(probas, boxes):
        xmin, ymin, xmax, ymax = box
        best = scores.argmax()
        caption = f'{id2label[best.item()]}: {scores[best]:0.2f}'

        canvas.rectangle([(xmin, ymin), (xmax, ymax)], outline=red, width=3)
        # Caption rendered with the caller-supplied (large) font
        canvas.text((xmin, ymin), caption, fill=red, font=scaled_font)

    return image

def frame_to_pil(frame):
    """Convert a BGR frame array (as read by OpenCV) into an RGB PIL image."""
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

def pil_to_frame(image):
    """Convert an RGB PIL image into a BGR numpy frame for OpenCV."""
    rgb_pixels = np.array(image)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

def annotate_and_save_video(video_path, output_video_folder, model, feature_extractor, id2label, scaled_font, threshold=0.5, annotation_frequency=0.5, device='cpu'):
    """
    Annotate a video with model detections and write the result to disk.

    The model is only run every ``annotation_frequency`` seconds; the frames in
    between are drawn with the most recent detections.

    Args:
        video_path: Path of the input video.
        output_video_folder: Folder the annotated video is written to. The output
            keeps the file name of ``video_path``.
        model: Detection model forwarded to ``get_frame_bbox``.
        feature_extractor: Preprocessor forwarded to ``get_frame_bbox``.
        id2label: Mapping from class index to the label drawn on the frame.
        scaled_font: Font used for the label text.
        threshold: Minimum class probability for a detection to be drawn.
        annotation_frequency: Seconds between two model runs.
        device: Device forwarded to ``get_frame_bbox``.

    Returns:
        Path of the annotated video that was written.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video {video_path}")

        fps = round(cap.get(cv2.CAP_PROP_FPS))
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"FPS: {fps}")
        print(f"Frame count: {frame_count}")

        # Create the output video with the same size and frame rate as the input.
        # The output name is derived from the input path rather than from
        # notebook-level globals, so the function works on its own.
        frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        output_video_path = os.path.join(output_video_folder, os.path.basename(video_path))
        out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)

        # Number of frames between two model runs; at least 1 so the modulo
        # below can never divide by zero for tiny fps/frequency values.
        frames_between_predictions = max(1, round(fps * annotation_frequency))

        # Frame 0 always triggers a prediction (0 % k == 0), so these
        # placeholders are replaced before the first frame is annotated.
        bboxes_scaled, probas = None, None
        image_shape = None

        for frame_idx in tqdm(range(frame_count)):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed the readable frames
                break

            if image_shape is None:
                # (width, height) of the video, taken from the first decoded frame
                image_shape = frame.shape[:2][::-1]

            pil_image = frame_to_pil(frame)

            # Get model predictions at the annotation frequency
            if frame_idx % frames_between_predictions == 0:
                print(f"Running frame {frame_idx} through YOLO")
                bboxes_scaled, probas = get_frame_bbox(pil_image, model, feature_extractor, image_shape, threshold, device)

            # Annotate the frame with the most recent predictions
            annotated_image = annotate_frame(pil_image, bboxes_scaled, probas, id2label, scaled_font)

            # Write the frame to the output video
            out.write(pil_to_frame(annotated_image))
    finally:
        # Always release the capture and writer, even if annotation failed
        cap.release()
        if out is not None:
            out.release()

    return output_video_path


In [30]:
from PIL import ImageFont

test_video_folder = '../test_videos'
output_video_folder = '../output_videos'

# exist_ok avoids the racy "check, then create" pattern
os.makedirs(output_video_folder, exist_ok=True)


threshold = 0.5
annotation_frequency = 0.5 # In seconds


id2label = {0: 'deer', 1: 'non-deer'}
font = 'arial.ttf'
try:
    scaled_font = ImageFont.truetype(font, 30)
except OSError:
    # The font file is not available on every platform; fall back to
    # Pillow's built-in font instead of aborting the whole run.
    print(f"Font {font} not found, falling back to the default PIL font")
    scaled_font = ImageFont.load_default()


# Sorted so the videos are processed in the same order on every run
video_filenames = sorted(path for path in os.listdir(test_video_folder) if path.endswith('.mp4'))
for video_to_show_index, video_filename in enumerate(video_filenames):
    video_path = os.path.join(test_video_folder, video_filename)
    # The annotated video is written to the output folder under the same file
    # name; define the path here so the upload below does not rely on a
    # variable left over from an earlier kernel session.
    output_video_path = os.path.join(output_video_folder, video_filename)
    print(f"Annotating video {video_path}")
    annotate_and_save_video(video_path, output_video_folder, model, feature_extractor, id2label, scaled_font, threshold, annotation_frequency)
    run.save(output_video_path)


Annotating video ../test_videos\Barking Roe Deer.mp4
FPS: 30
Frame count: 2412


  0%|          | 0/2412 [00:00<?, ?it/s]

Running frame 0 through YOLO
Running frame 15 through YOLO
Running frame 30 through YOLO
Running frame 45 through YOLO
Running frame 60 through YOLO
Running frame 75 through YOLO
Running frame 90 through YOLO
Running frame 105 through YOLO
Running frame 120 through YOLO
Running frame 135 through YOLO
Running frame 150 through YOLO
Running frame 165 through YOLO
Running frame 180 through YOLO
Running frame 195 through YOLO
Running frame 210 through YOLO
Running frame 225 through YOLO
Running frame 240 through YOLO
Running frame 255 through YOLO
Running frame 270 through YOLO
Running frame 285 through YOLO
Running frame 300 through YOLO
Running frame 315 through YOLO
Running frame 330 through YOLO
Running frame 345 through YOLO
Running frame 360 through YOLO
Running frame 375 through YOLO
Running frame 390 through YOLO
Running frame 405 through YOLO
Running frame 420 through YOLO
Running frame 435 through YOLO
Running frame 450 through YOLO
Running frame 465 through YOLO
Running frame 48



Annotating video ../test_videos\Shorteared owl and Hare Velduil en haas.mp4
FPS: 25
Frame count: 750


  0%|          | 0/750 [00:00<?, ?it/s]

Running frame 0 through YOLO
Running frame 12 through YOLO
Running frame 24 through YOLO
Running frame 36 through YOLO
Running frame 48 through YOLO
Running frame 60 through YOLO
Running frame 72 through YOLO
Running frame 84 through YOLO
Running frame 96 through YOLO
Running frame 108 through YOLO
Running frame 120 through YOLO
Running frame 132 through YOLO
Running frame 144 through YOLO
Running frame 156 through YOLO
Running frame 168 through YOLO
Running frame 180 through YOLO
Running frame 192 through YOLO
Running frame 204 through YOLO
Running frame 216 through YOLO
Running frame 228 through YOLO
Running frame 240 through YOLO
Running frame 252 through YOLO
Running frame 264 through YOLO
Running frame 276 through YOLO
Running frame 288 through YOLO
Running frame 300 through YOLO
Running frame 312 through YOLO
Running frame 324 through YOLO
Running frame 336 through YOLO
Running frame 348 through YOLO
Running frame 360 through YOLO
Running frame 372 through YOLO
Running frame 384 

  0%|          | 0/458 [00:00<?, ?it/s]

Running frame 0 through YOLO
Running frame 15 through YOLO
Running frame 30 through YOLO
Running frame 45 through YOLO
Running frame 60 through YOLO
Running frame 75 through YOLO
Running frame 90 through YOLO
Running frame 105 through YOLO
Running frame 120 through YOLO
Running frame 135 through YOLO
Running frame 150 through YOLO
Running frame 165 through YOLO
Running frame 180 through YOLO
Running frame 195 through YOLO
Running frame 210 through YOLO
Running frame 225 through YOLO
Running frame 240 through YOLO
Running frame 255 through YOLO
Running frame 270 through YOLO
Running frame 285 through YOLO
Running frame 300 through YOLO
Running frame 315 through YOLO
Running frame 330 through YOLO
Running frame 345 through YOLO
Running frame 360 through YOLO
Running frame 375 through YOLO
Running frame 390 through YOLO
Running frame 405 through YOLO
Running frame 420 through YOLO
Running frame 435 through YOLO
Running frame 450 through YOLO
Annotating video ../test_videos\Wildlife in Sco

  0%|          | 0/4157 [00:00<?, ?it/s]

Running frame 0 through YOLO
Running frame 15 through YOLO
Running frame 30 through YOLO
Running frame 45 through YOLO
Running frame 60 through YOLO
Running frame 75 through YOLO
Running frame 90 through YOLO
Running frame 105 through YOLO
Running frame 120 through YOLO
Running frame 135 through YOLO
Running frame 150 through YOLO
Running frame 165 through YOLO
Running frame 180 through YOLO
Running frame 195 through YOLO
Running frame 210 through YOLO
Running frame 225 through YOLO
Running frame 240 through YOLO
Running frame 255 through YOLO
Running frame 270 through YOLO
Running frame 285 through YOLO
Running frame 300 through YOLO
Running frame 315 through YOLO
Running frame 330 through YOLO
Running frame 345 through YOLO
Running frame 360 through YOLO
Running frame 375 through YOLO
Running frame 390 through YOLO
Running frame 405 through YOLO
Running frame 420 through YOLO
Running frame 435 through YOLO
Running frame 450 through YOLO
Running frame 465 through YOLO
Running frame 48