In [1]:
from __future__ import division

import os
import time

import cv2
import numpy as np

# IMAGE WITH YOLOv3

In [14]:
LABELSPATH = '../../darknet/data/coco.names'
CONFIGPATH = '../../darknet/cfg/yolov3.cfg'
WEIGHTSPATH = '../../darknet/yolov3.weights'

CONFIDENCE_THS = 0.5
NMS_THS = 0.3

In [15]:
LABELS = open(LABELSPATH).read().strip().split("\n")

np.random.seed(42)
COLORS = np.random.randint(0,255,size=(len(LABELS),3),dtype="uint8")

net = cv2.dnn.readNetFromDarknet(CONFIGPATH,WEIGHTSPATH)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

In [16]:
def test_on_image(net,image):
    (H,W) = image.shape[:2]

    output_ln = net.getLayerNames()
    output_ln = [output_ln[i[0]-1] for i in net.getUnconnectedOutLayers()]

    blob = cv2.dnn.blobFromImage(image,1/255.0,(416,416),swapRB=True,crop=False)
    net.setInput(blob)
    start = time.time()
    layer_outputs = net.forward(output_ln)
    end = time.time()

    # output storages
    boxes = []
    confidences = []
    classIDs = []

    for output in layer_outputs:
        for detection in output:
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]

            # filter out weak predictions
            if confidence > CONFIDENCE_THS:
                # scale the bounding box coordinates back relative to the size of the image
                box = detection[0:4] * np.array([W,H,W,H])
                (centerX,centerY,width,height) = box.astype("int")

                # derive the top left corner of the bounding box
                x = int(centerX-(width/2))
                y = int(centerY-(height/2))

                # update our lists
                boxes.append([x,y,int(width),int(height)])
                confidences.append(float(confidence))
                classIDs.append(classID)
    
    idxs = cv2.dnn.NMSBoxes(boxes,confidences,CONFIDENCE_THS,NMS_THS)

    return idxs,boxes,confidences,classIDs,end-start

In [17]:
def draw_outputs(idxs,boxes,confidences,classIDs,image):
    if len(idxs) > 0:
        for i in idxs.flatten():
            (x,y) = (boxes[i][0],boxes[i][1])
            (w,h) = (boxes[i][2],boxes[i][3])

            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(image,(x,y),(x+w,y+h),color,2)
            text = "{}:{:.4f}".format(LABELS[classIDs[i]],confidences[i])
            print("[INFO] YOLO predicts {} with {:.6f} confidence".format(LABELS[classIDs[i]],confidences[i]))
            cv2.putText(image,text,(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,0.5,color,2)
    
    return image

In [19]:
imagefile = '../images/dog-cycle-car.png'
image = cv2.imread(imagefile)
idxs,boxes,confidences,classIDs,duration = test_on_image(net,image)
print("[INFO] YOLO took {:.6f} secs".format(duration))
output_image = draw_outputs(idxs,boxes,confidences,classIDs,image)

[INFO] YOLO took 0.280167 secs
[INFO] YOLO predicts dog with 0.999602 confidence
[INFO] YOLO predicts bicycle with 0.993697 confidence
[INFO] YOLO predicts truck with 0.883534 confidence


[ WARN:0] global /home/yeojinjung/opencv/opencv-4.5.0/modules/dnn/src/cuda4dnn/init.hpp (42) checkVersions CUDART version 11010 reported by cuDNN 8005 does not match with the version reported by CUDART 10020


In [20]:
cv2.imshow("",output_image)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(1)

-1

# WEBCAM WITH YOLOv3

In [6]:
LABELSPATH = '../../darknet/data/coco.names'
CONFIGPATH = '../../darknet/cfg/yolov3.cfg'
WEIGHTSPATH = '../../darknet/yolov3.weights'

CONFIDENCE_THS = 0.5
NMS_THS = 0.3

In [7]:
LABELS = open(LABELSPATH).read().strip().split("\n")

np.random.seed(42)
COLORS = np.random.randint(0,255,size=(len(LABELS),3),dtype="uint8")

net = cv2.dnn.readNetFromDarknet(CONFIGPATH,WEIGHTSPATH)

In [22]:
def test_on_webcam():
    # camset = 'v4l2src device=/dev/video2 ! video/x-raw,width=1280,height=800,format=(string)YUY2 ! videoconvert ! videoscale ! video/x-raw,width=640,height=480,format=BGR,framerate=30/1 ! appsink'
    camset = "v4l2src ! video/x-raw,format=YUY2,width=640,height=480,framerate=30/1 ! appsink"    

    cam = cv2.VideoCapture(camset, cv2.CAP_GSTREAMER)
    cam.set(cv2.CAP_PROP_FRAME_WIDTH,640)
    cam.set(cv2.CAP_PROP_FRAME_HEIGHT,480)

    # counter and storage to compute average prediction time
    cnt = 0
    sum_durations = 0

    if cam.read() == False:
        cam.open()
    if not cam.isOpened():
        raise IOError("cannot open webcam")

    while cam.isOpened():
        # start = time.time()
        ret,frame = cam.read()

        if not ret:
            raise IOError("cannot receive frame")
        
        idxs,boxes,confidences,classIDs,duration = test_on_image(net,frame)
        sum_durations += duration
        cnt += 1
        
        if len(idxs) > 0 :
            for i in idxs.flatten():
                (x,y) = (boxes[i][0],boxes[i][1])
                (w,h) = (boxes[i][2],boxes[i][3])

                # draw bounding box and label on the frame
                color = [int(c) for c in COLORS[classIDs[i]]]
                text = "{}: {:.4f}".format(LABELS[classIDs[i]],confidences[i])
                frame = cv2.rectangle(frame,(x,y),(x+w,y+h),color,2)
                frame = cv2.putText(frame,text,(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,0.5,color,2)

            
        cv2.imshow('',frame)
        if cv2.waitKey(1) == ord('q'):
            break

    cam.release()
    cv2.destroyAllWindows()

    print("[INFO] YOLO took {:.6f} secs per frame in average".format(sum_durations/cnt))

In [12]:
test_on_webcam()



[INFO] YOLO took 0.238926 secs per frame in average


In [18]:
print(cv2.ocl.haveOpenCL())
print(cv2.ocl.Device_TYPE_CPU)

True
2


# WEBCAM WITH YOLOv4

## Major Improvements
- BoF (bag of freebies) improve the accuracy of the detector without increasing the inference time (they only increase the training cost)
- BoS (bag of specials) improves the accuracy of object detection while increasing the inference cost by a small amount

## Performance
- mAP: 43.5% on COCO dataset (10% increase from YOLOv3)
- real-time speed: 65 FPS on Tesla V100 (12% increase from YOLOv3)

In [18]:
LABELSPATH = '../../darknet-alex/data/coco.names'
CONFIGPATH = '../../darknet-alex/cfg/yolov4.cfg'
WEIGHTSPATH = '../../darknet-alex/yolov4.weights'

CONFIDENCE_THS = 0.5
NMS_THS = 0.3

In [23]:
test_on_webcam()

OSError: cannot open webcam

In [12]:
os.system("cd ../../darknet-alex/ && ./darknet detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights -c 0")

 CUDA-version: 10020 (11020), cuDNN: 8.0.5, CUDNN_HALF=1, GPU count: 1  
 OpenCV version: 4.5.0
 0 : compute_capability = 750, cudnn_half = 1, GPU: GeForce RTX 2060 
   layer   filters  size/strd(dil)      input                output
   0 

 CUDNN_HALF=1 


conv     32       3 x 3/ 1    608 x 608 x   3 ->  608 x 608 x  32 0.639 BF
   1 conv     64       3 x 3/ 2    608 x 608 x  32 ->  304 x 304 x  64 3.407 BF
   2 conv     64       1 x 1/ 1    304 x 304 x  64 ->  304 x 304 x  64 0.757 BF
   3 route  1 		                           ->  304 x 304 x  64 
   4 conv     64       1 x 1/ 1    304 x 304 x  64 ->  304 x 304 x  64 0.757 BF
   5 conv     32       1 x 1/ 1    304 x 304 x  64 ->  304 x 304 x  32 0.379 BF
   6 conv     64       3 x 3/ 1    304 x 304 x  32 ->  304 x 304 x  64 3.407 BF
   7 Shortcut Layer: 4,  wt = 0, wn = 0, outputs: 304 x 304 x  64 0.006 BF
   8 conv     64       1 x 1/ 1    304 x 304 x  64 ->  304 x 304 x  64 0.757 BF
   9 route  8 2 	                           ->  304 x 304 x 128 
  10 conv     64       1 x 1/ 1    304 x 304 x 128 ->  304 x 304 x  64 1.514 BF
  11 conv    128       3 x 3/ 2    304 x 304 x  64 ->  152 x 152 x 128 3.407 BF
  12 conv     64       1 x 1/ 1    152 x 152 x 128 ->  152 x 152 x  64 0.379 BF
 

Demo
net.optimized_memory = 0 
mini_batch = 1, batch = 8, time_steps = 1, train = 0 
Create CUDA-stream - 0 
 Create cudnn-handle 0 
nms_kind: greedynms (1), beta = 0.600000 
nms_kind: greedynms (1), beta = 0.600000 
nms_kind: greedynms (1), beta = 0.600000 

 seen 64, trained: 32032 K-images (500 Kilo-batches_64) 
Webcam index: 0


Gtk-Message: 11:22:51.061: Failed to load module "canberra-gtk-module"


Video stream: 1280 x 720 
Objects:


FPS:0.0 	 AVG_FPS:0.0
Objects:

tvmonitor: 97% 
tvmonitor: 88% 
tvmonitor: 37% 
tvmonitor: 26% 
pottedplant: 32% 
pottedplant: 30% 
pottedplant: 26% 
chair: 84% 
chair: 40% 
chair: 39% 
chair: 30% 
person: 87% 
person: 51% 
person: 41% 

FPS:0.7 	 AVG_FPS:0.0
Objects:

tvmonitor: 94% 
tvmonitor: 63% 
pottedplant: 33% 
pottedplant: 29% 
pottedplant: 29% 
chair: 82% 
chair: 46% 
chair: 35% 
chair: 31% 
person: 88% 
person: 55% 
person: 45% 
person: 28% 

FPS:1.1 	 AVG_FPS:0.0
Objects:

tvmonitor: 96% 
tvmonitor: 49% 
diningtable: 29% 
pottedplant: 31% 
chair: 84% 
chair: 52% 
chair: 38% 
handbag: 30% 
person: 89% 
person: 69% 
person: 26% 

FPS:1.5 	 AVG_FPS:0.0
Objects:

tvmonitor: 95% 
tvmonitor: 57% 
pottedplant: 42% 
pottedplant: 36% 
chair: 84% 
chair: 43% 
chair: 28% 
handbag: 30% 
person: 85% 
person: 51% 

FPS:1.9 	 AVG_FPS:0.0
Objects:

tvmonitor: 98% 
tvmonitor: 53% 
diningtable: 26% 
pottedplant: 31% 
pottedplant: 29% 
chair: 86% 
chair: 41

2

# WEBCAM WITH TINY-YOLOv4

In [None]:
LABELSPATH = '../../darknet-alex/data/coco.names'
CONFIGPATH = '../../darknet-alex/cfg/yolov4-tiny.cfg'
WEIGHTSPATH = '../../darknet-alex/yolov4-tiny.weights'

CONFIDENCE_THS = 0.5
NMS_THS = 0.3

In [None]:
test_on_webcam()

[INFO] YOLO took 0.260241 secs per frame in average


# WEBCAM WITH PP-YOLO
- based on YOLOv3 in PaddleDetection
- goal: relatively balanced effectiveness and efficiency that can be directly applied in actual application scenarios rather than proposing a new detection model

## Major Improvements
- replace Darknet53 backbone of YOLOv3 with ResNet backbone (significant increase in the FPS)
- increase training batch size from 64 to 192 (as mini-batch size of 24 on 8 GPUs)

## Performance
- mAP: 45.2% on COCO dataset (14.6% relative improvement to YOLOv4)
- real-time speed: 72.9 FPS on Tesla V100

In [None]:
# source codes not found (not reachable)

# YOLOv5

## Major Improvements
- PyTorch implementation rather than a fork from original Darknet
- CSP backbone and PA-NET neck
- mosaic data augmentation
- auto learning bounding box anchors

## Performances
- FPS: 140 (with Tesla P100)
- weights file: 27 MB (v4: 244 MB; 90% smaller than V4)

In [None]:
import torch

model = torch.hub.load('ultralytics/yolov5','yolov5m')
img = 'https://ultralytics.com/images/zidane.jpg'
results = model(img)

results.print()

Using cache found in /home/yeojinjung/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2021-9-1 torch 1.9.0+cu102 CUDA:0 (GeForce RTX 2060, 5934.5625MB)

Fusing layers... 
Model Summary: 308 layers, 21356877 parameters, 0 gradients
Adding AutoShape... 
image 1/1: 720x1280 2 persons, 1 tie
Speed: 690.9ms pre-process, 15.9ms inference, 1.3ms NMS per image at shape (1, 3, 384, 640)


In [None]:
results.show()

In [None]:
os.system("python ../../yolov5/detect.py --source 0")

[34m[1mdetect: [0mweights=yolov5s.pt, source=0, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False
[31m[1mrequirements:[0m /home/yeojinjung/Documents/yolo/yolo-realtime/requirements.txt not found, check failed.


YOLOv5 🚀 v5.0-403-gde534e9 torch 1.9.0+cu102 CUDA:0 (GeForce RTX 2060, 5934.5625MB)

Fusing layers... 
Model Summary: 224 layers, 7266973 parameters, 0 gradients


webcam 0: 0: 480x640 2 persons, 3 chairs, 1 toilet, 1 tv, 1 teddy bear, Done. (0.343s)
webcam 1: 0: 480x640 2 persons, 4 chairs, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 2: 0: 480x640 2 persons, 4 chairs, 1 toilet, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 3: 0: 480x640 2 persons, 4 chairs, 1 toilet, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 4: 0: 480x640 2 persons, 5 chairs, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 5: 0: 480x640 2 persons, 4 chairs, 1 potted plant, 1 toilet, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 6: 0: 480x640 2 persons, 4 chairs, 1 potted plant, 1 toilet, 1 tv, 1 teddy bear, Done. (0.007s)
webcam 7: 0: 480x640 2 persons, 3 chairs, 1 potted plant, 1 toilet, 1 tv, 1 teddy bear, Done. (0.007s)
webcam 8: 0: 480x640 2 persons, 4 chairs, 1 toilet, 1 tv, 1 teddy bear, Done. (0.007s)
webcam 9: 0: 480x640 2 persons, 5 chairs, 1 tv, 1 teddy bear, Done. (0.006s)
webcam 10: 0: 480x640 2 persons, 5 chairs, 1 tv, 1 teddy bear, Done. (0.007s)
webcam 11: 0: 480x640 2 persons, 1

0

In [None]:
torch.cuda.empty_cache()

In [None]:
CONFIGPATH = '../../yolov5/models/yolov5s.yaml'
WEIGHTSPATH = '../../yolov5/yolov5s.pt'


## NOTE: Architecture
- backbone: extract feature map from image
    - v3: Darknet53
    - v4, v5: CSP-Darknet
- head: locate object based on the extracted feature map
    - initialize anchor box (default box) and create final bounding box
    - 3 scales: 8 pixel, 16, 32
    - 3 anchor boxes/scale

# YOLOv4 TRAINED WITH CUSTOM DATASETS FOR RPS

## DATASET
- custom dataset of hands with 3 classes: rock, paper, scissors
- manually annotated
- 35 images in total: 30 for training (including augmentation), 5 for validation

## INFERENCE RESULT
- very poor
- identifies a lot of small patches in a frame, making it unable to detect meaningful (larger-size) objects


In [None]:
LABELSPATH = '../../darknet-alex/data/rps.names'
CONFIGPATH = '../../darknet-alex/cfg/rps.cfg'
WEIGHTSPATH = '../../darknet-alex/backup/rps_last.weights'

CONFIDENCE_THS = 0.5
NMS_THS = 0.3

In [None]:
LABELS = open(LABELSPATH).read().strip().split("\n")

np.random.seed(42)
COLORS = np.random.randint(0,255,size=(len(LABELS),3),dtype="uint8")

net = cv2.dnn.readNetFromDarknet(CONFIGPATH,WEIGHTSPATH)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

In [None]:
test_on_webcam()

[INFO] YOLO took 0.257525 secs per frame in average


In [None]:
imagefile = '../images/scissors.jpg'
image = cv2.imread(imagefile)
idxs,boxes,confidences,classIDs,duration = test_on_image(net,image)
print("[INFO] YOLO took {:.6f} secs".format(duration))
output_image = draw_outputs(idxs,boxes,confidences,classIDs,image)

cv2.imshow("",output_image)
cv2.waitKey(0)
cv2.destroyAllWindows()
cv2.waitKey(1)

[INFO] YOLO took 0.478014 secs
[INFO] YOLO predicts scissors with 0.620088 confidence


-1