In [1]:
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time
import torch
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Params:
        js_reply: JavaScript object containing image from webcam
  Returns:
        img: OpenCV BGR image
  """
  # decode base64 image
  image_bytes = b64decode(js_reply.split(',')[1])
  # convert bytes to numpy array
  jpg_as_np = np.frombuffer(image_bytes, dtype=np.uint8)
  # decode numpy array into OpenCV BGR image
  img = cv2.imdecode(jpg_as_np, flags=1)

  return img

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Params:
        bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string
  """
  # convert array into PIL image
  bbox_PIL = PIL.Image.fromarray(bbox_array, 'RGBA')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format((str(b64encode(iobuf.getvalue()), 'utf-8')))

  return bbox_bytes

In [4]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "user"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  display(js)
  
def video_frame(label, bbox):
  data = eval_js('stream_frame("{}", "{}")'.format(label, bbox))
  return data

In [5]:
# Model
obj_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
hand_model = torch.hub.load('ultralytics/yolov5', 'custom', path='drive/MyDrive/Colab Notebooks/best.pt')

Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /root/.cache/torch/hub/master.zip
[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...
Collecting PyYAML>=5.3.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed PyYAML-6.0

[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-4-27 torch 1.11.0+cu113 CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt to yolov5s.pt...


  0%|          | 0.00/14.1M [00:00<?, ?B/s]


Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...

[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-4-27 torch 1.11.0+cu113 CPU

Fusing layers... 
Model summary: 213 layers, 7012822 parameters, 0 gradients
Adding AutoShape... 


In [98]:
# Some functions used in next cell

def obj_detect(img):
    """
    Params:
          img: OpenCV BGR image
    Returns:
          res: pandas dataframe containing all objects detected by the model
    """
    results = obj_model(img)
    res = results.pandas().xyxy[0]  # img1 predictions (pandas)
    return res

def get_persons(objects):
    """
    Params:
          objects: pandas dataframe containing different objects
    Returns:
          persons: pandas dataframe containing only the objects labeled as person
    """
    persons = objects.loc[objects['name'] == "person"]
    idx_del = []
    areas = []

    for idx, pers in persons.iterrows():
        dx = pers["xmax"]-pers["xmin"]
        dy = pers["ymax"]-pers["ymin"]
        areas.append(dx*dy)
        if(areas[-1] < 10000):
            idx_del.append(idx)
    persons = persons.drop(idx_del)


    return persons, areas

def get_personOfInterest(persons, hand, persons_area):  # returns None if rectangles don't intersect
    """
    Params:
          persons: pandas dataframe containing only objects labeled as person
          hand: pandas dataframe containing only the detected hands (labeled as 5)
    Returns:
          idx_max: integer corresponding to the index in the persons dataframe that corresponds to the person of interest
                   is equal to None if no hand is being detected or the hand does not belongs to any of the detectet persons
    """
    margin = 250
    idx_max = 0
    area_max = 0
    a_min = [-1,-1]
    a_max = [-1,-1]
    if not hand.empty:
        for idx, pers in persons.iterrows():
            dx = np.minimum(pers["xmax"], hand["xmax"].iloc[0]) - np.maximum(pers["xmin"], hand["xmin"].iloc[0])
            dy = np.minimum(pers["ymax"], hand["ymax"].iloc[0]) - np.maximum(pers["ymin"], hand["ymin"].iloc[0])
            if(dx<0 and dy<0):
                area = -dx*dy
            else:
                area = dx*dy
            if (area >= area_max - margin and persons_area[idx] >= persons_area[idx_max]):
                area_max = area
                idx_max = idx
                # Debug
                a_min = [int(np.maximum(pers["xmin"], hand["xmin"].iloc[0])),int(np.maximum(pers["ymin"], hand["ymin"].iloc[0]))]
                a_max = [int(np.minimum(pers["xmax"], hand["xmax"].iloc[0])),int(np.minimum(pers["ymax"], hand["ymax"].iloc[0]))]

    if (area_max>0):
        return idx_max, a_min, a_max
    else:
        return None, a_min, a_max

def hand_detect(img):
    """
    Params:
          img: OpenCV BGR image
    Returns:
          res: pandas dataframe containing all objects detected by the model
    """
    results = hand_model(img)
    res = results.pandas().xyxy[0]  # img1 predictions (pandas)
    return res

In [99]:
# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialze bounding box to empty
bbox = ''
count = 0 
while True:
    js_reply = video_frame(label_html, bbox)
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])

    # detect all objects
    obj = obj_detect(img)
    # keep only the persons
    pers, persons_area = get_persons(obj)
    print(pers)
    print(persons_area)
    # detect hand
    hand = hand_detect(img)
### For testing purpose only
    #d = {'xmin': [200], 'ymin': [200], 'xmax': [240], 'ymax': [240]}
    #hand = pd.DataFrame(data=d)
###

    # identify person of interest
    idx_max, area_min, area_max = get_personOfInterest(pers,hand, persons_area)
    
    # create transparent overlay for bounding box
    bbox_array = np.zeros([480,640,4], dtype=np.uint8)
    
    # draw bounding box of the hand (if multiple hands, just the first one)
    if (not hand.empty) and area_min!=None:
        bbox_array = cv2.rectangle(bbox_array,(int(hand["xmin"].iloc[0]),int(hand["ymin"].iloc[0])), (int(hand["xmax"].iloc[0]),int(hand["ymax"].iloc[0])), (0, 0, 255), 2)
        bbox_array = cv2.rectangle(bbox_array,tuple(area_min), tuple(area_max), (255, 0, 255), 2)

    # draw bounding boxes on overlay
    for index, row in pers.iterrows():
        start_point = (int(row["xmin"]), int(row["ymin"]))
        end_point = (int(row["xmax"]), int(row["ymax"]))
        if (index == idx_max):
            color = (0, 255, 0)
        else:
            color = (255, 0, 0)
        thickness = 2
        bbox_array = cv2.rectangle(bbox_array,start_point, end_point, color, thickness)

    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

<IPython.core.display.Javascript object>

[193548.92161812168]
[124663.61263692565]
[147178.212287738]
[167671.1251984071]
[142767.2461077869]
[142767.2461077869, 47315.078148115426]
[142767.2461077869, 47315.078148115426, 20903.29401770234]
[148237.90621415921]
[148237.90621415921, 53572.590200970415]
[50137.307799832895]
[50137.307799832895, 130437.38409879617]
[48210.60757109709]
[48210.60757109709, 128741.60177532211]
[43473.03816202283]
[43473.03816202283, 124856.54192862008]
[44506.913145273924]
[44506.913145273924, 125762.2323899623]
[43091.625382812694]
[43091.625382812694, 134996.8605277061]
[43091.625382812694, 134996.8605277061, 99500.37147629075]
[41926.25859975815]
[41926.25859975815, 137342.3215902471]
[41926.25859975815, 137342.3215902471, 88244.5801719781]


IndexError: ignored