<a href="https://colab.research.google.com/github/z-arabi/SRU-deeplearning-workshop/blob/master/15_using-a-pretrained-convnet-webcam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
! git clone https://github.com/z-arabi/SRU-deeplearning-workshop
%cd SRU-deeplearning-workshop

Cloning into 'SRU-deeplearning-workshop'...
remote: Enumerating objects: 302, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 302 (delta 38), reused 81 (delta 36), pack-reused 213[K
Receiving objects: 100% (302/302), 37.24 MiB | 25.49 MiB/s, done.
Resolving deltas: 100% (137/137), done.
/content/SRU-deeplearning-workshop


# Webcam

In [1]:
# image processing library for python
import cv2

In [2]:
# check the video source is available or not
cv2.VideoCapture(0)

< cv2.VideoCapture 0x7ed935a70a70>

In [None]:
'''the input:
1. video url / video path
2. int > the number of webcam of laptop
3. write the rtsp url to connect to camera
'''
cap = cv2.VideoCapture(0)
frameShape = None
returnCode = None

while True:
    # read one frame > ret show that the reading was done successfully or not
    ret, frame = cap.read()
    # ret i a binary variable > shows whether it was successful to read the frame
    # frame is numpy array H*W*#channels
    cv2.imshow('Webcam', frame)
    # 13 is the int code for Enter
    if cv2.waitKey(1) == 13:
        returnCode = ret
        frameShape = frame.shape
        break

# Release camera and close the windows
# The webcam is an exclusive resource that must be released
cap.release()
cv2.destroyAllWindows()
print(returnCode)
print(frameShape)

In [5]:
# frame is an image > we can access many features
# print(frame.shape)

# MobileNet


In [6]:
'''
MobileNet is a deep-learning computer vision model architecture that is
designed for efficient and lightweight computation on mobile devices.
It was developed by Google researchers in 2017 and is used for a variety
of computer vision tasks such as image classification, object detection,
and face recognition.
'''
from tensorflow.keras.applications import MobileNetV2
from keras.preprocessing import image
from keras.applications.mobilenet_v2 import preprocess_input, decode_predictions
import numpy as np

In [7]:
model = MobileNetV2(weights='imagenet')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224.h5


In [9]:
model.summary()

Model: "mobilenetv2_1.00_224"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 Conv1 (Conv2D)              (None, 112, 112, 32)         864       ['input_1[0][0]']             
                                                                                                  
 bn_Conv1 (BatchNormalizati  (None, 112, 112, 32)         128       ['Conv1[0][0]']               
 on)                                                                                              
                                                                                                  
 Conv1_relu (ReLU)           (None, 112, 112, 32)         0         ['bn_Conv1[

In [19]:
model.input

<KerasTensor: shape=(None, 224, 224, 3) dtype=float32 (created by layer 'input_1')>

In [27]:
def inference(x):
    # smaples * input shape
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    preds = model.predict(x)
    # preds is (1, 1000) > decode > assign key to the max pred
    # print(preds.shape)
    return decode_predictions(preds, top=1)

In [28]:
img_path = './image/elephant.jpg'

#  input_1 (InputLayer)        [(None, 224, 224, 3)]
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)

In [38]:
x = image.img_to_array(img)
print(x.shape)
x = np.expand_dims(x, axis=0)
print(x.shape)
print(x[0,0,0,:])
# normalize > between 0 and 1
x = preprocess_input(x)
print(x[0,0,0,:])

preds = model.predict(x)
# preds is (1, 1000) > decode > assign key to the max pred
print(preds.shape)
print(np.argmax(preds))
print(np.max(preds))

(224, 224, 3)
(1, 224, 224, 3)
[174. 189. 220.]
[0.36470592 0.48235297 0.7254902 ]
(1, 1000)
386
0.5015807


In [40]:
x = image.img_to_array(img)
inference(x)



[[('n02504458', 'African_elephant', 0.5015807)]]

In [49]:
img = image.load_img(img_path)
img = image.img_to_array(img)
print(img.shape)
x = cv2.resize(x,(224,224))
print(x.shape)

print(x[0,0,:])
print(x[0,0,::-1])

(425, 640, 3)
(224, 224, 3)
[0.36470592 0.48235297 0.7254902 ]
[0.7254902  0.48235297 0.36470592]


In [None]:
import cv2
cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()
    frame = cv2.resize(frame, (224, 224))

    # OpenCV usually reads frames in BGR format
    predicted = inference(frame[...,::-1])
    predictd = predicted[0][0][1]

    # (30, 30): The position where the text will start
    # 1.0: The font scale factor
    # (0, 0, 255): The color of the text
    cv2.putText(frame,  predicted, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), lineType=cv2.LINE_AA)

    cv2.imshow('Webcam', frame)
    if cv2.waitKey(1) == 13: #13 is the Enter Key
        break

cap.release()
cv2.destroyAllWindows()