In [1]:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.models import load_model
import cv2
import time
import copy
import os
from model.yolo_model import YOLO

Using TensorFlow backend.


# YOLO object detection

In [2]:
def process_image(img):
    """Turn a frame into a network-ready batch of one.

    # Argument:
        img: original image (anything `cv2.resize` accepts).

    # Returns
        image: float32 ndarray with a leading batch axis, i.e.
            (1, 416, 416, 3) for a 3-channel input, scaled by 1/255.
    """
    resized = cv2.resize(img, (416, 416), interpolation=cv2.INTER_CUBIC)
    # Convert to float32 first so the division does not truncate integer pixels.
    scaled = np.array(resized, dtype='float32') / 255.
    return np.expand_dims(scaled, axis=0)

def get_classes(file):
    """Read class names from a text file, one name per line.

    # Argument:
        file: path of the file listing the class names.

    # Returns
        class_names: List of the lines with surrounding whitespace removed.

    """
    with open(file) as handle:
        return [line.strip() for line in handle]

def draw(image, boxes, scores, classes, all_classes):
    """Annotate `image` in place with one labelled rectangle per detection.

    # Argument:
        image: original image; it is modified in place.
        boxes: ndarray, boxes of objects, each unpacked as (x, y, w, h).
        classes: ndarray, class indices into `all_classes`.
        scores: ndarray, scores of objects.
        all_classes: all classes name.
    """
    for box, score, cl in zip(boxes, scores, classes):
        x, y, w, h = box

        # Round each corner to the nearest pixel and clamp it to the image.
        x_min = max(0, np.floor(x + 0.5).astype(int))
        y_min = max(0, np.floor(y + 0.5).astype(int))
        x_max = min(image.shape[1], np.floor(x + w + 0.5).astype(int))
        y_max = min(image.shape[0], np.floor(y + h + 0.5).astype(int))

        label = '{0} {1:.2f}'.format(all_classes[cl], score)
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        # The label sits a few pixels above the box's upper-left corner.
        cv2.putText(image, label, (x_min, y_min - 6),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1, cv2.LINE_AA)

        print('class: {0}, score: {1:.2f}'.format(all_classes[cl], score))
        print('box coordinate x,y,w,h: {0}'.format(box))
    print()
    
def detect_image(image, yolo, all_classes):
    """Run the YOLO model on one image and draw whatever it finds.

    # Argument:
        image: original image; detections are drawn onto it in place.
        yolo: YOLO, yolo model.
        all_classes: all classes name.

    # Returns:
        image: the same image object, annotated when boxes were returned.
    """
    net_input = process_image(image)

    # Time only the forward pass, not the preprocessing or drawing.
    t0 = time.time()
    boxes, classes, scores = yolo.predict(net_input, image.shape)
    elapsed = time.time() - t0

    print('time: {0:.2f}s'.format(elapsed))

    if boxes is None:
        return image

    draw(image, boxes, scores, classes, all_classes)
    return image

# Detector instance used by detect_image in the cells below.
# NOTE(review): the two positional arguments look like score / NMS thresholds —
# confirm against model.yolo_model.YOLO.
yolo = YOLO(0.6, 0.5)
# Text file with one class name per line; get_classes strips each line.
file = 'data/coco_classes.txt'
all_classes = get_classes(file)



# Creating face recognition model

In [3]:
# Generator that rescales pixels to [0, 1] and applies random augmentation
# (rotation, shifts, shear, zoom, horizontal flip) to every image it yields.
image_gen = ImageDataGenerator(
    rescale=1/255,
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

In [4]:
batch_size = 16
# Held-out images are only rescaled, never randomly augmented: rotating,
# shifting or flipping the validation set makes the reported validation
# metrics noisy and not comparable between epochs.
image_gen_test_face = ImageDataGenerator(rescale=1/255).flow_from_directory(
    '../Final_Project/face_images/test',
    target_size=(150, 150),
    batch_size=batch_size,
    class_mode='categorical')

Found 40 images belonging to 4 classes.


In [5]:
# Training batches for the face classifier, produced by image_gen from the
# per-class sub-directories of the train folder.
image_gen_train_face = image_gen.flow_from_directory(
    '../Final_Project/face_images/train',
    class_mode='categorical',
    batch_size=batch_size,
    target_size=(150, 150),
)

Found 160 images belonging to 4 classes.


In [25]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Activation

In [26]:
# Small CNN for face classification: three conv/pool stages followed by a
# dense head with dropout.
model_face = Sequential()

# Only the first layer declares input_shape; in a Sequential model the later
# layers take their input shape from the previous layer's output.
model_face.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=(150, 150, 3), activation='relu'))
model_face.add(MaxPooling2D(pool_size=(2, 2)))

model_face.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model_face.add(MaxPooling2D(pool_size=(2, 2)))

model_face.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model_face.add(MaxPooling2D(pool_size=(2, 2)))

model_face.add(Flatten())

model_face.add(Dense(128))
model_face.add(Activation('relu'))

model_face.add(Dropout(0.5))

# One softmax output per face class (the generators report 4 classes).
model_face.add(Dense(4, activation='softmax'))

model_face.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

In [27]:
# Print the layer-by-layer output shapes and parameter counts.
model_face.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 72, 72, 32)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 34, 34, 32)        9248      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 17, 17, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9248)              0         
__________

In [28]:
import warnings
# Suppresses every Python warning from this point on, not only the ones
# raised during training.
warnings.filterwarnings('ignore')

In [29]:
# Train the face classifier from the generators; the returned History object
# is kept in `result` for inspection.
result = model_face.fit_generator(
    image_gen_train_face,
    steps_per_epoch=100,
    epochs=10,
    validation_data=image_gen_test_face,
    validation_steps=12,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Training accuracy recorded for each epoch of the fit above.
result.history['acc']

[0.893125,
 0.98625,
 0.995,
 0.9625,
 0.991875,
 0.991875,
 0.996875,
 0.9975,
 0.999375,
 0.996875]

In [31]:
# Persist the trained face classifier to an HDF5 file in the working directory.
model_face.save('face_recognition.h5')

# Use face recognition model

In [27]:
# Reload the saved classifier so the cells below can run without retraining.
model_face = load_model('face_recognition.h5')

In [17]:
# Show the directory-name -> label-index mapping assigned by the train generator.
image_gen_train_face.class_indices

{'Abeshev K': 0, 'Adelina': 1, 'Eduard': 2, 'Evgenii': 3}

In [18]:
# Display name for each label index, in the order reported by class_indices
# (index 0 is the 'Abeshev K' directory, shown here as 'Kuanysh').
faces = dict(enumerate(['Kuanysh', 'Adelina', 'Eduard', 'Evgenii']))

In [9]:
# Sanity check: classify the ten held-out 'Eduard' images one at a time.
for i in range(1, 11):
    img_path = '../Final_Project/face_images/test/Eduard/' + str(i) + '.jpg'
    # Same preprocessing as training: 150x150, batch axis added, pixels / 255.
    img = image.img_to_array(image.load_img(img_path, target_size=(150, 150)))
    img = np.expand_dims(img, axis=0) / 255

    pred_array = model_face.predict(img)
    best_idx = np.argmax(pred_array)
    print('It is', faces[best_idx], 'with probability', pred_array.max())

It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0
It is Eduard with probability 1.0


In [10]:
# Haar-cascade face detector; the XML path is relative to the working directory.
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

In [11]:
def detect_face(img):
    """Return the largest detected face cropped out of `img`.

    The previous version cropped inside the detection loop and then drew a
    rectangle using coordinates of the original frame on the already-cropped
    image; with more than one detection each further crop was applied to the
    previous crop. Here the crop is taken once, from the untouched input, and
    no rectangle is drawn because it would only cover the border of the crop.

    # Argument:
        img: image to search; it is not modified.

    # Returns
        A copy of the largest detected face region, or a copy of the whole
        image when the cascade finds no face.
    """
    face_rects = face_cascade.detectMultiScale(img)

    if len(face_rects) == 0:
        return img.copy()

    # Pick the detection with the biggest area (w * h).
    x, y, w, h = max(face_rects, key=lambda rect: rect[2] * rect[3])
    return img[y:y + h, x:x + w].copy()

In [12]:
def predict_face(image):
    """Classify a face batch with `model_face` and report the outcome.

    # Argument:
        image: array passed to `model_face.predict` after conversion to
            float32 and division by 255; the caller's array is not modified.

    # Returns
        (result, score): name from `faces` for the arg-max output, and the
            highest value of the first prediction row rounded to two decimals.
    """
    batch = np.array(image, dtype='float32') / 255

    # Time only the forward pass.
    t0 = time.time()
    pred_array = model_face.predict(batch)
    elapsed = time.time() - t0

    best_idx = np.argmax(pred_array)
    result = faces[best_idx]
    score = float("%0.2f" % (max(pred_array[0])))

    print('time: {0:.2f}s'.format(elapsed))
    print(f'pred_array: {pred_array}')
    print(f'class: {result}, score: {score}')
    print()
    return result, score

# Creating images for gesture detection

In [13]:
def remove_background(frame):
    """Return a binary mask of the foreground in `frame`.

    Relies on the module-level `bgModel` background subtractor, which must
    have been created before this function is called.
    """
    # learningRate=0 applies the model without updating it with this frame.
    foreground = bgModel.apply(frame, learningRate=0)
    erode_kernel = np.ones((3, 3), np.uint8)  # 3x3 structuring element
    foreground = cv2.erode(foreground, erode_kernel, iterations=1)  # erosion of the mask
    # Bitwise AND of the frame with itself, restricted to the mask.
    masked = cv2.bitwise_and(frame, frame, mask=foreground)
    grey = cv2.cvtColor(masked, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(grey, (5, 5), 0)  # smooth before thresholding
    # Binarise the blurred image (binary + Otsu flags).
    _, thresh = cv2.threshold(blurred, 10, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh

In [None]:
# Interactive capture of gesture training images from the webcam.
#   Esc   - quit
#   b     - create the background model (enables background removal)
#   Space - save the current region of interest as <num>.jpg
cap = cv2.VideoCapture(0)
cap_region_x_begin = 0.5
cap_region_y_end = 0.5
num = 1
isBgCaptured = 0

try:
    while True:

        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera missing, busy or disconnected);
            # continuing would fail on frame.shape.
            print('Failed to read a frame from the camera.')
            break

        h = frame.shape[0]
        w = frame.shape[1]
        frame = cv2.flip(frame, 1)

        # Region of interest: upper-right part of the mirrored frame.
        roi_rows = slice(61, int(cap_region_y_end * h) + 1)
        roi_cols = slice(int(cap_region_x_begin * w), w)
        frame_copy = frame[roi_rows, roi_cols]
        cv2.rectangle(frame, (int(cap_region_x_begin * w), 60),
                      (w, int(cap_region_y_end * h)), (0, 255, 0), 2)

        if isBgCaptured == 1:
            frame_copy = remove_background(frame_copy)
            # Show the single-channel mask in all three colour channels.
            frame[roi_rows, roi_cols] = frame_copy[:, :, np.newaxis]

        cv2.imshow("original", frame)

        k = cv2.waitKey(10)
        if k == 27:
            break
        elif k == ord('b'):
            bgModel = cv2.createBackgroundSubtractorMOG2(0, 50)
            isBgCaptured = 1
        elif k == 32:
            direc = "../Final_Project/gest_images/" + str(num) + '.jpg'
            # imwrite reports failure through its return value, e.g. when the
            # target directory does not exist.
            if cv2.imwrite(direc, frame_copy):
                cv2.imshow("save image", frame_copy)
                num += 1
            else:
                print('Could not write ' + direc)
finally:
    # Always free the camera and close the windows, even after an error.
    cap.release()
    cv2.destroyAllWindows()

# Creating gesture detection model

In [19]:
# Generator for the gesture data: rescales pixels to [0, 1] and applies random
# augmentation (rotation, shifts, shear, zoom, horizontal flip).
image_gen = ImageDataGenerator(
    rescale=1/255,
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

In [20]:
batch_size = 16
image_gen_test = image_gen.flow_from_directory('../Final_Project/gest_images/test',
                                               target_size=(150, 150),
                                               batch_size=batch_size,
                                               class_mode='categorical')

Found 50 images belonging to 5 classes.


In [21]:
# Training batches for the gesture classifier, produced by image_gen from the
# per-class sub-directories of the train folder.
image_gen_train = image_gen.flow_from_directory(
    '../Final_Project/gest_images/train',
    class_mode='categorical',
    batch_size=batch_size,
    target_size=(150, 150),
)

Found 200 images belonging to 5 classes.


In [5]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Activation

In [6]:
# Small CNN for gesture classification: three conv/pool stages followed by a
# dense head with dropout.
model = Sequential()

# Only the first layer declares input_shape; in a Sequential model the later
# layers take their input shape from the previous layer's output.
model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=(150, 150, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())

model.add(Dense(128))
model.add(Activation('relu'))

model.add(Dropout(0.5))

# One softmax output per gesture class (the generators report 5 classes).
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [7]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 72, 72, 32)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 34, 34, 32)        9248      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 17, 17, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9248)              0         
__________

In [8]:
import warnings
# Suppresses every Python warning from this point on, not only the ones
# raised during training.
warnings.filterwarnings('ignore')

In [9]:
# Train the gesture classifier from the generators; the returned History
# object is kept in `result` for inspection.
result = model.fit_generator(
    image_gen_train,
    steps_per_epoch=100,
    epochs=10,
    validation_data=image_gen_test,
    validation_steps=12,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Training accuracy recorded for each epoch of the fit above.
result.history['acc']

[0.38406735751295334,
 0.5559895833333334,
 0.6764322916666666,
 0.7252604166666666,
 0.759765625,
 0.8011658031088082,
 0.8274739583333334,
 0.8235677083333334,
 0.8484455958549223,
 0.8704427083333334]

In [11]:
# Persist the trained gesture classifier to an HDF5 file in the working directory.
model.save('gesture_recognition.h5')

# Use gesture recognition model

In [14]:
# Reload the saved classifier so the cells below can run without retraining.
model = load_model('gesture_recognition.h5')

In [22]:
# Show the directory-name -> label-index mapping assigned by the train generator.
image_gen_train.class_indices

{'five': 0, 'like': 1, 'okay': 2, 'peace': 3, 'rock': 4}

In [23]:
# Gesture name for each label index, in the order reported by class_indices.
gesture_names = dict(enumerate(['five', 'like', 'okay', 'peace', 'rock']))

In [24]:
# Sanity check: classify the ten held-out 'rock' images one at a time.
for i in range(1, 11):
    img_path = '../Final_Project/gest_images/test/rock/' + str(i) + '.jpg'
    # Same preprocessing as training: 150x150, batch axis added, pixels / 255.
    img = image.img_to_array(image.load_img(img_path, target_size=(150, 150)))
    img = np.expand_dims(img, axis=0) / 255

    pred_array = model.predict(img)
    best_idx = np.argmax(pred_array)
    print('It is', gesture_names[best_idx], 'with probability', pred_array.max())

It is rock with probability 0.9654696
It is rock with probability 0.96290535
It is rock with probability 0.98637295
It is rock with probability 0.94601667
It is rock with probability 0.7545623
It is peace with probability 0.66241014
It is rock with probability 0.5782857
It is rock with probability 0.86349607
It is rock with probability 0.73312294
It is rock with probability 0.53770334


In [25]:
def predict_image(image):
    """Classify a gesture batch with `model` and report the outcome.

    # Argument:
        image: array passed to `model.predict` after conversion to float32
            and division by 255; the caller's array is not modified.

    # Returns
        (result, score): name from `gesture_names` for the arg-max output, and
            the highest value of the first prediction row rounded to two
            decimals.
    """
    batch = np.array(image, dtype='float32') / 255

    # Time only the forward pass.
    t0 = time.time()
    pred_array = model.predict(batch)
    elapsed = time.time() - t0

    best_idx = np.argmax(pred_array)
    result = gesture_names[best_idx]
    score = float("%0.2f" % (max(pred_array[0])))

    print('time: {0:.2f}s'.format(elapsed))
    print(f'pred_array: {pred_array}')
    print(f'class: {result}, score: {score}')
    print()
    return result, score

In [28]:
# Live demo: webcam frame split into three regions of interest.
#   Esc - quit
#   b   - create the background model (required before gesture recognition)
#   g   - classify the gesture in the green (upper-right) region
#   o   - run YOLO object detection on the red (lower-right) region
#   f   - classify the face in the blue (left) region
cap = cv2.VideoCapture(0)
num = 1
isBgCaptured = 0

try:
    while True:

        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera missing, busy or disconnected);
            # continuing would fail on frame.shape.
            print('Failed to read a frame from the camera.')
            break

        h = frame.shape[0]
        w = frame.shape[1]
        frame = cv2.flip(frame, 1)

        gest_rows = slice(60, int(0.5 * h))
        right_cols = slice(int(0.5 * w), w)
        frame_copy = frame[gest_rows, right_cols]
        frame_copy2 = frame[240:420, right_cols]
        frame_copy3 = frame[60:420, 0:int(0.5 * w)]
        cv2.rectangle(frame, (int(0.5 * w), 60), (w, int(0.5 * h)), (0, 255, 0), 2)
        cv2.rectangle(frame, (int(0.5 * w), 240), (w, 420), (0, 0, 255), 2)
        cv2.rectangle(frame, (0, 60), (int(0.5 * w), 420), (255, 0, 0), 2)

        if isBgCaptured == 1:
            frame_copy = remove_background(frame_copy)
            # Paste the mask back into exactly the rows it was cut from
            # (previously it was written one row lower), in all three channels.
            frame[gest_rows, right_cols] = frame_copy[:, :, np.newaxis]

        cv2.imshow("original", frame)

        k = cv2.waitKey(10)
        if k == 27:
            break
        elif k == ord('b'):
            # Background subtractor later used by remove_background.
            bgModel = cv2.createBackgroundSubtractorMOG2(0, 50)
            isBgCaptured = 1
        elif k == ord('g') and isBgCaptured == 1:
            # The mask is single-channel; the model expects three channels.
            target = np.stack((frame_copy,) * 3, axis=-1)
            target = cv2.resize(target, (150, 150))
            target = target.reshape(1, 150, 150, 3)
            prediction, score = predict_image(target)
            # putText draws in place; adding the result to the image again
            # would only add the image to itself.
            cv2.putText(frame_copy, f"{prediction} : {score}", (0, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
            cv2.imshow('gest_detection', frame_copy)
        elif k == ord('o'):
            frame_copy2 = detect_image(frame_copy2, yolo, all_classes)
            cv2.imshow("object_detection", frame_copy2)
        elif k == ord('f'):
            # OpenCV frames are BGR, while the face model was trained on
            # images loaded through Keras (RGB), so convert before predicting.
            target = cv2.cvtColor(frame_copy3, cv2.COLOR_BGR2RGB)
            target = cv2.resize(target, (150, 150))
            target = image.img_to_array(target)
            target = np.expand_dims(target, axis=0)
            prediction, score = predict_face(target)
            # Draw the label only; the former cv2.add of the image with
            # itself doubled every pixel value and washed out the window.
            cv2.putText(frame_copy3, f"{prediction} : {score}", (0, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
            cv2.imshow('face_detection', frame_copy3)
finally:
    # Always free the camera and close the windows, even after an error.
    cap.release()
    cv2.destroyAllWindows()

time: 0.36s
pred_array: [[6.63916653e-05 1.00941464e-01 6.44835532e-01 2.54156619e-01]]
class: Eduard, score: 0.64

time: 0.00s
pred_array: [[1.6125650e-04 2.4863054e-01 6.0382217e-01 1.4738601e-01]]
class: Eduard, score: 0.6

time: 0.00s
pred_array: [[6.5308479e-05 1.3240877e-01 6.5089619e-01 2.1662971e-01]]
class: Eduard, score: 0.65

time: 0.00s
pred_array: [[1.2297514e-04 2.9695520e-01 5.6289995e-01 1.4002186e-01]]
class: Eduard, score: 0.56

time: 0.00s
pred_array: [[0.00226959 0.32725343 0.62314737 0.04732965]]
class: Eduard, score: 0.62

time: 0.00s
pred_array: [[0.00189731 0.28763607 0.6603198  0.05014683]]
class: Eduard, score: 0.66

time: 0.00s
pred_array: [[0.00063829 0.40889904 0.55101675 0.03944592]]
class: Eduard, score: 0.55

time: 0.00s
pred_array: [[1.8172902e-04 2.3327929e-01 7.0885485e-01 5.7684142e-02]]
class: Eduard, score: 0.71

time: 0.00s
pred_array: [[1.4298796e-04 6.9481574e-02 8.8091338e-01 4.9462002e-02]]
class: Eduard, score: 0.88

time: 0.00s
pred_array: [