cfal.py

from statistics import mode
import imutils
import cv2
import numpy as np
from imutils.video import VideoStream
import time

from preprocessor import preprocess_input

from imutils.face_utils import FaceAligner
from imutils.face_utils import rect_to_bb
import dlib


shape_predictor = "shape_predictor_68_face_landmarks.dat"
predictor = dlib.shape_predictor(shape_predictor)

from mtcnn.mtcnn import MTCNN
detector = MTCNN()


# Support functions

def get_labels(dataset_name):
    if dataset_name == 'KDEF':
        return {0: 'AN', 1: 'DI', 2: 'AF', 3: 'HA', 4: 'SA', 5: 'SU', 6: 'NE'}
    else:
        raise Exception('Invalid dataset name')


def detect_faces(detection_model, gray_image_array, conf):
    frame = gray_image_array
    # Grab frame dimention and convert to blob
    (h,w) =  frame.shape[:2]
    # Preprocess input image: mean subtraction, normalization
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0,
    (300, 300), (104.0, 177.0, 123.0))
    # Set read image as input to model
    detection_model.setInput(blob)

    # Run forward pass on model. Receive output of shape (1,1,no_of_predictions, 7)
    predictions = detection_model.forward()
    coord_list = []
    count = 0
    for i in range(0, predictions.shape[2]):
        confidence = predictions[0,0,i,2]
        if confidence > conf:
            # Find box coordinates rescaled to original image
            box_coord = predictions[0,0,i,3:7] * np.array([w,h,w,h])
            conf_text = '{:.2f}'.format(confidence)
            # Find output coordinates
            xmin, ymin, xmax, ymax = box_coord.astype('int')
            coord_list.append([xmin, ymin, (xmax-xmin), (ymax-ymin)])
            
        print('Coordinate list:', coord_list)

    return coord_list


def draw_text(coordinates, image_array, text, color, x_offset=0, y_offset=0,
                                                font_scale=2, thickness=2):
    x, y = coordinates[:2]
    cv2.putText(image_array, text, (x + x_offset, y + y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                font_scale, color, thickness, cv2.LINE_AA)


def draw_bounding_box(face_coordinates, image_array, color, identity):
    x, y, w, h = face_coordinates
    cv2.rectangle(image_array, (x, y), (x + w, y + h), color, 2)
    cv2.putText(image_array, str(identity), (x+5,y-5), font, 1, (255,255,255), 2)


def apply_offsets(face_coordinates, offsets):
    x, y, width, height = face_coordinates
    x_off, y_off = offsets
    return (x - x_off, x + width + x_off, y - y_off, y + height + y_off)


def load_detection_model(prototxt, weights):
    detection_model = cv2.dnn.readNetFromCaffe(prototxt, weights)
    return detection_model


# parameters for loading data and images
prototxt = 'trained_models/deploy.prototxt.txt'
weights = 'trained_models/res10_300x300_ssd_iter_140000.caffemodel'

font = cv2.FONT_HERSHEY_SIMPLEX

frame_window = 10
face_offsets = (30, 40)
emotion_offsets = (20, 40)
confidence = 0.6

# loading models
face_detection = load_detection_model(prototxt, weights)
# face_detection_size = (40, 40)
counter = 0
# frame_process_counter = 0

def crop_face(file_name, face_detection, name_count):

    face_detection_size = (40, 40)
    counter = 0
    frame_process_counter = 0

    # starting video streaming
    cv2.namedWindow('Attendence_Tracker', cv2.WINDOW_NORMAL)
    # cv2.namedWindow('Attendence_Tracker')
    # file_name = '../top10/person1.mp4'
    video_capture = cv2.VideoCapture(file_name)

    time.sleep(1.0)

    while (video_capture.isOpened()):
        ret, bgr_image = video_capture.read()
        if ret == False:
            break
        counter += 1
        if counter % 1 == 0:
            frame_process_counter += 1
            gray_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
            rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
            faces = detect_faces(face_detection, bgr_image,confidence)
            count = 0
            for face_coordinates in faces:
                x1, x2, y1, y2 = apply_offsets(face_coordinates, face_offsets)
                rgb_face = rgb_image[y1:y2, x1:x2]
        
                print("len", len(rgb_face))
                # print(rgb_face)
                if len(rgb_face) != 0 and counter % 1 ==0:
                    print(detector.detect_faces(rgb_face))
                    dict_mtcnn =  detector.detect_faces(rgb_face)
                    if len(dict_mtcnn) != 0:
                        bounding_box = dict_mtcnn[0]['box']
                        new_image = rgb_image[bounding_box[2]:bounding_box[3], bounding_box[0]:bounding_box[1]]
                        cv2.rectangle(new_image,
                          (bounding_box[0], bounding_box[1]),
                          (bounding_box[0]+bounding_box[2], bounding_box[1] + bounding_box[3]),
                          (0,155,255), 2)

                        # cv2.imwrite("align/align_{}/align_{}_{}".format(name_count, name_count,counter) + ".jpg", cv2.cvtColor(img, cv2.COLOR_RGB2BGR))    
                    cv2.imwrite("align/emp_{}/emp_{}_{}".format(name_count, name_count,counter) + ".jpg", cv2.cvtColor(rgb_face, cv2.COLOR_RGB2BGR))
                    print("image saved-------------------", counter)              
                count += 1
                try:
                    rgb_face = cv2.resize(rgb_face, (face_detection_size))
                except:
                    continue
                rgb_face = np.expand_dims(rgb_face, 0)
                rgb_face = preprocess_input(rgb_face, False)

                # Bounding box color            
                color = (255, 0, 0)
                identity = "this is me"
                draw_bounding_box(face_coordinates, rgb_image, color, identity)
            bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
            cv2.imshow('Attendence_Tracker', bgr_image)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            print('Total frames processed:', counter, frame_process_counter)
            break
    video_capture.release()
    # out.release()
    cv2.destroyAllWindows()

    return "successful"