## install needed libraries

## import libraries

In [None]:
import cv2
import argparse, os, random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from colour import Color
import easydict
from IPython.display import clear_output
import mediapipe as mp

import torch.nn as nn
import torch
import math

import os 
from numpy.lib.function_base import average

## Deep eye contact model 
A deep neural network trained to detect eye contact from a facial image.

In [None]:
# Global cuDNN backend flags, set once before any model is built or run.
# NOTE(review): benchmark=True and deterministic=True are both enabled here;
# PyTorch's reproducibility notes suggest benchmark=False when deterministic
# results are the goal — confirm which behaviour is actually intended.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True

def model_static(pretrained=False, **kwargs):
    """Build the eye-contact ResNet and optionally load saved weights.

    Args:
        pretrained: falsy to keep the freshly initialised weights, otherwise
            the path (or file-like object) of a saved state dict.
        **kwargs: forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed model. It is left on the CPU; callers move it to the
        device they want.
    """
    model = ResNet([3, 4, 6, 3], **kwargs)
    if pretrained:
        print ('loading saved model weights')
        model_dict = model.state_dict()
        # Deserialize onto the CPU so loading also works on hosts without a
        # GPU. load_state_dict() copies the values into the model's own
        # parameters, so the resulting model is the same either way.
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        snapshot = torch.load(f = pretrained, map_location=torch.device('cpu'))
        # Ignore checkpoint entries the model has no parameter/buffer for.
        snapshot = {k: v for k, v in snapshot.items() if k in model_dict}
        model_dict.update(snapshot)
        model.load_state_dict(model_dict)
    return model


class ResNet(nn.Module):
    """Bottleneck residual network that outputs a single eye-contact logit.

    ``layers`` gives the number of ``Bottleneck`` blocks in each of the four
    stages. Three linear heads are created (``fc_theta``, ``fc_phi``,
    ``fc_ec``) but ``forward`` only evaluates ``fc_ec``.
    """

    def __init__(self, layers):
        super().__init__()
        # Channel count feeding the next stage; updated by _make_layer.
        self.inplanes = 64

        # Stem: 7x7 stride-2 convolution followed by 3x3 stride-2 max pooling.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages, registered as layer1 .. layer4 in that order.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for idx, (planes, stride) in enumerate(stage_specs):
            stage = self._make_layer(planes, layers[idx], stride=stride)
            setattr(self, "layer%d" % (idx + 1), stage)

        self.avgpool = nn.AvgPool2d(7, stride=1)

        feature_dim = 512 * Bottleneck.expansion
        self.fc_theta = nn.Linear(feature_dim, 34)
        self.fc_phi = nn.Linear(feature_dim, 34)
        self.fc_ec = nn.Linear(feature_dim, 1)

        self.init_param()

    def init_param(self):
        """Re-initialise conv, batch-norm and linear parameters in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                rows, cols = module.weight.shape[0], module.weight.shape[1]
                module.weight.data.normal_(0, math.sqrt(2. / (rows * cols)))
                module.bias.data.zero_()

    def _make_layer(self, planes, blocks, stride = 1):
        """Build one stage of ``blocks`` Bottleneck units.

        The first unit receives ``stride`` and, when the shape changes, a 1x1
        conv + batch-norm projection for the residual path.
        """
        out_channels = planes * Bottleneck.expansion

        projection = None
        if stride != 1 or self.inplanes != out_channels:
            projection = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        units = [Bottleneck(self.inplanes, planes, stride, projection)]
        self.inplanes = out_channels
        for _ in range(1, blocks):
            units.append(Bottleneck(self.inplanes, planes))

        return nn.Sequential(*units)

    def forward(self, x):
        """Return the ``fc_ec`` output for a batch of images."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the linear head.
        x = x.view(x.size(0), -1)
        return self.fc_ec(x)

class Bottleneck(nn.Module):
    """Residual unit: 1x1 -> 3x3 -> 1x1 convolutions plus a skip connection.

    The output has ``planes * expansion`` channels. ``stride`` is applied by
    the 3x3 convolution; ``downsample`` (if given) transforms the input so it
    can be added to the convolution path.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the three conv stages, add the skip path, then ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Identity skip unless a projection was supplied.
        if self.downsample is None:
            shortcut = x
        else:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


## the main method 


In [None]:
def detect_eye_contact(video_path, model_weight, vis, display_off, save_text, output_path):
    """Score eye contact over a whole video and return a binary decision.

    Each frame is passed to MediaPipe face detection. Every detected face is
    cropped (with some padding), fed to the eye-contact network and turned
    into a score in [0, 1] with a sigmoid. A face counts as "eye contact"
    when its score is >= 0.9.

    Args:
        video_path: path of the video to analyse.
        model_weight: path of the "model_weights.pkl" checkpoint.
        vis: when True, write an annotated ``<video>_output.avi``.
        display_off: when True, do NOT display the annotated frames.
        save_text: when True, write ``<video>_output.csv`` holding one
            ``frame_number,decision`` row per detected face followed by a
            summary line.
        output_path: root output directory; files are written to a
            sub-folder named after the video's parent directory.

    Returns:
        1 when the mean of all face scores above 0.5 is greater than 0.85 and
        more than 75% of the detected faces scored above 0.5, otherwise 0
        (including when no face was detected at all).

    Raises:
        ValueError: if ``video_path`` is None.
        IOError: if the video cannot be opened.
    """
    if video_path is None:
        raise ValueError("we don't have video for display")

    video_stem = os.path.splitext(os.path.basename(video_path))[0]

    # Presenter id taken from names such as "C_presenter No1-groupB.mp4"
    # ("No1"); fall back to the file stem when the name has another layout.
    try:
        num_person = video_path.split('/')[-1].split(" ")[1].split("-")[0]
    except IndexError:
        num_person = video_stem

    # Face detection comes from the MediaPipe library.
    mp_face_detection = mp.solutions.face_detection

    # Visualisation settings: red (low score) to green (high score).
    red = Color("red")
    colors = list(red.range_to(Color("green"), 10))
    font = ImageFont.truetype("/content/drive/MyDrive/data/arial.ttf", 40)

    # One 0/1 decision per detected face, and the scores that exceeded 0.5.
    decisions = []
    confident_scores = []

    # Open the video BEFORE creating any output file: an empty CSV left
    # behind by a failed run would look like a finished result.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError("Error opening video stream or file: %s" % video_path)

    out_file = None
    outvid = None
    try:
        # Both outputs share one directory, so compute it if either is wanted.
        if save_text or vis:
            folder = os.path.basename(os.path.dirname(video_path))
            out_dir = os.path.join(output_path, folder)
            os.makedirs(out_dir, exist_ok=True)

        if save_text:
            outtext_name = os.path.join(out_dir, video_stem + '_output.csv')
            print(outtext_name)
            out_file = open(outtext_name, "w")

        if vis:
            outvis_name = os.path.join(out_dir, video_stem + '_output.avi')
            # VideoWriter needs integer frame dimensions.
            imwidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            imheight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            outvid = cv2.VideoWriter(outvis_name, cv2.VideoWriter_fourcc(*'MJPG'),
                                     10, (imwidth, imheight))

        # Pre-processing applied to every face crop.
        test_transforms = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        # model_static already loads the checkpoint; no second load needed.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model_static(model_weight)
        model.to(device)
        model.eval()

        frame_cnt = 0

        # Build the detector once for the whole video and run inference
        # without autograd bookkeeping.
        with torch.no_grad(), mp_face_detection.FaceDetection(
                model_selection=1, min_detection_confidence=0.5) as face_detection:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                height, width = frame.shape[:2]
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_cnt += 1

                boxes = []
                results = face_detection.process(frame)
                if results.detections:
                    for detection in results.detections:
                        rel_box = detection.location_data.relative_bounding_box
                        l, t = normaliz_pixel(rel_box.xmin, rel_box.ymin, width, height)
                        r = rel_box.width * width + l
                        b = rel_box.height * height + t

                        # Pad the box. Each line reuses the already-updated
                        # edge, exactly as before, so the crops fed to the
                        # network stay unchanged.
                        l -= (r - l) * 0.2
                        r += (r - l) * 0.2
                        t -= (b - t) * 0.2
                        b += (b - t) * 0.2

                        boxes.append([l, t, r, b])

                frame = Image.fromarray(frame)
                for box in boxes:
                    face = frame.crop(box)
                    img = test_transforms(face).unsqueeze(0)

                    # Forward pass; sigmoid maps the logit to [0, 1].
                    output = model(img.to(device))
                    score = torch.sigmoid(output).item()

                    # >= 0.9 means eye contact, anything lower means none.
                    y = 1 if score >= 0.9 else 0
                    decisions.append(y)
                    if score > 0.5:
                        confident_scores.append(score)

                    coloridx = min(int(round(score * 10)), 9)
                    draw = ImageDraw.Draw(frame)
                    draw_rectangle(draw, [(box[0], box[1]), (box[2], box[3])],
                                   outline=colors[coloridx].hex, width=5)
                    draw.text((box[0], box[3]), str(round(score, 2)),
                              fill=(255, 255, 255, 128), font=font)
                    if y == 1:
                        draw.text((10, 60), "eye contact", font=font, fill=(255, 255, 255, 255))
                    else:
                        draw.text((10, 60), "without eye contact", font=font, fill=(255, 255, 255, 255))
                    if save_text:
                        # Store the frame number and the eye-contact decision.
                        out_file.write("%d,%f\n" % (frame_cnt, y))

                # Back to an OpenCV (BGR) array for display / writing.
                frame = cv2.cvtColor(np.asarray(frame), cv2.COLOR_RGB2BGR)

                if not display_off:
                    clear_output(wait=True)
                    # Display on Colab (use cv2.imshow('', frame) on Jupyter).
                    cv2_imshow(frame)
                    key = cv2.waitKey(1) & 0xFF
                    if key == ord('q'):
                        break
                if vis:
                    outvid.write(frame)

        # Guard against videos in which no face was ever detected.
        total_faces = len(decisions)
        confidence = len(confident_scores) / total_faces if total_faces else 0.0
        score = float(np.mean(confident_scores)) if confident_scores else 0.0
        final_result = 1 if (score > 0.85 and confidence > 0.75) else 0

        if save_text:
            out_file.write("score of eye contact: %f ,confidence : %f\n" % (score, confidence))
            print(video_path, "score of eye contact: %f ,confidence : %f\n" % (score, confidence))
    finally:
        # Release every handle even when processing fails half-way.
        if out_file is not None:
            out_file.close()
        cap.release()
        if outvid is not None:
            outvid.release()

    print ('DONE!')

    # Record the presenter in the notebook-level results table once.
    if num_person in the_results["number person"]:
        print("exist")
    else:
        the_results["number person"].append(num_person)

    return final_result

## to run

In [None]:
# Notebook-level results table: the presenter ids plus one score column and
# one confidence column for each of the groups A-D.
the_results = {
    "number person": [],
    **{
        "%s: %s" % (group, metric): []
        for group in "ABCD"
        for metric in ("score", "confidence")
    },
}

# Path of the video to process.
groupC = '/content/drive/MyDrive/dataset of presentation scoring/Presenter No1-Asylah/C_presenter No1-groupB.mp4'

# Run settings (stand-in for command-line arguments).
# save_text / save_vis / display_off are plain booleans.
args = easydict.EasyDict(dict(
    video=groupC,
    model_weight="/content/drive/MyDrive/data/model_weights.pkl",
    save_vis=False,
    save_text=True,
    display_off=True,
))


Execute on one video


In [None]:
# Run the detector on the single video configured in `args`.
detect_eye_contact(
    video_path=args.video,
    model_weight=args.model_weight,
    vis=args.save_vis,
    display_off=args.display_off,
    save_text=args.save_text,
    output_path="/content/drive/MyDrive/The results of eye contact",
)

Execute on many videos


In [None]:
# Process every video of the dataset, skipping the ones that already have an
# output CSV from a previous run.

dataset_root = "/content/drive/MyDrive/dataset of presentation scoring"
output_path = '/content/drive/MyDrive/The results of eye contact/supervised'

found = get_videos(dataset_root)
# Another cell of this notebook unpacks get_videos() as (folders, videos);
# accept that shape as well as a plain sequence of video paths.
if isinstance(found, tuple) and len(found) == 2 and not isinstance(found[1], str):
  videos = found[1]
else:
  videos = found

for video in videos:
  # Name of the CSV that detect_eye_contact writes for this video.
  outtext_name = os.path.splitext(os.path.basename(video))[0] + '_output.csv'
  folder = os.path.basename(os.path.dirname(video))
  path = os.path.join(output_path, folder)
  outtext_name = os.path.join(path, outtext_name)
  print(outtext_name)

  # Do not process a video twice.
  if os.path.isfile(outtext_name):
    print("already is runing")
    continue

  detect_eye_contact(video, args.model_weight, args.save_vis, args.display_off,
                     args.save_text, output_path)

In [None]:

# Path of the dataset on the Drive.
# NOTE(review): the path ends with a space — presumably the folder really is
# named that way; confirm before "fixing" it.

data_path = "/content/drive/MyDrive/Eye contact data "
# get_videos is unpacked here into two values: folders and videos.
folders ,videos = get_videos(data_path)
print(folders)
# Bare expression: shown as the cell output when run in a notebook.
videos

In [None]:
# Convert the results dictionary to a DataFrame and save it as CSV.
# The columns can have different lengths (e.g. presenter ids are recorded
# while the score columns are still empty), which pd.DataFrame rejects, so
# pad every column with None up to the longest one first.
import os
n_rows = max((len(column) for column in the_results.values()), default=0)
padded_results = {
    name: list(column) + [None] * (n_rows - len(column))
    for name, column in the_results.items()
}
df = pd.DataFrame(padded_results)
os.makedirs('/content/drive/MyDrive/', exist_ok=True)
df.to_csv('/content/drive/MyDrive/result_of_eye_contact.csv')


In [None]:
import cv2
import time
import mediapipe as mp
mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

# For static images:
# IMAGE_FILES = []
# with mp_face_detection.FaceDetection(
#     model_selection=1, min_detection_confidence=0.5) as face_detection:
#   for idx, file in enumerate(IMAGE_FILES):
#     image = cv2.imread(file)
#     # Convert the BGR image to RGB and process it with MediaPipe Face Detection.
#     results = face_detection.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

#     # Draw face detections of each face.
#     if not results.detections:
#       continue
#     annotated_image = image.copy()
#     for detection in results.detections:
#       print('Nose tip:')
#       print(mp_face_detection.get_key_point(
#           detection, mp_face_detection.FaceKeyPoint.NOSE_TIP))
#       mp_drawing.draw_detection(annotated_image, detection)
#     cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image)


# Face-detection smoke test on a video file (not a webcam).
cap = cv2.VideoCapture("/content/drive/MyDrive/test_eye_c.mp4")
try:
  with mp_face_detection.FaceDetection(
      model_selection=0, min_detection_confidence=0.5) as face_detection:
    while cap.isOpened():
      success, image = cap.read()
      if not success:
        # A failed read on a file means the video is over; `continue` here
        # would spin forever because every later read fails as well.
        print("End of video stream.")
        break

      # To improve performance, optionally mark the image as not writeable to
      # pass by reference.
      image.flags.writeable = False
      image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      results = face_detection.process(image)

      print(results)
      # Draw the face detection annotations on the image.
      image.flags.writeable = True
      image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

      if results.detections:
        print(results.detections[0].location_data.relative_bounding_box.xmin)
        # Pause so the printed value can be read before the next frame.
        time.sleep(0.5)
        for detection in results.detections:
          mp_drawing.draw_detection(image, detection)
      # Flip the image horizontally for a selfie-view display.
      # clear_output(wait=True)
      # cv2_imshow(image)
      # cv2.imshow('MediaPipe Face Detection', cv2.flip(image, 1))
      if cv2.waitKey(5) & 0xFF == 27:
        break
finally:
  # Release the capture even if detection raises.
  cap.release()