# **Task 1 | Mask Recognition**

***Goal :*** *Detect human faces on videos and check whether or not they have a mask on*. 

In this notebook we implement two different models to perform the task :
- [Faster-RCNN (ResNet50)](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html)

___
___

## **1. INITIALIZATION**

### *1.1 IMPORTS*

In [1]:
from IPython.display import display, clear_output
import cv2
import os
import pandas as pd
import random as rd

from tools import engine, utils

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

import torch
import torchvision

In [2]:
# to fill the `requirement.txt` file we use the following line of code:
import session_info
session_info.show()

In [3]:
torch.cuda.empty_cache()

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))

device: cuda


### *1.3. DATA LOADING*

In [4]:
!python ./DataPreprocessing.py
# data preprocessing
clear_output()

 36%|███▌      | 303/853 [00:07<00:09, 57.12it/s]
 36%|███▌      | 309/853 [00:07<00:11, 48.18it/s]
 37%|███▋      | 315/853 [00:07<00:10, 49.17it/s]
 38%|███▊      | 321/853 [00:07<00:11, 45.64it/s]
 38%|███▊      | 326/853 [00:07<00:12, 42.19it/s]
 39%|███▉      | 332/853 [00:07<00:11, 46.33it/s]
 40%|███▉      | 338/853 [00:07<00:10, 47.05it/s]
 40%|████      | 344/853 [00:07<00:10, 49.35it/s]
 41%|████      | 350/853 [00:08<00:10, 48.10it/s]
 42%|████▏     | 357/853 [00:08<00:09, 52.82it/s]
 43%|████▎     | 363/853 [00:08<00:09, 49.17it/s]
 43%|████▎     | 369/853 [00:08<00:09, 49.65it/s]
 44%|████▍     | 375/853 [00:08<00:09, 49.51it/s]
 45%|████▍     | 381/853 [00:08<00:09, 49.05it/s]
 45%|████▌     | 386/853 [00:08<00:10, 44.69it/s]
 46%|████▌     | 392/853 [00:08<00:09, 46.77it/s]
 47%|████▋     | 397/853 [00:09<00:11, 39.50it/s]
 47%|████▋     | 402/853 [00:09<00:11, 38.21it/s]
 48%|████▊     | 408/853 [00:09<00:10, 42.10it/s]
 49%|████▊     | 414/853 [00:09<00:09, 45.81it/s]


In [5]:
data_dir_path = "data/FaceMaskDetection_Processed/" # path to the directory with the relevant data
images_dir_path = data_dir_path + "images/"         # path to the directory with the images
images_files = os.listdir(images_dir_path)           # list of files in the image directory

annotations = pd.read_csv(data_dir_path + "annotations.csv", index_col=None) # dataframe with information about the images and their bounding boxes
display(annotations)

Unnamed: 0,image_id,image_height,image_width,box_id,box_label,xmin,xmax,ymin,ymax
0,0,366,512,0,0,79,109,105,142
1,0,366,512,1,2,185,226,100,144
2,0,366,512,2,0,325,360,90,141
3,1,73,60,0,0,15,45,18,54
4,2,73,60,0,0,15,45,18,54
...,...,...,...,...,...,...,...,...,...
9395,6176,216,184,0,1,46,138,54,162
9396,6177,216,184,0,1,46,138,54,162
9397,6178,216,184,0,1,46,138,54,162
9398,6179,216,184,0,1,46,138,54,162


In [6]:
annotations["box_label"].value_counts()

0    3585
2    3232
1    2583
Name: box_label, dtype: int64

In [7]:
class FaceMaskDataset(Dataset):

    def __init__(self, annotations, images_dir_path, images_files):
        self.annotations = annotations
        self.images_dir_path = images_dir_path
        self.images_files = images_files
    
    def __len__(self):
        return len(self.images_files)
    
    def __getitem__(self, idx):

        img = cv2.imread(self.images_dir_path+self.images_files[idx])
        img = transforms.Resize((256,256))(torch.Tensor(img).permute(2,0,1))

        img_id = int(self.images_files[idx][:-4])
        img_annotations = self.annotations[self.annotations["image_id"] == img_id]

        img_height = int(list(img_annotations["image_height"])[0])
        img_width = int(list(img_annotations["image_width"])[0])
        xmins = [256*xmin/img_width for xmin in list(img_annotations["xmin"])]
        ymins = [256*ymin/img_height for ymin in list(img_annotations["ymin"])]
        xmaxs = [256*xmax/img_width for xmax in list(img_annotations["xmax"])]
        ymaxs = [256*ymax/img_height for ymax in list(img_annotations["ymax"])]

        target =  {
            "boxes": torch.as_tensor([[xmins[i], ymins[i], xmaxs[i], ymaxs[i]] for i in range(len(img_annotations))], dtype=torch.float32),
            "labels": torch.as_tensor(list(img_annotations["box_label"]), dtype=torch.int64),
            "image_id": torch.as_tensor([img_id]),
            "area": torch.as_tensor([(xmaxs[i]-xmins[i])*(ymaxs[i]-ymins[i]) for i in range(len(img_annotations))], dtype=torch.float32),
            "iscrowd": torch.zeros((len(img_annotations),), dtype=torch.int64)
        }

        return img, target

90% of the whole dataset is dedicated to training and the 10% left is used as a test dataset.

In [8]:
FMD = FaceMaskDataset(annotations, images_dir_path, images_files)

train_ratio = 0.9
trainset, testset = torch.utils.data.random_split(FMD, [int(train_ratio*len(FMD)), len(FMD)-int(train_ratio*len(FMD))])

batch_size = 2

trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=utils.collate_fn)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=True, collate_fn=utils.collate_fn)

___

## **2. THE MODELS**

### *2.1 Faster-RCNN (ResNet50)*

#### Defining the models

In [9]:
def get_rcnn_model(nb_classes):
    # load a model pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, nb_classes)
    return model

In [10]:
modelRCNN = get_rcnn_model(nb_classes=3)
modelRCNN.to(device)
try:
    modelRCNN.load_state_dict(torch.load("./models/MaskRecognitionFasterRCNN.pt"))
    print("model loaded")
except:
    print("new model")
    pass
modelRCNN.eval()

model loaded


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

#### Training the model

In [11]:
params = [p for p in modelRCNN.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

num_epochs = 30

In [12]:
for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    engine.train_one_epoch(modelRCNN, optimizer, trainloader, device, epoch, print_freq=len(trainloader)//3)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    engine.evaluate(modelRCNN, testloader, device=device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Epoch: [0]  [   0/2781]  eta: 1:41:10  lr: 0.000001  loss: 0.0405 (0.0405)  loss_classifier: 0.0096 (0.0096)  loss_box_reg: 0.0268 (0.0268)  loss_objectness: 0.0000 (0.0000)  loss_rpn_box_reg: 0.0040 (0.0040)  time: 2.1830  data: 0.0270  max mem: 1619
Epoch: [0]  [ 927/2781]  eta: 0:17:00  lr: 0.000464  loss: 0.0509 (0.0703)  loss_classifier: 0.0159 (0.0220)  loss_box_reg: 0.0360 (0.0418)  loss_objectness: 0.0002 (0.0019)  loss_rpn_box_reg: 0.0014 (0.0046)  time: 0.5576  data: 0.0229  max mem: 1881
Epoch: [0]  [1854/2781]  eta: 0:08:39  lr: 0.000500  loss: 0.0865 (0.0746)  loss_classifier: 0.0309 (0.0222)  loss_box_reg: 0.0427 (0.0442)  loss_objectness: 0.0047 (0.0030)  loss_rpn_box_reg: 0.0026 (0.0052)  time: 0.5724  data: 0.0250  max mem: 1881
Epoch: [0]  [2780/2781]  eta: 0:00:00  lr: 0.000500  loss: 0.0552 (0.0742)  loss_classifier: 0.0166 (0.0219)  loss_box_reg: 0.0387 (0.0428)  loss_objectness: 0.0001 (0.0038)  loss_rpn_box_reg: 0.0006 (0.0056)  time: 0.5642  data: 0.0241  max me

KeyboardInterrupt: 

#### Saving the model

In [13]:
torch.save(modelRCNN.state_dict(), "./models/MaskRecognitionFasterRCNN.pt")

#### Testing the model

In [14]:
modelRCNN = get_rcnn_model(nb_classes=3)
modelRCNN.to(device)
modelRCNN.load_state_dict(torch.load("./models/MaskRecognitionFasterRCNN.pt"))
modelRCNN.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [26]:
def show_random_results(model, nb_images):

    img_ids = [0,4,5,6,7,10,11]

    for img_id in img_ids:

        img = cv2.imread(images_dir_path+"{}.png".format(img_id))
        cv2.imshow("before | {}.png".format(img_id), img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        h,w,c = img.shape
        overlay = img.copy()
        output = img.copy()

        model_input = transforms.Resize((256,256))(torch.Tensor(img).permute(2,0,1))
        model_input = model_input.reshape((1,3,256,256)).to(device)
        target = model(model_input)[0]

        for i in range(len(target["boxes"])):

            box = target["boxes"][i]
            label = int(target["labels"][i])
            xmin = int(w*box[0]/256)
            ymin = int(h*box[1]/256)
            xmax = int(w*box[2]/256)
            ymax = int(h*box[3]/256)
            
            if label == 0:
                print(0)
                cv2.rectangle(overlay, (xmin,ymin), (xmax,ymax), (0,0,255), 2)
            
            elif label == 1:
                print(1)
                cv2.rectangle(overlay, (xmin,ymin), (xmax,ymax), (0,200,200), 2)
            
            elif label == 2:
                print(2)
                cv2.rectangle(overlay, (xmin,ymin), (xmax,ymax), (0,255,0), 2)

        output = cv2.addWeighted(overlay, 0.5, output, 0.5, 0, output)        
        cv2.imshow("after | maksssksksss{}.png".format(img_id), output)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

In [27]:
for i in range(len(FMD)):
    break
    t = modelRCNN(FMD[i][0].reshape(1,3,256,256).to(device))[0]
    if 1 in t["labels"]:
        print(i)

In [28]:
show_random_results(modelRCNN, 1)

1


___
___
