# Data

This notebook creates the masked images and binary mask segmentations from the RAFDB dataset using [MaskTheFace](https://github.com/aqeelanwar/MaskTheFace). Prerequisite: download the RAFDB dataset from [Kaggle](https://www.kaggle.com/datasets/shuvoalok/raf-db-dataset) and place it at the location configured in `RAFDB_PATH` below.

## Imports and Definitions

In [None]:
import os
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
# Root directory that holds every dataset used by this notebook.
DATA_PATH = './data'
# Original RAFDB images (the notebook works on its `train` and `test` sub-directories).
RAFDB_PATH = f'{DATA_PATH}/RAFDB'
# Masked versions of the RAFDB images.
MASKED_RAFDB_PATH = f'{DATA_PATH}/Masked_RAFDB'
# Binary mask segmentations derived from the masked/unmasked pairs.
BINARY_RAFDB_PATH = f'{DATA_PATH}/Binary_RAFDB'

## Flatten RAFDB train and test data

In [None]:
def flatten_data(src):
    """
    Given a directory src, unpack the contents of its sub-directories into the directory src.

    Every entry of each immediate sub-directory of ``src`` is moved up into
    ``src`` itself, after which the (now empty) sub-directory is removed.
    Entries that already sit directly in ``src`` are left untouched, so calling
    the function again on an already flattened directory does nothing.

    Args:
        src: Path of the directory to flatten.

    Raises:
        FileExistsError: If an entry to be moved would land on a name that
            already exists in ``src``. Nothing is overwritten; entries moved
            before the conflict was hit stay moved.
    """
    # Snapshot the sub-directories first: the listing of src changes while we move entries.
    subdir_names = [
        name for name in os.listdir(src)
        if os.path.isdir(os.path.join(src, name))
    ]

    for subdir_name in subdir_names:
        subdir_path = os.path.join(src, subdir_name)
        entries = os.listdir(subdir_path)
        for entry in tqdm(entries, desc=f'Unpacking sub-directory {subdir_name}'):
            target = os.path.join(src, entry)
            # os.rename silently replaces an existing file on POSIX; refuse
            # instead of losing data when two sub-directories share a file name.
            if os.path.lexists(target):
                raise FileExistsError(
                    f'Cannot move {entry!r} out of {subdir_path!r}: {target!r} already exists'
                )
            os.rename(os.path.join(subdir_path, entry), target)

        # The sub-directory is empty at this point, so rmdir succeeds.
        os.rmdir(subdir_path)

In [None]:
# Flatten both RAFDB splits in place.
for split in ('train', 'test'):
    flatten_data(f'{RAFDB_PATH}/{split}')

## Clone MaskTheFace

In [None]:
# Fetch the MaskTheFace tool into a `MaskTheFace` directory next to this notebook.
# NOTE: on a re-run git reports an error if that directory already exists and is not empty.
!git clone https://github.com/aqeelanwar/MaskTheFace.git
# Install the tool's requirements; %pip (rather than !pip) targets the running kernel's environment.
%pip install -r MaskTheFace/requirements.txt

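Run the following Python command in a terminal from inside the MaskTheFace directory, once for the train set and once for the test set. Each run generates a directory next to the input directory, named after it with the suffix `_masked` (e.g. `train_masked`), containing the masked images.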

`$ python mask_the_face.py --path <path-to-file-or-dir> --mask_type <type-of-mask> --verbose --write_original_image`

Example:

`$ python mask_the_face.py --path ../data/RAFDB/train --mask_type 'cloth' --color '#000000' --verbose --write_original_image`

`$ python mask_the_face.py --path ../data/RAFDB/test --mask_type 'cloth' --color '#000000' --verbose --write_original_image`

## Move the masked images to the Masked_RAFDB directory

In [None]:
# Create the destination directories for the masked images.
# os.makedirs(..., exist_ok=True) replaces the shell `mkdir` calls so the cell can be
# re-run without errors and does not depend on which shell executes the command.
for split in ('train', 'test'):
    os.makedirs(os.path.join(MASKED_RAFDB_PATH, split), exist_ok=True)

In [None]:
def move_files(src, dest):
    """Move the files of src to dest

    Every entry found directly inside ``src`` is moved into ``dest`` under the
    same name. ``src`` itself is left in place (empty afterwards).

    Args:
        src: Directory whose entries are moved.
        dest: Directory that receives the entries; it is created if it does not exist yet.
    """
    import shutil  # local import: only this function needs it

    os.makedirs(dest, exist_ok=True)
    for name in os.listdir(src):
        # shutil.move falls back to copy-and-delete when src and dest live on
        # different filesystems, where a plain os.rename would fail.
        shutil.move(os.path.join(src, name), os.path.join(dest, name))
        
# Relocate the MaskTheFace output of each split into the Masked_RAFDB tree.
for split in ('train', 'test'):
    move_files(f'{RAFDB_PATH}/{split}_masked', f'{MASKED_RAFDB_PATH}/{split}')

## Remove non-pairs of masked and unmasked images

Because MaskTheFace does not succeed on every image, we find and remove the images from the RAFDB dataset that could not be masked, so that every remaining unmasked image has a masked counterpart.

In [None]:
# Report how many files each directory holds (label, directory) in display order.
count_report = [
    ('Number of non-masked training images:', os.path.join(RAFDB_PATH, 'train')),
    ('Number of non-masked test images:', os.path.join(RAFDB_PATH, 'test')),
    ('Number of masked training images:', os.path.join(MASKED_RAFDB_PATH, 'train')),
    ('Number of masked test images:', os.path.join(MASKED_RAFDB_PATH, 'test')),
]
for label, directory in count_report:
    print(label, len(os.listdir(directory)))

In [None]:
def remove_invalid_images(unmasked_src, masked_src):
    """Removes unpaired images from unmasked_src directory if its pair is not found in masked_src directory

    Images are paired through the index embedded in their file names: the second
    underscore-separated token of the unmasked name (for example ``00001`` in a
    name shaped like ``train_00001_aligned.jpg``) must appear as a whole token
    of at least one file name in ``masked_src``. Whole-token comparison avoids
    the false matches of a substring test (``001`` is contained in ``1001``).

    Files in ``unmasked_src`` whose name carries no such index token are left
    untouched, because it cannot be decided whether they are paired.

    Args:
        unmasked_src: Directory of unmasked images; unpaired files are deleted from it.
        masked_src: Directory of masked images; it is only read.
    """
    # Gather every '_'/'.'-separated token of the masked file names once, so each
    # lookup below is a set membership test instead of a scan over all names.
    masked_tokens = set()
    for masked_file in os.listdir(masked_src):
        masked_tokens.update(masked_file.replace('.', '_').split('_'))

    for unmasked_file in os.listdir(unmasked_src):
        # Drop the extension first so it never sticks to the index token.
        stem_tokens = os.path.splitext(unmasked_file)[0].split('_')
        if len(stem_tokens) < 2 or not stem_tokens[1]:
            # No index in the name: never delete on a guess.
            continue

        # Remove un-paired file from unmasked_src
        if stem_tokens[1] not in masked_tokens:
            os.remove(os.path.join(unmasked_src, unmasked_file))

# Drop unmasked images without a masked counterpart, split by split.
for split in ('train', 'test'):
    remove_invalid_images(f'{RAFDB_PATH}/{split}', f'{MASKED_RAFDB_PATH}/{split}')

In [None]:
# Report the file count of each directory again (label, directory) in display order.
count_report = [
    ('Number of non-masked training images: ', os.path.join(RAFDB_PATH, 'train')),
    ('Number of non-masked test images: ', os.path.join(RAFDB_PATH, 'test')),
    ('Number of masked training images: ', os.path.join(MASKED_RAFDB_PATH, 'train')),
    ('Number of masked test images: ', os.path.join(MASKED_RAFDB_PATH, 'test')),
]
for label, directory in count_report:
    print(label, len(os.listdir(directory)))

## Create binary mask segmentations

In [None]:
# Create the destination directories for the binary masks.
# os.makedirs(..., exist_ok=True) replaces the shell `mkdir` calls so the cell can be
# re-run without errors and does not depend on which shell executes the command.
for split in ('train', 'test'):
    os.makedirs(os.path.join(BINARY_RAFDB_PATH, split), exist_ok=True)

In [None]:
def create_binary_masks(unmasked_src, masked_src, dest, affix):
    """Creates binary mask segmentations from the unmasked and masked images and writes the image to dest with filename prepended with the affix string

    The file names of both directories are sorted and paired by position, so
    the two directories must sort into the same order. For every pair the
    absolute grey-level difference is thresholded with Otsu's method and then
    dilated; the result is written to ``dest`` as ``{affix}_{index}_binary.jpg``
    with a 1-based index zero-padded to 5 digits. After the last pair, that
    pair and its mask are shown side by side in a matplotlib figure as a
    visual sanity check.

    Args:
        unmasked_src: Directory of unmasked images.
        masked_src: Directory of masked images.
        dest: Existing directory that receives the binary masks.
        affix: String placed at the start of every output file name.

    Raises:
        ValueError: If ``masked_src`` holds fewer files than ``unmasked_src``
            (raised before anything is written).
        OSError: If an image of a pair cannot be read.
    """
    # Sort the files so that positions line up between the two directories
    unmasked_all_files = sorted(os.listdir(unmasked_src))
    masked_all_files = sorted(os.listdir(masked_src))

    # Fail fast instead of hitting an IndexError half-way through the run.
    if len(masked_all_files) < len(unmasked_all_files):
        raise ValueError(
            f'{masked_src!r} holds {len(masked_all_files)} files but '
            f'{unmasked_src!r} holds {len(unmasked_all_files)}; cannot pair them'
        )

    # Nothing to process: skip the preview, which needs at least one processed pair.
    if not unmasked_all_files:
        return

    # A real 3x3 structuring element. A bare (3, 3) tuple is not one: OpenCV
    # converts it to a 2-element array rather than a 3x3 neighbourhood.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    pairs = zip(unmasked_all_files, masked_all_files)
    for index, (unmasked_file, masked_file) in enumerate(pairs, start=1):
        unmasked_path = os.path.join(unmasked_src, unmasked_file)
        masked_path = os.path.join(masked_src, masked_file)

        # Read masked and unmasked images (cv2.imread returns None on failure)
        unmasked_img = cv2.imread(unmasked_path)
        masked_img = cv2.imread(masked_path)
        if unmasked_img is None or masked_img is None:
            raise OSError(f'Could not read image pair: {unmasked_path!r}, {masked_path!r}')

        # Convert images to grayscale
        unmasked_grey_img = cv2.cvtColor(unmasked_img, cv2.COLOR_BGR2GRAY)
        masked_grey_img = cv2.cvtColor(masked_img, cv2.COLOR_BGR2GRAY)

        # Find absolute difference between image pair
        binary_mask = cv2.absdiff(masked_grey_img, unmasked_grey_img)
        # Apply threshold (Otsu picks the level, so the 0 passed here is ignored)
        _, binary_mask = cv2.threshold(binary_mask, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        # Apply dilation to close small holes in the mask region
        binary_mask = cv2.dilate(binary_mask, kernel=kernel, iterations=10)

        # Write binary mask to dest
        # NOTE: JPEG is lossy, so the stored mask may contain intermediate grey values.
        cv2.imwrite(os.path.join(dest, f'{affix}_{str(index).zfill(5)}_binary.jpg'), binary_mask)

    # Preview the last processed pair together with its mask.
    _, ax = plt.subplots(1, 3, figsize=(10, 5), subplot_kw={'xticks': [], 'yticks': []})

    # OpenCV loads images as BGR; matplotlib expects RGB.
    ax[0].imshow(cv2.cvtColor(unmasked_img, cv2.COLOR_BGR2RGB))
    ax[0].set_title('Unmasked')
    ax[1].imshow(cv2.cvtColor(masked_img, cv2.COLOR_BGR2RGB))
    ax[1].set_title('Masked')
    ax[2].imshow(binary_mask, cmap="gray")
    ax[2].set_title('Binary mask')
    
# Build the binary masks for both splits; the split name doubles as the file-name affix.
for split in ('train', 'test'):
    create_binary_masks(
        f'{RAFDB_PATH}/{split}',
        f'{MASKED_RAFDB_PATH}/{split}',
        f'{BINARY_RAFDB_PATH}/{split}',
        affix=split,
    )