# Preprocess

Load the raw dataset, extract labels from filenames, resize images to 96x96 (from 96x103) so that they can be fed into a CNN, and save processed arrays.

<a name='1'></a>
## 1 - Packages

In [66]:
import cv2
import matplotlib.pyplot as plt
import numpy as np

import glob, os

<a name='2'></a>
## 2 - Processing

<a name='2.1'></a>
### 2.1 - Image
Loads a grayscale image and resizes it to 96x96.

In [67]:
def process_image(path):
    """Load the image at *path* as grayscale and resize it to 96x96.

    Args:
        path: Path of the image file to read.

    Returns:
        The resized single-channel image returned by ``cv2.resize``.

    Raises:
        OSError: If OpenCV could not read the file (missing, unreadable
            or not a supported image format).
    """
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on failure, it returns None; without this
    # check the problem only shows up as an opaque error inside cv2.resize.
    if img is None:
        raise OSError("Could not read image: {!r}".format(path))
    # dsize is (width, height).
    return cv2.resize(img, (96, 96))

<a name='2.2'></a>
### 2.2 - Labels  
Extracts label information (subject ID, gender, side, finger) from the filename. Filenames from the altered datasets contain one extra underscore-separated field at the end, so they are parsed slightly differently from the real ones.

In [68]:
def process_label(path, altered = True):
    """Parse the label fields encoded in an image filename.

    The filename (without directory and extension) is expected to look like
    ``<subject_id>__<gender>_<side>_<finger>_<x>`` for real images and
    ``<subject_id>__<gender>_<side>_<finger>_<x>_<y>`` for altered images;
    the trailing field(s) are ignored.

    Args:
        path: Path (or bare filename) of the image.
        altered: True if the file follows the altered-dataset naming, which
            has one more trailing field than the real-dataset naming.

    Returns:
        ``np.ndarray`` of dtype ``uint16`` holding
        ``[subject_id, gender, side, finger]`` where gender is 0 for ``'M'``
        and 1 otherwise, side is 0 for ``'Left'`` and 1 otherwise, and finger
        is 0..4 for thumb, index, middle, ring, little.

    Raises:
        ValueError: If the filename does not match the expected layout, the
            subject id is not an integer, or the finger name is unknown.
    """
    finger_codes = {
        'thumb': 0,
        'index': 1,
        'middle': 2,
        'ring': 3,
        'little': 4,
    }

    # Use the function's own argument, not a variable from the calling scope.
    filename, _ = os.path.splitext(os.path.basename(path))
    subject_id, etc = filename.split('__')

    fields = etc.split('_')
    expected = 5 if altered else 4
    if len(fields) != expected:
        raise ValueError(
            "Expected {} label fields in {!r}, found {}".format(
                expected, filename, len(fields)))
    gender, side, finger = fields[:3]

    # Convert categorical info to integers
    gender = 0 if gender == 'M' else 1
    side = 0 if side == 'Left' else 1
    if finger not in finger_codes:
        raise ValueError(
            "Unknown finger name {!r} in {!r}".format(finger, filename))
    finger = finger_codes[finger]

    return np.array([int(subject_id), gender, side, finger], dtype=np.uint16)

<a name='2.3'></a>
### 2.3 - Dataset locations 
Different folders for real and altered images, categorized by difficulty.

In [69]:
# Each pair is (sub-folder of the raw dataset, tag used in the output file
# names). The two lists are index-aligned: img_dir[k] is saved under out_tag[k].
img_dir, out_tag = map(list, zip(
    ("Real", "real"),
    ("Altered/Altered-Easy/", "easy"),
    ("Altered/Altered-Medium/", "medium"),
    ("Altered/Altered-Hard/", "hard"),
))

<a name='2.4'></a>
### 2.4 - Loop through  
For each folder:
 - Load and process each image
 - Extract corresponding labels
 - Save images and labels as .npy files

In [None]:
# np.save does not create missing directories, so make sure the output
# folder exists before writing anything.
os.makedirs('dataset/processed', exist_ok=True)

for i_key, i_path in enumerate(img_dir):
    # Sorted so that the row order of the saved arrays is reproducible.
    img_list = sorted(glob.glob('dataset/raw/'+ i_path +'/*.BMP'))

    # An empty match almost always means a wrong working directory or path.
    # Skip instead of writing empty arrays and then failing on labels[0].
    if not img_list:
        print('No .BMP images found in dataset/raw/' + i_path + ' - skipping')
        continue

    # Pre-allocate one 96x96 image and one 4-field label row per file.
    imgs = np.empty((len(img_list), 96, 96), dtype=np.uint8)
    labels = np.empty((len(img_list), 4), dtype=np.uint16)

    for i, img_path in enumerate(img_list):
        imgs[i] = process_image(img_path)
        # Only the first folder (index 0) uses the real-image filename format.
        labels[i] = process_label(img_path, i_key != 0)

    # np.save appends the .npy extension to these names.
    np.save('dataset/processed/x_'+out_tag[i_key], imgs)
    np.save('dataset/processed/y_'+out_tag[i_key], labels)

    # Quick visual sanity check: show the first image with its label as title.
    plt.figure(figsize=(3, 3))
    plt.title(labels[0])
    plt.imshow(imgs[0], cmap='gray')