# Pain Data Preparation
This notebook prepares the pain dataset so that a convolutional neural network can be trained on it successfully. Data augmentation techniques such as greyscaling, histogram equalization, etc. are employed.

In [2]:
# Relevant imports
import os
import sys
import numpy as np
import pandas as pd

# Resolve the parent of the current working directory and make sure it is on
# the import path, so the project-local `Scripts` package can be imported.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
from Scripts import Data_Loader_Functions as dL
from Scripts import Image_Processor as IP

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Define folder paths: one directory per pipeline stage, all located under
# <module_path>/Data.
RAW_DATA, PREPROCESSED_DATA, AUGMENTED_DATA = (
    os.path.join(module_path, "Data", stage_dir)
    for stage_dir in ("Raw Data", "Preprocessed Data", "Augmented Data")
)
# File extension handed to the data-loader / image-processor helpers
image_format = '.png'

In [4]:
def allocate_group(df, path):
    """Move every image listed in ``df['img_path']`` into the directory ``path``.

    The destination directory is created first if it does not exist yet,
    including any missing parent directories. Each file keeps its base name;
    only its parent directory changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``img_path`` column holding the current file paths.
    path : str
        Destination directory for the files.

    Raises
    ------
    OSError
        Propagated from ``os.makedirs`` / ``os.rename`` when the directory
        cannot be created or a file cannot be moved (e.g. missing source).
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also
    # creates missing parents and does not fail if the directory already exists.
    os.makedirs(path, exist_ok=True)

    for f_path in df['img_path'].values:
        os.rename(f_path, os.path.join(path, os.path.basename(f_path)))

# Step 1: Pre-Process and Augment Images


In [19]:
# Mirror Folder structure: call the mirroring helper once per derived-data
# root, always using RAW_DATA as the source tree.
print("Mirror Folder structure")
for target_root in (PREPROCESSED_DATA, AUGMENTED_DATA):
    dL.mirror_folder_structure(RAW_DATA, target_root)

Mirror Folder structure


KeyboardInterrupt: 

In [None]:
# Pre-process images: pass the raw and pre-processed roots plus the image
# extension to the bulk processor and keep its return value in `last_file`
# (presumably the last file handled -- confirm in Image_Processor).
print("Pre-process Images")
last_file = IP.bulk_process_images(
    RAW_DATA,
    PREPROCESSED_DATA,
    image_format,
)

In [10]:
# Flip images and copy originals into augmented data folder.
# Both passes share every argument except the augmentation mode, so they are
# issued from one loop; "flip" still runs before "original".
print("Flip Images")
for augmentation_mode in ("flip", "original"):
    IP.bulk_augment_images(
        PREPROCESSED_DATA,
        AUGMENTED_DATA,
        image_format,
        augmentation_mode,
        "pain",
        label_threshold=-1,
    )

Flip Images


In [11]:
# Rotate Originals and flipped images, and ensure that naming conventions stay consistent
# Extra: Crops down rotated images to (215,215), otherwise we had images with smaller width, which caused issues later
print("Rotate Images")
# `image_format` is passed directly: the former `format(image_format)` call was
# the built-in format() with no format spec, which just returns the same string.
IP.bulk_augment_images(AUGMENTED_DATA, AUGMENTED_DATA, image_format, "rotate_crop", "pain", label_threshold=-1)
# Earlier per-suffix variants (disabled):
# IP.bulk_augment_images(AUGMENTED_DATA, AUGMENTED_DATA, "_flipped{}".format(image_format), "rotate_crop", "pain", label_threshold=-1)
# IP.bulk_augment_images(AUGMENTED_DATA, AUGMENTED_DATA, "_original{}".format(image_format), "rotate_crop", "pain", label_threshold=-1)
IP.bulk_rename_files(AUGMENTED_DATA, AUGMENTED_DATA, "_rotated", "_straight")


Rotate Images


In [12]:
# Crop images to same maximum width and height. The 10-degree rotation in the
# previous step cropped rotated images down to (215, 215), so that size is
# used as the common maximum width/height here.
print("Crop Images")
max_crop_size = (215, 215)
IP.bulk_crop_images(AUGMENTED_DATA, AUGMENTED_DATA, max_crop_size, image_format)

Crop Images


# Step 2: Reset Folder Structure

In [14]:
# Moving all images into the "raw" subfolder
# Operates on the augmented-data root and is restricted to files with the
# `image_format` extension via `ext` (presumably flattens the per-subject
# folder tree -- confirm in Data_Loader_Functions.reset_to_raw).
dL.reset_to_raw(AUGMENTED_DATA, ext=image_format)

KeyboardInterrupt: 

In [15]:
# Deleting all empty folders
# Runs on the augmented-data root (presumably removes the directories left
# empty by the previous move step -- confirm in Data_Loader_Functions).
dL.delete_empty_folders(AUGMENTED_DATA)

# Step 3: Load DataFrame

In [16]:
# Get all image paths and corresponding labels into a dataframe
# Collect the paths of all `image_format` files under AUGMENTED_DATA.
img_paths = np.array(dL.get_image_paths(AUGMENTED_DATA,ext=image_format))
# One label row per image path; assumed to yield the seven fields named in the
# `columns` list below, in that order -- TODO confirm against dL.get_labels.
labels = np.array(dL.get_labels(img_paths))
df = pd.DataFrame(labels, columns=['Person','Session','Culture','Frame','Pain', 'Trans_1', 'Trans_2'])
# Cast the five numeric label columns to int.
df[['Person','Session','Culture','Frame','Pain']] = df[['Person','Session','Culture','Frame','Pain']].astype(int)
# Attach the source path of each image; relies on `labels` having the same
# row order as `img_paths`.
df['img_path'] = img_paths
# Keep the transformation tags and the path as plain strings.
df[['Trans_1', 'Trans_2', 'img_path']] = df[['Trans_1', 'Trans_2', 'img_path']].astype(str)
# Order rows by Person, Session and Frame ascending, then Trans_1 / Trans_2
# descending, and renumber the index from 0.
df = df.sort_values(['Person', 'Session', 'Frame', 'Trans_1', 'Trans_2'], ascending=[True, True, True, False, False]).reset_index(drop=True)
# Frame identifier built by concatenating Person, Session and Frame as strings.
# NOTE(review): there is no separator, so different triples can produce the
# same string (e.g. "43"+"1"+"23" and "43"+"12"+"3") -- confirm whether
# temp_id is expected to be unique before relying on it.
df['temp_id'] = df['Person'].astype(str) + df['Session'].astype(str) + df['Frame'].astype(str)

#### Step 3.1: Remove Subject 101 from the data
Subject 101 only has negative examples (label "0") and will therefore show "0%" on metrics like "Recall" or "Precision", skewing output graphs.

In [17]:
# Proving that subject 101 only has 0 labels: sum the 'Pain' column over the
# subject's rows and print the total.
subject = 101
pain_label_total = df.loc[df['Person'] == subject, 'Pain'].sum()
print("# Pain Labels Subject {} : ".format(subject), pain_label_total)

# Pain Labels Subject 101 :  0


In [18]:
# Removing subject 101 from the data: keep only the rows whose 'Person'
# value differs from 101.
df = df.loc[df['Person'].ne(101)]

# Step 4: Redistribute Data for Training

In [19]:
# Split Data into two groups: df_1 holds the subjects listed in group_1,
# df_2 holds every remaining subject.
group_1 = [42, 47, 49, 66, 95, 97, 103, 106, 108, 121, 123, 124]
in_group_1 = df['Person'].isin(group_1)
df_1 = df[in_group_1]
df_2 = df[~in_group_1]

In [20]:
# Display the distinct subject ids that ended up in df_2
df_2['Person'].unique()

array([43, 48, 52, 59, 64], dtype=int64)

In [21]:
# Allocate Group 1: move the images listed in df_1 into <AUGMENTED_DATA>/group_1.
# allocate_group() creates the target directory itself when it is missing,
# so no separate isdir/mkdir check is needed here.
group_1_path = os.path.join(AUGMENTED_DATA, "group_1")
allocate_group(df_1, group_1_path)

In [22]:
# Allocate Group 2: move the images listed in df_2 into <AUGMENTED_DATA>/group_2.
# allocate_group() creates the target directory itself when it is missing,
# so no separate isdir/mkdir check is needed here.
group_2_path = os.path.join(AUGMENTED_DATA, "group_2")
allocate_group(df_2, group_2_path)