In [7]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import cv2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
from keras import Model
from tensorflow.keras.layers import Input,Activation,Add, MaxPooling2D, MaxPooling1D, Flatten, Dense, Conv1D, Dropout
from tensorflow import keras
from tensorflow.keras import layers, models
from skimage import color
from skimage import filters
import tensorflow as tf

In [8]:
def robertsSobelEdges(imagesArray):
    """
    returns two numpy arrays with pictures in black and white and with the edges define using Roberts filter
    on the first one and the Sobel filter on the second one. The images are in black and white
    """
    sobel_ = []
    for image in imagesArray:
        grayImg = color.rgb2gray(image)
        edge_sobel = filters.sobel(grayImg)
        sobel_.append(edge_sobel)
    return np.array(sobel_)

In [9]:
# Choose the label type
Labeltype = 'age'

In [10]:
# Import image labels and paths to dataframe
mypath = "../data/raw/CroppedImages/"
filenames = np.array([f for f in listdir(mypath) if isfile(join(mypath, f))])
splitcolumns = [x.split('_')[0:3] + [mypath + x] for x in filenames if x.count('_') == 3]
filecolumns = ['age','gender','race','file']
filedf = pd.DataFrame(data = splitcolumns, columns = filecolumns).astype({'age': 'int', 'gender': 'int', 'race': 'int'})

# Convert age to appropriate labels
filedf['age']=np.where((filedf.age<3), 0, filedf.age)
filedf['age']=np.where(((filedf.age>=3) & (filedf.age<6)), 1, filedf.age)
filedf['age']=np.where(((filedf.age>=6) & (filedf.age<9)), 2, filedf.age)
filedf['age']=np.where(((filedf.age>=9) & (filedf.age<12)), 3, filedf.age)
filedf['age']=np.where(((filedf.age>=12) & (filedf.age<21)), 4, filedf.age)
filedf['age']=np.where(((filedf.age>=21) & (filedf.age<36)), 5, filedf.age)
filedf['age']=np.where(((filedf.age>=36) & (filedf.age<51)), 6, filedf.age)
filedf['age']=np.where(((filedf.age>=51) & (filedf.age<80)), 7, filedf.age)
filedf['age']=np.where((filedf.age>=80), 8, filedf.age)

dfx = filedf.loc[:,'file']
dfy = filedf.loc[:,Labeltype]

# Get Train/Test
X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.3, random_state=42)

# Get Train/Test/Validation dataset 0.25 * 8 = 0.2
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Encode training labels and initialize random over sampler
yencoded = LabelEncoder().fit_transform(y_train)
oversample = RandomOverSampler()

# Over sample until training classes are balanced
X_train_balanced, y_train_balanced = oversample.fit_resample(np.array(X_train).reshape(-1,1), yencoded)

# Of all classes, proportionally sample 10000
dxy = {'file': X_train_balanced.reshape(len(X_train_balanced)), 'label': y_train_balanced}
dfbalanced = pd.DataFrame(data = dxy)
balancedsamples = int(5000/len(set(dfbalanced.loc[:,'label'])))
dfbalancedsubset = dfbalanced.groupby('label', group_keys=False).apply(lambda x: x.sample(balancedsamples)).sample(frac=1)

# Count and print balanced classes
counter = Counter(y_train_balanced)
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

# Count and print balanced classes subsets (Total should be 10000)
counter = Counter(dfbalancedsubset.loc[:,'label'])
for k,v in counter.items():
    per = v / len(dfbalancedsubset.loc[:,'label']) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=5, n=7150 (11.111%)
Class=6, n=7150 (11.111%)
Class=4, n=7150 (11.111%)
Class=2, n=7150 (11.111%)
Class=7, n=7150 (11.111%)
Class=0, n=7150 (11.111%)
Class=8, n=7150 (11.111%)
Class=3, n=7150 (11.111%)
Class=1, n=7150 (11.111%)
Class=0, n=555 (11.111%)
Class=8, n=555 (11.111%)
Class=7, n=555 (11.111%)
Class=2, n=555 (11.111%)
Class=6, n=555 (11.111%)
Class=4, n=555 (11.111%)
Class=1, n=555 (11.111%)
Class=3, n=555 (11.111%)
Class=5, n=555 (11.111%)


In [14]:
# Get training images
train_img_container = []
for i in dfbalancedsubset.loc[:,'file']:
    img = cv2.imread(i)
    train_img_container.append(img)
train_img_container = np.array(train_img_container)

# Get validation images
validation_img_container = []
for i in X_val:
    img = cv2.imread(i)
    validation_img_container.append(img)
validation_img_container = np.array(validation_img_container)

# Get testing images
test_img_container = []
for i in X_test:
    img = cv2.imread(i)
    test_img_container.append(img)
test_img_container = np.array(test_img_container)

# Convert images to sobel edges
train_img_container = robertsSobelEdges(train_img_container)
validation_img_container = robertsSobelEdges(validation_img_container)
test_img_container = robertsSobelEdges(test_img_container)

In [12]:
# Expand array dimensions to 4D so that they may be used as inputs
X_train_expand = tf.expand_dims(train_img_container, axis=-1)
X_val_expand = tf.expand_dims(validation_img_container, axis=-1)
X_test_expand = tf.expand_dims(test_img_container, axis=-1)

In [16]:
X_train_expand.shape

TensorShape([4995, 200, 200, 1])

In [17]:
X_test_expand.shape

TensorShape([7112, 200, 200, 1])

In [15]:
# Pick number of final nodes based on label picked
Nlabels = -1
if(Labeltype == 'age'):
    Nlabels = 9
if(Labeltype == 'gender'):
    Nlabels = 2
if(Labeltype == 'race'):
    Nlabels = 5

# Model creation
CNNmodel = models.Sequential()
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train_expand[0].shape))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Flatten())
CNNmodel.add(layers.Dense(64, activation='relu'))
CNNmodel.add(layers.Dense(Nlabels))
CNNmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [18]:
# Validation training and testing
val_history = CNNmodel.fit(X_train_expand, dfbalancedsubset.loc[:,'label'], epochs=10, validation_data=(X_test_expand, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Get validation accuracy and confusion matrix
val_accuracy = val_history.history['val_accuracy']
ypred = CNNmodel.predict(X_val_expand)
valconmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_val)
print(val_accuracy)
print(valconmatrix)

[0.3516135811805725, 0.39675173163414, 0.4944104552268982, 0.4184771180152893, 0.4062434136867523, 0.4589748978614807, 0.40202489495277405, 0.4351402521133423, 0.4663573205471039, 0.4528580605983734]
[[212  33   4   2   3   6   1   3   1]
 [ 76  85  33  18  18  30  11   7   1]
 [  6  14  14  16  14  23   6   3   0]
 [  4  13  22  21  28  50  18  10   3]
 [  2   5  11  23 107 339  48  19   1]
 [  5   5   7   8 113 997 213  93   4]
 [  2   2   4   3  42 505 345 249  20]
 [  0   0   0   1   8 107 143 296  45]
 [  1   0   0   0   3  16  21  49  70]]


In [39]:
# Get test accuracy and confusion matrix
ypred = CNNmodel.predict(X_test_expand)
testconmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_test)
test_accuracy = accuracy_score(y_test, np.argmax(ypred, axis=-1))
print(test_accuracy)
print(testconmatrix)

0.46150601139000214
[[224  27   2   0   3   7   1   2   0]
 [ 97  86  33   9  19  32   9   5   1]
 [  2  13  33  18  23  27   8   7   2]
 [  0   7  13  15  29  73   7  12   2]
 [  1  14  19  16 102 347  59  37   4]
 [  2   1   4   4  91 973 200  94   1]
 [  1   2   3   2  23 484 368 247  12]
 [  1   0   0   0  14 110 124 331  54]
 [  0   0   0   0   0  11  17  64  56]]
