In [26]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import cv2
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
from keras import Model
from tensorflow.keras.layers import Input,Activation,Add, MaxPooling2D, MaxPooling1D, Flatten, Dense, Conv1D, Dropout
from tensorflow import keras
from tensorflow.keras import layers, models
from skimage import color
from skimage import filters
import tensorflow as tf

In [27]:
def robertsSobelEdges(imagesArray):
    """
    returns two numpy arrays with pictures in black and white and with the edges define using Roberts filter
    on the first one and the Sobel filter on the second one. The images are in black and white
    """
    sobel_ = []
    for image in imagesArray:
        grayImg = color.rgb2gray(image)
        edge_sobel = filters.sobel(grayImg)
        sobel_.append(edge_sobel)
    return np.array(sobel_)

In [28]:
# Choose the label type
Labeltype = 'age'

In [29]:
# Import image labels and paths to dataframe
mypath = "../data/raw/CroppedImages/"
filenames = np.array([f for f in listdir(mypath) if isfile(join(mypath, f))])
splitcolumns = [x.split('_')[0:3] + [mypath + x] for x in filenames if x.count('_') == 3]
filecolumns = ['age','gender','race','file']
filedf = pd.DataFrame(data = splitcolumns, columns = filecolumns).astype({'age': 'int', 'gender': 'int', 'race': 'int'})

# Convert age to appropriate labels
filedf['age']=np.where((filedf.age<3), 0, filedf.age)
filedf['age']=np.where(((filedf.age>=3) & (filedf.age<6)), 1, filedf.age)
filedf['age']=np.where(((filedf.age>=6) & (filedf.age<9)), 2, filedf.age)
filedf['age']=np.where(((filedf.age>=9) & (filedf.age<12)), 3, filedf.age)
filedf['age']=np.where(((filedf.age>=12) & (filedf.age<21)), 4, filedf.age)
filedf['age']=np.where(((filedf.age>=21) & (filedf.age<36)), 5, filedf.age)
filedf['age']=np.where(((filedf.age>=36) & (filedf.age<51)), 6, filedf.age)
filedf['age']=np.where(((filedf.age>=51) & (filedf.age<80)), 7, filedf.age)
filedf['age']=np.where((filedf.age>=80), 8, filedf.age)

dfx = filedf.loc[:,'file']
dfy = filedf.loc[:,Labeltype]

# Get Train/Test
X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.2, random_state=42)

# Get Train/Test/Validation dataset 0.25 * 8 = 0.2
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Encode training labels and initialize random over sampler
yencoded = LabelEncoder().fit_transform(y_train_2)
oversample = RandomOverSampler()

# Over sample until training classes are balanced
X_train_balanced, y_train_balanced = oversample.fit_resample(np.array(X_train_2).reshape(-1,1), yencoded)

# Of all classes, proportionally sample 10000
dxy = {'file': X_train_balanced.reshape(len(X_train_balanced)), 'label': y_train_balanced}
dfbalanced = pd.DataFrame(data = dxy)
balancedsamples = int(10000/len(set(dfbalanced.loc[:,'label'])))
dfbalancedsubset = dfbalanced.groupby('label', group_keys=False).apply(lambda x: x.sample(balancedsamples)).sample(frac=1)

# Count and print balanced classes
counter = Counter(y_train_balanced)
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

# Count and print balanced classes subsets (Total should be 10000)
counter = Counter(dfbalancedsubset.loc[:,'label'])
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=5, n=6093 (11.111%)
Class=2, n=6093 (11.111%)
Class=6, n=6093 (11.111%)
Class=0, n=6093 (11.111%)
Class=8, n=6093 (11.111%)
Class=7, n=6093 (11.111%)
Class=3, n=6093 (11.111%)
Class=4, n=6093 (11.111%)
Class=1, n=6093 (11.111%)
Class=8, n=1111 (2.026%)
Class=0, n=1111 (2.026%)
Class=6, n=1111 (2.026%)
Class=7, n=1111 (2.026%)
Class=2, n=1111 (2.026%)
Class=3, n=1111 (2.026%)
Class=5, n=1111 (2.026%)
Class=1, n=1111 (2.026%)
Class=4, n=1111 (2.026%)


In [30]:
# Get training images
train_img_container = []
for i in dfbalancedsubset.loc[:,'file']:
    img = cv2.imread(i)
    train_img_container.append(img)
train_img_container = np.array(train_img_container)

# Get validation images
validation_img_container = []
for i in X_val:
    img = cv2.imread(i)
    validation_img_container.append(img)
validation_img_container = np.array(validation_img_container)

# Get testing images
test_img_container = []
for i in X_test:
    img = cv2.imread(i)
    test_img_container.append(img)
test_img_container = np.array(test_img_container)

# Convert images to sobel edges
train_img_container = robertsSobelEdges(train_img_container)
validation_img_container = robertsSobelEdges(validation_img_container)
test_img_container = robertsSobelEdges(test_img_container)

In [31]:
# Expand array dimensions to 4D so that they may be used as inputs
X_train_expand = tf.expand_dims(train_img_container, axis=-1)
X_val_expand = tf.expand_dims(validation_img_container, axis=-1)
X_test_expand = tf.expand_dims(test_img_container, axis=-1)

In [32]:
# Pick number of final nodes based on label picked
Nlabels = -1
if(Labeltype == 'age'):
    Nlabels = 9
if(Labeltype == 'gender'):
    Nlabels = 2
if(Labeltype == 'race'):
    Nlabels = 5

# Model creation
CNNmodel = models.Sequential()
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train_new[0].shape))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Conv2D(32, (3, 3), activation='relu'))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Flatten())
CNNmodel.add(layers.Dense(64, activation='relu'))
CNNmodel.add(layers.Dense(Nlabels))
CNNmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [33]:
# Validation training and testing
val_history = CNNmodel.fit(X_train_expand, dfbalancedsubset.loc[:,'label'], epochs=10, validation_data=(X_val_expand, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Get validation accuracy and confusion matrix
val_accuracy = val_history.history['val_accuracy']
ypred = CNNmodel.predict(X_val_expand)
valconmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_val)
print(val_accuracy)
print(valconmatrix)

[0.3516135811805725, 0.39675173163414, 0.4944104552268982, 0.4184771180152893, 0.4062434136867523, 0.4589748978614807, 0.40202489495277405, 0.4351402521133423, 0.4663573205471039, 0.4528580605983734]
[[212  33   4   2   3   6   1   3   1]
 [ 76  85  33  18  18  30  11   7   1]
 [  6  14  14  16  14  23   6   3   0]
 [  4  13  22  21  28  50  18  10   3]
 [  2   5  11  23 107 339  48  19   1]
 [  5   5   7   8 113 997 213  93   4]
 [  2   2   4   3  42 505 345 249  20]
 [  0   0   0   1   8 107 143 296  45]
 [  1   0   0   0   3  16  21  49  70]]


In [39]:
# Get test accuracy and confusion matrix
ypred = CNNmodel.predict(X_test_expand)
testconmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_test)
test_accuracy = accuracy_score(y_test, np.argmax(ypred, axis=-1))
print(test_accuracy)
print(testconmatrix)

0.46150601139000214
[[224  27   2   0   3   7   1   2   0]
 [ 97  86  33   9  19  32   9   5   1]
 [  2  13  33  18  23  27   8   7   2]
 [  0   7  13  15  29  73   7  12   2]
 [  1  14  19  16 102 347  59  37   4]
 [  2   1   4   4  91 973 200  94   1]
 [  1   2   3   2  23 484 368 247  12]
 [  1   0   0   0  14 110 124 331  54]
 [  0   0   0   0   0  11  17  64  56]]


In [22]:
#Simplifies adding a layer
def Convolution(input_tensor,filters):

    x = layers.Conv2D(filters=filters,kernel_size=(3, 3),padding = 'same', activation = 'relu')(input_tensor)

    return x

def Convolution1D(input_tensor,filters):

    x = layers.Conv1D(filters=filters,kernel_size=(3),padding = 'same', activation = 'relu')(input_tensor)

    return x

#Simple CNN model
#Conv stands for neural network layers of (x,y). x standing for the input layers and y standing for how many filter layers there are
#maxp is a pooling layer that reduces the number of nodes of the next layer
#flatten changes the previous layer to a 1 dimensional layer
#dense layers have each node connected to every node in the previous layer. In this case, we have 3 dense layers of differing weights representing
#age, gender, and race. While all previous layers have collected features, dense layers calculate based on these collected features.
#They then send their calculations to the final dense layer to determine classification
#Binary Crossentropy = Classifies based on a binary value
#Sparse Categorical Crossentropy = Classifies based on an integer value with more than 2 possible values
def Simplemodel(input_shape,column):
  losstype = ""
  outputn = 0
  if(column == 'age'):
    losstype = "sparse_categorical_crossentropy"
    outputn = 9
  if(column == 'gender'):
    losstype = "binary_crossentropy"
    outputn = 1
  if(column == 'race'):
    losstype = "sparse_categorical_crossentropy"
    outputn = 5


  inputs = Input((input_shape))

  conv_1= Convolution(inputs,32)

  maxp_1 = MaxPooling2D(pool_size = (2,2)) (conv_1)

  conv_2 = Convolution(maxp_1,64)

  maxp_2 = MaxPooling2D(pool_size = (2, 2)) (conv_2)

  conv_3 = Convolution(maxp_2,64)

  maxp_3 = MaxPooling2D(pool_size = (2, 2)) (conv_3)

  conv_4 = Convolution(maxp_3,64)

  maxp_4 = MaxPooling2D(pool_size = (2, 2)) (conv_4)

  conv_5 = Convolution(maxp_4,64)

  flatten= Flatten() (conv_5)

  dense_1= Dense(64,activation='relu')(flatten)

  output_1= Dense(outputn,activation="sigmoid",name='out')(dense_1)

  model = Model(inputs=[inputs], outputs=[output_1])

  model.compile(loss=[losstype], optimizer="Adam",

  metrics=["accuracy"])

  return model

In [53]:
#Import image labels and paths to dataframe
mypath = "../data/raw/CroppedImages/"
filenames = np.array([f for f in listdir(mypath) if isfile(join(mypath, f))])
splitcolumns = [x.split('_')[0:3] + [mypath + x] for x in filenames if x.count('_') == 3]
filecolumns = ['age','gender','race','file']
filedf = pd.DataFrame(data = splitcolumns, columns = filecolumns).astype({'age': 'int', 'gender': 'int', 'race': 'int'})

filedf['age']=np.where((filedf.age<3), 0, filedf.age)
filedf['age']=np.where(((filedf.age>=3) & (filedf.age<6)), 1, filedf.age)
filedf['age']=np.where(((filedf.age>=6) & (filedf.age<9)), 2, filedf.age)
filedf['age']=np.where(((filedf.age>=9) & (filedf.age<12)), 3, filedf.age)
filedf['age']=np.where(((filedf.age>=12) & (filedf.age<21)), 4, filedf.age)
filedf['age']=np.where(((filedf.age>=21) & (filedf.age<36)), 5, filedf.age)
filedf['age']=np.where(((filedf.age>=36) & (filedf.age<51)), 6, filedf.age)
filedf['age']=np.where(((filedf.age>=51) & (filedf.age<80)), 7, filedf.age)
filedf['age']=np.where((filedf.age>=80), 8, filedf.age)

dfx = filedf.loc[:,'file']
dfy = filedf.loc[:,'race']

X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.33, random_state=42)

yencoded = LabelEncoder().fit_transform(y_train)
oversample = RandomOverSampler()
X_train_balanced, y_train_balanced = oversample.fit_resample(np.array(X_train).reshape(-1,1), yencoded)

counter = Counter(y_train_balanced)
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

y_train_balanced.shape

dxy = {'file': X_train_balanced.reshape(len(X_train_balanced)), 'label': y_train_balanced}
dfbalanced = pd.DataFrame(data = dxy)
balancedsamples = int(10000/len(set(dfbalancedsubset.loc[:,'label'])))
dfbalancedsubset = dfbalanced.groupby('label', group_keys=False).apply(lambda x: x.sample(balancedsamples)).sample(frac=1)

counter = Counter(dfbalancedsubset.loc[:,'label'])
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=6807 (20.000%)
Class=1, n=6807 (20.000%)
Class=4, n=6807 (20.000%)
Class=2, n=6807 (20.000%)
Class=3, n=6807 (20.000%)
Class=4, n=2000 (5.876%)
Class=1, n=2000 (5.876%)
Class=0, n=2000 (5.876%)
Class=3, n=2000 (5.876%)
Class=2, n=2000 (5.876%)


In [54]:
balancedsamples

2000

In [55]:
dfbalancedsubset.head()

Unnamed: 0,file,label
33330,../data/raw/CroppedImages/25_0_4_2017011715163...,4
18768,../data/raw/CroppedImages/52_0_1_2017011716101...,1
1680,../data/raw/CroppedImages/3_0_0_20170110212559...,0
33315,../data/raw/CroppedImages/7_1_4_20161223225914...,4
27171,../data/raw/CroppedImages/26_1_3_2017011919225...,3


In [56]:
train_img_container = []
for i in dfbalancedsubset.loc[:,'file']:
    img = cv2.imread(i)
    train_img_container.append(img)
train_img_container = np.array(train_img_container)
test_img_container = []
for i in X_test:
    img = cv2.imread(i)
    test_img_container.append(img)
test_img_container = np.array(test_img_container)
train_img_container = robertsSobelEdges(train_img_container)
test_img_container = robertsSobelEdges(test_img_container)

In [57]:
X_train_new = tf.expand_dims(train_img_container, axis=-1)
X_test_new = tf.expand_dims(test_img_container, axis=-1)
Modelsimple=Simplemodel(X_train_new[0].shape,'race')

In [58]:
History = Modelsimple.fit(X_train_new,dfbalancedsubset.loc[:,'label'],validation_data=(X_test_new,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
val_accuracy = History.history['val_accuracy']
ypred = Modelsimple.predict(X_test_new)
ypredmod = np.round(ypred)
#Use if label is age or race
conmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_test)
#Use if label is gender
#conmatrix = confusion_matrix(np.round(ypred),y_test)

In [60]:
print(val_accuracy)
print(conmatrix)

[0.5513229966163635, 0.5869870781898499, 0.6234181523323059, 0.6202224493026733, 0.6404193043708801, 0.6326217651367188, 0.6208615899085999, 0.629298210144043, 0.6477054953575134, 0.616259753704071]
[[1982  106  125  191  149]
 [ 295 1129  101  165   95]
 [ 238   60  771   64   48]
 [ 362  113   63  760  127]
 [ 394   96   65  152  172]]


In [6]:
dfx = filedf.loc[:,'file']
dfyage = filedf.loc[:,'gender']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(dfx, dfyage, test_size=0.33, random_state=42)

In [8]:
yencoded = LabelEncoder().fit_transform(y_train)
oversample = RandomOverSampler()
X_train_balanced, y_train_balanced = oversample.fit_resample(np.array(X_train).reshape(-1,1), yencoded)

In [9]:
counter = Counter(y_train_balanced)
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=1, n=8307 (50.000%)
Class=0, n=8307 (50.000%)


In [10]:
y_train_balanced.shape

(16614,)

In [11]:
dxy = {'file': X_train_balanced.reshape(len(X_train_balanced)), 'label': y_train_balanced}
dfbalanced = pd.DataFrame(data = dxy)

In [18]:
dfbalancedsubset = dfbalanced.groupby('label', group_keys=False).apply(lambda x: x.sample(4500))

In [19]:
counter = Counter(dfbalancedsubset.loc[:,'label'])
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=4500 (27.086%)
Class=1, n=4500 (27.086%)


In [20]:
train_img_container = []
for i in dfbalancedsubset.loc[:,'file']:
    img = cv2.imread(i)
    train_img_container.append(img)
train_img_container = np.array(train_img_container)
test_img_container = []
for i in X_test:
    img = cv2.imread(i)
    test_img_container.append(img)
test_img_container = np.array(test_img_container)
train_img_container = robertsSobelEdges(train_img_container)
test_img_container = robertsSobelEdges(test_img_container)

In [21]:
X_train_new = tf.expand_dims(train_img_container, axis=-1)
X_test_new = tf.expand_dims(test_img_container, axis=-1)
Modelsimple=Simplemodel(X_train_new[0].shape,'gender')

In [22]:
History = Modelsimple.fit(X_train_new,dfbalancedsubset.loc[:,'label'],validation_data=(X_test_new,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
val_accuracy = History.history['val_accuracy']
ypred = Modelsimple.predict(X_test_new)
conmatrix = confusion_matrix(np.round(ypred),y_test)

In [29]:
print(val_accuracy)
print(conmatrix)

[0.7682474851608276, 0.8223187923431396, 0.830883264541626, 0.8265371322631836, 0.8466061353683472, 0.8515914678573608, 0.8505688309669495, 0.843154788017273, 0.838169515132904, 0.8459669947624207]
[[3577  698]
 [ 507 3041]]


In [32]:
dfx = filedf.loc[:,'file']
dfyage = filedf.loc[:,'race']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(dfx, dfyage, test_size=0.33, random_state=42)

In [34]:
yencoded = LabelEncoder().fit_transform(y_train)
oversample = RandomOverSampler()
X_train_balanced, y_train_balanced = oversample.fit_resample(np.array(X_train).reshape(-1,1), yencoded)

In [35]:
counter = Counter(y_train_balanced)
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=6807 (20.000%)
Class=1, n=6807 (20.000%)
Class=4, n=6807 (20.000%)
Class=2, n=6807 (20.000%)
Class=3, n=6807 (20.000%)


In [36]:
y_train_balanced.shape

(34035,)

In [37]:
dxy = {'file': X_train_balanced.reshape(len(X_train_balanced)), 'label': y_train_balanced}
dfbalanced = pd.DataFrame(data = dxy)

In [38]:
dfbalancedsubset = dfbalanced.groupby('label', group_keys=False).apply(lambda x: x.sample(1800))

In [39]:
counter = Counter(dfbalancedsubset.loc[:,'label'])
for k,v in counter.items():
    per = v / len(y_train_balanced) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=1800 (5.289%)
Class=1, n=1800 (5.289%)
Class=2, n=1800 (5.289%)
Class=3, n=1800 (5.289%)
Class=4, n=1800 (5.289%)


In [40]:
train_img_container = []
for i in dfbalancedsubset.loc[:,'file']:
    img = cv2.imread(i)
    train_img_container.append(img)
train_img_container = np.array(train_img_container)
test_img_container = []
for i in X_test:
    img = cv2.imread(i)
    test_img_container.append(img)
test_img_container = np.array(test_img_container)
train_img_container = robertsSobelEdges(train_img_container)
test_img_container = robertsSobelEdges(test_img_container)

In [41]:
X_train_new = tf.expand_dims(train_img_container, axis=-1)
X_test_new = tf.expand_dims(test_img_container, axis=-1)
Modelsimple=Simplemodel(X_train_new[0].shape,'race')

In [42]:
History = Modelsimple.fit(X_train_new,dfbalancedsubset.loc[:,'label'],validation_data=(X_test_new,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
val_accuracy = History.history['val_accuracy']
ypred = Modelsimple.predict(X_test_new)
conmatrix = confusion_matrix(np.argmax(ypred, axis=-1),y_test)

In [44]:
print(val_accuracy)
print(conmatrix)

[0.4747539162635803, 0.5826409459114075, 0.6163875460624695, 0.6073117852210999, 0.6117857694625854, 0.5986194610595703, 0.6032212972640991, 0.6043717265129089, 0.6168988943099976, 0.5923558473587036]
[[1884  141  135  170  141]
 [ 192  977   69  118   57]
 [ 223   75  745   57   40]
 [ 404  172   67  784  110]
 [ 568  139  109  203  243]]
