In [None]:
import numpy as np
np.random.seed(1)
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import os

In [2]:
#reading images' filenames
seed_SH = 22
path = os.path.abspath(os.getcwd())
imgset_0_path = path+'/White/*.jpg'
imgset_1_path = path+'/Black/*.jpg'
imgset_2_path = path+'/Asian/*.jpg'
imgset_3_path = path+'/Indian/*.jpg'
imgset_4_path = path+'/Others/*.jpg'


def list_image_files(file_pattern, seed):
    """Return a dataset of the files matching `file_pattern` in a fixed random order.

    `Dataset.list_files` shuffles with `reshuffle_each_iteration=True` by
    default, even when a seed is given, so every pass over the dataset yields
    a different order. Any later `take`/`skip` split would then select
    different files on every epoch. Listing without shuffling and shuffling
    once with `reshuffle_each_iteration=False` keeps the order identical on
    every pass while still being random and reproducible.

    Args:
        file_pattern: glob pattern of the files to list.
        seed: integer seed for the one-off shuffle.

    Returns:
        A `tf.data.Dataset` of file-name strings.
    """
    files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    # The buffer must hold every file name for the shuffle to be uniform.
    n_files = tf.data.experimental.cardinality(files)
    return files.shuffle(buffer_size=n_files, seed=seed, reshuffle_each_iteration=False)


filename_dataset0 = list_image_files(imgset_0_path, seed_SH)
filename_dataset1 = list_image_files(imgset_1_path, seed_SH)
filename_dataset2 = list_image_files(imgset_2_path, seed_SH)
filename_dataset3 = list_image_files(imgset_3_path, seed_SH)
filename_dataset4 = list_image_files(imgset_4_path, seed_SH)

In [None]:
#creating equal distribution dataset
# Every class is truncated to the size of the smallest class so that all five
# classes contribute the same number of files.
class_datasets = [filename_dataset0, filename_dataset1, filename_dataset2,
                  filename_dataset3, filename_dataset4]
index = min(sum(1 for _ in class_dataset) for class_dataset in class_datasets)
(filename_dataset0, filename_dataset1, filename_dataset2,
 filename_dataset3, filename_dataset4) = [class_dataset.take(index) for class_dataset in class_datasets]

filename_dataset = filename_dataset0
for class_dataset in (filename_dataset1, filename_dataset2, filename_dataset3, filename_dataset4):
    filename_dataset = filename_dataset.concatenate(class_dataset)

# The concatenation is ordered class by class, so the shuffle buffer has to
# cover the whole dataset; a smaller buffer would only mix neighbouring files
# and leave the result class-skewed. (max(..., 1): the buffer must be positive.)
total_files = index * len(class_datasets)
filename_dataset = filename_dataset.shuffle(buffer_size=max(total_files, 1), seed=42, reshuffle_each_iteration=False)

# number of files to work with as data
nr_files = int(index*5*0.6)

#get 60% of original dataset as dataset to work with
filename_dataset = filename_dataset.take(nr_files)

In [4]:
#read filename and return resized normalized image with label
def load_image(image_path):
    """Load one JPEG file and return it with the label encoded in its file name.

    The label is the third underscore-separated field of the file name,
    parsed as an int32.

    Args:
        image_path: scalar string tensor holding the path of a JPEG file.

    Returns:
        Tuple `(image, label)`: `image` is a float32 tensor of shape
        (224, 224, 3) scaled by its own maximum, `label` is an int32 scalar.
    """
    # Drop the directory part first: an underscore anywhere in the directory
    # path would otherwise shift the position of the label field.
    file_name = tf.strings.regex_replace(image_path, r'^.*[/\\]', '')
    label = tf.strings.split(file_name, sep='_')[2]
    label = tf.strings.to_number(label, out_type=tf.dtypes.int32)
    image = tf.io.read_file(image_path)
    # Force 3 channels so grayscale files still match the (224, 224, 3) model input.
    image = tf.image.decode_jpeg(image, channels=3)
    # resize (bilinear by default) already returns float32, no extra cast needed.
    image = tf.image.resize(image, [224, 224])
    # divide_no_nan yields 0 instead of NaN for an all-black image (max == 0).
    image = tf.math.divide_no_nan(image, tf.math.reduce_max(image))
    return (image, label)

# Decode the images in parallel; map keeps the element order by default, so
# the positions used by later take/skip splits are unchanged.
img_label = filename_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
#check distribution of this dataset
# Count the samples per label id (0..4); labels outside that range are ignored.
# Converting the label to a Python int avoids building a new tf.Variable for
# every comparison on every sample.
label_counts = [0] * 5
for _image, label in img_label:
    label_id = int(label)
    if 0 <= label_id < len(label_counts):
        label_counts[label_id] += 1
white, black, asian, indian, others = label_counts
# A standard deviation of 0 means the five classes are perfectly balanced.
np.std([white, black, asian, indian, others])
# print(white, black, asian,indian, others)

In [None]:
#split data into train, validation and test
# The first 80% of the samples are used for training + validation, the rest for testing.
train_val_count = tf.cast(tf.math.round(nr_files*0.8), tf.int64)
train0 = img_label.take(train_val_count)
test = img_label.skip(train_val_count)

# 75% of that first part is the training set, the remainder the validation set.
split = tf.cast(train_val_count, tf.float32)
val_split = tf.cast(tf.math.round(split * 0.75), tf.int64)
train = train0.take(val_split)
val = train0.skip(val_split)


def _batch_and_prefetch(dataset):
    """Group `dataset` into batches of 64 and prefetch with an autotuned buffer."""
    batched = dataset.batch(64)
    return batched.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


train, val, test = (_batch_and_prefetch(subset) for subset in (train, val, test))

In [7]:
#building model
# Frozen ImageNet-pretrained VGG16 convolutional base with a small trainable
# classification head on top.
base_model = tf.keras.applications.VGG16(input_shape=(224,224,3), include_top=False, weights='imagenet')
base_model.trainable = False
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(320, activation='relu')(x)
# The five classes are mutually exclusive and the model is trained with
# sparse_categorical_crossentropy, which expects one probability distribution
# per sample: that is softmax, not five independent sigmoids.
x = tf.keras.layers.Dense(5, activation='softmax')(x)
model = tf.keras.Model(inputs=base_model.input, outputs=x)

In [None]:
# compile, fit and evaluate model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# The callback only takes effect when it is handed to fit(); without it the
# training always runs for the full 200 epochs.
history = model.fit(train, validation_data=val, epochs=200, callbacks=[early_stopping], verbose=1)
# scores is [loss, accuracy], in the order given to compile().
scores = model.evaluate(val)
print("accuracy validation set: %.2f%%" % (scores[1]*100))

In [None]:
# get accuracy score of test set
# Labels and predictions are collected in the same pass over `test`, so every
# prediction is guaranteed to be paired with the label of the image it was
# computed from, regardless of the order the input pipeline yields on a
# second pass. It also means each image is decoded only once.
label_batches = []
result_batches = []
for image_batch, label_batch in test:
    label_batches.append(label_batch.numpy())
    result_batches.append(np.asarray(model.predict_on_batch(image_batch)))
labels = np.concatenate(label_batches)
results = np.concatenate(result_batches)

# The predicted class is the index of the highest output score.
pred_labels = tf.argmax(results, axis=1, output_type=tf.int32).numpy()

m = tf.keras.metrics.Accuracy()
m.update_state(labels, pred_labels)
acc_test = m.result().numpy()
print('accuracy test set: ', str(round(acc_test*100, 2)) + '%')

In [None]:
#plot loss function
# Training and validation loss per epoch, as recorded by model.fit().
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(val_loss) + 1)
for curve, line_format, curve_name in ((loss, 'r', 'Training loss'),
                                       (val_loss, 'b', 'Validation loss')):
    plt.plot(epochs, curve, line_format, label=curve_name)
# The explicit list overrides the per-line labels given above.
plt.legend(['Training Loss', 'Validation Loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

In [None]:
#distribution of categories in test set
unique, counts = np.unique(labels, return_counts=True)
test_distribution = dict(zip(unique, counts))

#distribution of categories predicted from the test set
unique, counts = np.unique(pred_labels, return_counts=True)
pred_distribution = dict(zip(unique, counts))

# A cell only displays its last expression, so both distributions are
# returned together instead of silently dropping the first one.
{'actual': test_distribution, 'predicted': pred_distribution}

In [None]:
# returns roc_auc score for each race category
def auc_score(pred_labels, labels, cat):
    """Return the one-vs-rest ROC AUC of category `cat`, or None if it is undefined.

    Predictions and true labels are both turned into 0/1 indicators for `cat`
    and passed to `roc_auc_score`. Because the indicators are hard labels
    rather than scores, the result reflects a single operating point.

    Args:
        pred_labels: sequence of predicted category ids.
        labels: sequence of true category ids, aligned with `pred_labels`.
        cat: category id to score.

    Returns:
        The ROC AUC as a float, or None (after printing a message) when `cat`
        was never predicted or when the true labels do not contain both
        members and non-members of `cat`.
    """
    # Accept plain lists as well: `list == cat` would be a single bool.
    pred_labels = np.asarray(pred_labels)
    labels = np.asarray(labels)
    if cat not in pred_labels:
        print('this category has not been predicted')
        return None
    true = np.where(labels == cat, 1, 0)
    # roc_auc_score raises when y_true holds fewer than two classes.
    if np.unique(true).size < 2:
        print('this category cannot be scored: the true labels hold only one class')
        return None
    pred = np.where(pred_labels == cat, 1, 0)
    return roc_auc_score(true, pred)

#get roc_auc scores for each category in list
# One entry per category id 0..4, in that order.
auc_scores = [auc_score(pred_labels, labels, category) for category in range(5)]
print('List of ROC AUC scores for White, Black, Asian, Indian and Others category', auc_scores)

In [None]:
# For each actual category, count how often every other category was wrongly
# predicted in its place. `comparison` pairs each prediction with its true
# label as (predicted, actual).
comparison = [(predicted, actual) for predicted, actual in zip(pred_labels, labels)]
def get_FN(comparison, cat):
    """Count which categories were predicted instead of `cat`.

    Args:
        comparison: iterable of (predicted, actual) pairs.
        cat: the actual category whose misclassifications are counted.

    Returns:
        Dict mapping each wrongly predicted category to the number of samples
        of actual category `cat` that received that prediction. Empty when
        every sample of `cat` was predicted correctly or none is present.
    """
    wrong_predictions = {}
    for predicted, actual in comparison:
        # Only samples that truly belong to `cat` but were predicted otherwise count.
        if actual != cat or predicted == cat:
            continue
        wrong_predictions[predicted] = wrong_predictions.get(predicted, 0) + 1
    return wrong_predictions

# One ('category <id>', 'FN', counts) entry per category id 0..4.
categories = []
for i in range(5):
    FN = get_FN(comparison, i)
    categories.append(('category '+str(i), 'FN', FN))
# Show the result; without a trailing expression the cell computes it silently.
categories