In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import tensorflow as tf 

from baseline import Baseline
from system_t import System_T

from tensorflow.keras.utils import to_categorical
import cv2
import os 

# Load UTKFace dataset

In [2]:
def ImgViewer(file_path):
    """Show a single image in a 5x5-inch matplotlib figure.

    Parameters
    ----------
    file_path : str or image array
        Either the path of an image file to be read with OpenCV, or an image
        that is already loaded (anything ``Axes.imshow`` accepts).

    Raises
    ------
    FileNotFoundError
        If ``file_path`` is a string and OpenCV cannot read an image from it.
    """
    # Imported locally: the notebook's import cell does not bring `plt` into scope.
    import matplotlib.pyplot as plt

    img = file_path
    if isinstance(img, str):
        img = cv2.imread(img)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("Could not read image: %s" % file_path)
        # OpenCV loads images as BGR; convert so the colours display correctly.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(img, interpolation='nearest')
    # Adjust the layout before showing; doing it afterwards has no visible effect.
    fig.tight_layout()
    plt.show()
    
# Directory holding the UTKFace images; file names look like "<age>_<gender>_<race>_<rest>".
utk_dir = "/home/kihyun/system/utkface/dataset/UTKFace"
file_name_list = os.listdir(utk_dir)

x_data = []
age_array = []
gender_array = []
race_array = []

unreadable_count = 0

for file_name in file_name_list:
    token = file_name.split('_')

    # Keep only names that split into exactly four '_'-separated fields.
    if len(token) != 4:
        continue

    # Parse the three numeric fields once; skip names whose fields are not integers
    # instead of aborting the whole load.
    try:
        age, gender, race = int(token[0]), int(token[1]), int(token[2])
    except ValueError:
        continue

    # Same selection as before: ages in [10, 90) and race codes below 4.
    if not (age >= 10 and age < 90 and race < 4):
        continue

    file_path = os.path.join(utk_dir, file_name)
    bgr = cv2.imread(file_path)
    if bgr is None:
        # cv2.imread returns None for missing/corrupt files; cvtColor would crash on it.
        unreadable_count += 1
        continue

    # BGR -> RGB, downscale to 32x32 and scale pixel values into [0, 1].
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (32, 32)) / 255.0

    x_data.append(img)
    age_array.append(age)
    gender_array.append(gender)
    race_array.append(race)

if unreadable_count:
    print("Skipped %d unreadable image file(s) in %s" % (unreadable_count, utk_dir))

In [3]:
# Convert the per-image Python lists collected by the loading cell into NumPy arrays.
x_data, age_array, gender_array, race_array = (
    np.array(values)
    for values in (x_data, age_array, gender_array, race_array)
)

# One slice id per sample, combining race and gender: slice = 2 * race + gender.
slice_array = gender_array + 2 * race_array

# Cell output: the distinct slice ids together with the number of samples in each.
np.unique(slice_array, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([4910, 3951, 2244, 2111, 1070, 1399, 2035, 1446]))

In [4]:
add_x_data = []
add_race_array = []
add_slice_array = []

# One folder per slice; the position in this list is the slice id
# (so slice id // 2 is the race label, matching slice = 2 * race + gender).
slice_folders = ["whitemale", "whitefemale", "blackmale", "blackfemale",
                 "asianmale", "asianfemale", "indianmale", "indianfemale"]

add_unreadable_count = 0

for race_num, race in enumerate(slice_folders):
    image_path = "/home/di_lab/crop_image/%s/" % race
    file_name_list = os.listdir(image_path)

    for file_name in file_name_list:
        file_path = image_path + file_name
        bgr = cv2.imread(file_path)
        if bgr is None:
            # cv2.imread returns None for anything it cannot decode (e.g. a stray
            # non-image file); skip it rather than crash in cvtColor.
            add_unreadable_count += 1
            continue

        # Same preprocessing as the main dataset: RGB, 32x32, values in [0, 1].
        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (32, 32)) / 255.0
        add_x_data.append(img)

        add_race_array.append(race_num // 2)
        add_slice_array.append(race_num)

if add_unreadable_count:
    print("Skipped %d unreadable image file(s) in the additional image folders" % add_unreadable_count)

In [5]:
# Turn the lists gathered for the additional images into NumPy arrays.
add_x_data, add_race_array, add_slice_array = map(
    np.array, (add_x_data, add_race_array, add_slice_array)
)

# Report how many additional images exist per slice id, then per race label.
for id_array in (add_slice_array, add_race_array):
    print(np.unique(id_array, return_counts=True))

(array([0, 1, 2, 3, 4, 5, 6, 7]), array([1009, 1069, 1000, 1035,  892,  947,  846,  912]))
(array([0, 1, 2, 3]), array([2078, 2035, 1839, 1758]))


# Basic setting: all slices have the same amount of data

In [6]:
def shuffle(data, label, race):
    """Reorder three parallel arrays with one shared random permutation.

    The permutation has ``len(data)`` entries and is drawn from NumPy's global
    random state, so rows that belonged together before the call still belong
    together afterwards.

    Parameters
    ----------
    data, label, race : numpy.ndarray
        Arrays indexed along their first axis by sample.

    Returns
    -------
    tuple
        ``(data, label, race)`` indexed by the same random order. The inputs
        themselves are not modified.
    """
    order = np.arange(len(data))
    np.random.shuffle(order)  # in-place shuffle of the index vector
    return data[order], label[order], race[order]

num_class = len(np.unique(slice_array))
num_label = len(np.unique(race_array))
print("Number of slices : %d, %d" % (num_class, num_label))

# Each dataset is a triple: (images, one-hot race label, one-hot slice id).
mixed_data = (x_data, to_categorical(race_array), to_categorical(slice_array))
mturk_data = (add_x_data, to_categorical(add_race_array), to_categorical(add_slice_array))

data_num = 500       # training samples taken from the front of every slice
val_data_num = 500   # validation samples taken right after the training samples

# Recover the integer slice id of every sample once, instead of once per use.
mixed_slice_ids = np.argmax(mixed_data[2], axis=1)
mturk_slice_ids = np.argmax(mturk_data[2], axis=1)

initial_data_array = []   # initial training size of each slice
val_data_dict = []        # per-slice validation triples (a list, indexed by slice id)
add_data_dict = []        # per-slice triples from the additional image pool
train_parts = []          # per-slice training triples, concatenated after the loop

for i in range(num_class):
    initial_data_array.append(data_num)

    # Select this slice's rows once per array and reuse the selection below.
    slice_rows = tuple(arr[mixed_slice_ids == i] for arr in mixed_data)

    train_parts.append(tuple(arr[:data_num] for arr in slice_rows))
    val_data_dict.append(tuple(arr[data_num:data_num + val_data_num] for arr in slice_rows))
    add_data_dict.append(tuple(arr[mturk_slice_ids == i] for arr in mturk_data))

# Stack the per-slice pieces in slice order with a single concatenate per array.
train_data, train_label, train_race = (
    np.concatenate(parts, axis=0) for parts in zip(*train_parts)
)
val_data, val_label, val_race = (
    np.concatenate(parts, axis=0) for parts in zip(*val_data_dict)
)

# Only the training set is shuffled; the validation set stays grouped by slice.
train_data, train_label, train_race = shuffle(train_data, train_label, train_race)
print(train_data.shape, train_label.shape, num_class)

Number of slices : 8, 4
(4000, 32, 32, 3) (4000, 4) 8


# Define slices

In [7]:
# Human-readable slice names, listed in slice-id order.
a = [
    "White-Male", "White-Female",
    "Black-Male", "Black-Female",
    "Asian-Male", "Asian-Female",
    "Indian-Male", "Indian-Female",
]

# Build one description string per slice and report each slice's initial size.
slice_desc = []
for slice_id in range(num_class):
    slice_name = a[slice_id]
    slice_desc.append('Slice: %s' % slice_name)
    print('Slice: %s, Initial size: %s' % (slice_name, initial_data_array[slice_id]))

Slice: White-Male, Initial size: 500
Slice: White-Female, Initial size: 500
Slice: Black-Male, Initial size: 500
Slice: Black-Female, Initial size: 500
Slice: Asian-Male, Initial size: 500
Slice: Asian-Female, Initial size: 500
Slice: Indian-Male, Initial size: 500
Slice: Indian-Female, Initial size: 500


# Original (with no data acquisition)

In [8]:
# One cost entry of 1 per slice, and the learning rate; both are reused by later cells.
cost_func = [1 for _ in range(num_class)]
lr = 1e-4

# Group the training and validation arrays the way the constructor receives them.
train_set = (train_data, train_label, train_race)
val_set = (val_data, val_label, val_race)

# Reference run with a budget of 0.
# NOTE(review): argument meanings follow the variable names; confirm against baseline.py.
ori = Baseline(
    train_set,
    val_set,
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    add_data_dict,
    method='Uniform',
)
ori.performance(
    budget=0,
    cost_func=cost_func,
    num_iter=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
)

Method: Uniform, Budget: 0
[0 0 0 0 0 0 0 0]
Loss: 0.56167 (0.00584), Average EER: 0.09069 (0.01049), Max EER: 0.16540 (0.02290)



# System T Demo on UTKFace

## Use a budget of 3000, lambda=0.1, and the "Moderate" strategy

In [9]:
# Settings for this run: total budget and the strategy name passed to System_T.
budget = 3000
method = 'Moderate'

# Group the training and validation arrays the way the constructor receives them.
train_set = (train_data, train_label, train_race)
val_set = (val_data, val_label, val_race)

# NOTE(review): argument meanings follow the variable names; confirm against system_t.py.
st = System_T(
    train_set,
    val_set,
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    add_data_dict,
)
st.selective_collect(
    budget=budget,
    k=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
    cost_func=cost_func,
    Lambda=0.1,
    num_iter=5,
    slice_desc=slice_desc,
    strategy=method,
    show_figure=False,
)

[456 338 197 525 450 198 474 361]
Total Cost: 2999, Remaining Budget: 1

[456. 338. 197. 525. 450. 198. 474. 361.]
Number of iteration: 1
Strategy: Moderate, C: 0.1, Budget: 3000
Loss: 0.54677 (0.00847), Average EER: 0.07053 (0.01092), Max EER: 0.13685 (0.02185)



# Baseline: Uniform (= Water filling)

## In the basic setting every slice starts with the same amount of data, so the Uniform method is equivalent to the Water filling method

In [10]:
# Same budget as the System T run, so the two results can be compared.
budget = 3000

# Group the training and validation arrays the way the constructor receives them.
train_set = (train_data, train_label, train_race)
val_set = (val_data, val_label, val_race)

# NOTE(review): argument meanings follow the variable names; confirm against baseline.py.
uni = Baseline(
    train_set,
    val_set,
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    add_data_dict,
    method='Uniform',
)
uni.performance(
    budget=budget,
    cost_func=cost_func,
    num_iter=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
)

Method: Uniform, Budget: 3000
[375 375 375 375 375 375 375 375]
Loss: 0.55422 (0.00955), Average EER: 0.08128 (0.01761), Max EER: 0.18255 (0.04550)



# Summary of results

| Method | Loss | Avg.EER |
|:---------------------:|:---------------------:|:---------------------:|
| Original | 0.56167 (± 0.00292) | 0.09069 (± 0.00524) |
| Uniform | 0.55422 (± 0.00477) | 0.08128 (± 0.00881) |
| Water filling | 0.55422 (± 0.00477) | 0.08128 (± 0.00881) |
| Moderate (ours) | 0.54677 (± 0.00424) | 0.07053 (± 0.00546) |