In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from baseline import Baseline
from system_t import System_T

# Load AdultCensus Dataset

In [2]:
def adult_custom_preprocessing(df):
    """Coarsen the ``race`` and ``country`` columns of an Adult Census frame.

    ``race`` is reduced to "White", "Black" or "Other"; ``country`` is reduced
    to "US" (exactly "United-States") or "Non-US" (anything else).

    The frame is modified in place and the same object is returned.
    """
    def _coarse_race(value):
        # Only the two named groups are kept; every other value is pooled.
        if value == "White":
            return "White"
        return "Black" if value == "Black" else "Other"

    def _coarse_country(value):
        return "US" if value == "United-States" else "Non-US"

    recoders = (("race", _coarse_race), ("country", _coarse_country))
    for column, recode in recoders:
        df[column] = df[column].apply(recode)

    return df


def transform(train_data, test_data):
    """Turn the raw train/test frames into aligned model-ready feature frames.

    Steps: drop ``fnlwgt``, coarsen race/country, scale numeric columns and
    one-hot encode categorical columns (the scaler is fitted on the training
    frame only), then give the test features exactly the training columns.

    The caller's frames are NOT modified: the column drop returns new frames
    and the in-place recoding is applied to those copies. Previously the
    inputs were mutated, so calling this twice on the same frames either
    failed on the missing ``fnlwgt`` column or recoded ``country`` a second
    time (turning every value into "Non-US").

    Returns:
        (x_train, y_train, x_test, y_test)
    """
    drop_columns = ['fnlwgt']

    # drop() without inplace=True yields fresh frames, so the in-place
    # recoding done by adult_custom_preprocessing only touches these copies.
    train_data = adult_custom_preprocessing(train_data.drop(drop_columns, axis=1))
    test_data = adult_custom_preprocessing(test_data.drop(drop_columns, axis=1))

    # Fit the scaler on train (train=True), reuse it for test (train=False).
    x_train, y_train = data_preprocessing(train_data)
    x_test, y_test = data_preprocessing(test_data, False)

    # Dummy columns for categories that never occur in the test split are
    # added as all-zero columns.
    missing_cols = set(x_train.columns) - set(x_test.columns)
    for column in missing_cols:
        x_test[column] = 0
    # Selecting by the training columns also fixes the column order and drops
    # any dummy column that exists only in the test split.
    x_test = x_test[x_train.columns]

    return x_train, y_train, x_test, y_test

def data_preprocessing(data, train=True, scaler=None):
    """Split a frame into scaled/one-hot encoded features and a binary target.

    Args:
        data: frame with a ``target`` column; it is copied, not modified.
        train: when True the scaler is fitted on the numeric columns
            (``fit_transform``); otherwise the already-fitted scaler is
            applied (``transform``).
        scaler: object exposing ``fit_transform``/``transform``. When omitted,
            the notebook-level ``scalar`` object is used, so existing
            two-argument calls behave as before.

    Returns:
        (x_data, y_data): ``x_data`` holds the scaled non-object columns
        followed by the dummy-encoded object columns; ``y_data`` is 0 for
        '<=50K' / '<=50K.' and 1 for every other target value.
    """
    if scaler is None:
        # Fall back to the module-level scaler defined in the notebook.
        scaler = scalar

    data_copy = data.copy()
    data_copy["target"] = data_copy["target"].apply(
        lambda x: 0 if (x == '<=50K' or x == '<=50K.') else 1
    )
    x_data = data_copy.drop('target', axis=1)
    y_data = data_copy["target"]

    num_data = x_data.select_dtypes(exclude='object')
    cat_data = x_data.select_dtypes(include='object')

    scaled = scaler.fit_transform(num_data) if train else scaler.transform(num_data)
    # Keep the original row index: the dummy frame below retains it, and
    # concat(axis=1) aligns on the index. Rebuilding with a default RangeIndex
    # would misalign rows (and introduce NaNs) for any non-default index.
    num_data = pd.DataFrame(scaled, columns=num_data.columns, index=num_data.index)
    cat_data = pd.get_dummies(cat_data)

    x_data = pd.concat([num_data, cat_data], axis=1)
    return x_data, y_data


# Directory holding the Adult Census files (schema, train.data, test.data).
# Kept in one place so the notebook can be pointed at another location.
DATA_DIR = "/home/kihyun/system/adult/data"

# Read the column names, one per line; the context manager closes the file.
with open("%s/schema" % DATA_DIR) as schema_file:
    schema = schema_file.read().splitlines()
print(schema)

# The separator is a regular expression, which pandas only supports with the
# python parsing engine; naming the engine avoids the silent fallback.
train_data = pd.read_csv("%s/train.data" % DATA_DIR, sep=r'[,\t ]+', header=None,
                         names=schema, na_values='?', engine='python')
test_data = pd.read_csv("%s/test.data" % DATA_DIR, sep=r'[,\t ]+', header=None,
                        names=schema, na_values='?', engine='python')

# Rows containing '?' (parsed as NaN) are discarded. The index is reset so
# both frames carry a default 0..n-1 index.
train_data = train_data.dropna().reset_index(drop=True)
test_data = test_data.dropna().reset_index(drop=True)

# Module-level scaler used by data_preprocessing: fitted on the training
# frame, then reused for the test frame inside transform().
scalar = StandardScaler()
x_train, y_train, x_test, y_test = transform(train_data, test_data)

# Train and test are pooled; the slices below are drawn from the pooled set.
x_data = pd.concat([x_train, x_test])
y_data = pd.concat([y_train, y_test])

['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target']


# Basic setting: all slices start with the same amount of data

In [3]:
def shuffle(data, label):
    """Return ``data`` and ``label`` reordered by one shared random permutation.

    Both arguments are indexed with the same integer index array, so row i of
    the returned data still corresponds to entry i of the returned label.
    Randomness comes from NumPy's global random state.
    """
    order = np.arange(len(data))
    np.random.shuffle(order)
    return data[order], label[order]

# Per-slice bookkeeping, filled in slice order (gender outer, race inner).
initial_data_array = []   # number of initial training rows taken per slice
val_data_dict = []        # per-slice (features, one-hot labels) validation split
add_data_dict = []        # per-slice (features, one-hot labels) remaining rows

val_data_num = 500

feature_index = 0
gender_list = ["sex_Female", "sex_Male"]
race_list = ["race_White", "race_Black"]

# Pieces of the pooled train/validation sets, concatenated once after the loop.
train_x_parts, train_y_parts = [], []
val_x_parts, val_y_parts = [], []

for gender in gender_list:
    for race in race_list:
        data_num = 150
        initial_data_array.append(data_num)

        # Rows belonging to this (gender, race) slice.
        in_slice = (x_data[gender] == 1) & (x_data[race] == 1)
        temp_x, temp_y = x_data[in_slice], y_data[in_slice]

        # Positional layout inside a slice:
        # [0, data_num) train | [data_num, val_end) validation | rest: additional rows.
        val_end = data_num + val_data_num

        val_data_dict.append((temp_x[data_num:val_end].to_numpy(),
                              tf.keras.utils.to_categorical(temp_y[data_num:val_end])))
        add_data_dict.append((temp_x[val_end:].to_numpy(),
                              tf.keras.utils.to_categorical(temp_y[val_end:])))

        train_x_parts.append(temp_x[:data_num])
        train_y_parts.append(temp_y[:data_num])
        val_x_parts.append(temp_x[data_num:val_end])
        val_y_parts.append(temp_y[data_num:val_end])

        feature_index += 1

train_data = pd.concat(train_x_parts)
train_label = pd.concat(train_y_parts)
val_data = pd.concat(val_x_parts)
val_label = pd.concat(val_y_parts)

num_label = len(np.unique(train_label))
num_class = feature_index
print("Number of slices : %d, %d" % (num_class, num_label))

# Convert the pooled sets to arrays with one-hot labels.
train_data = train_data.to_numpy()
train_label = tf.keras.utils.to_categorical(train_label)

val_data = val_data.to_numpy()
val_label = tf.keras.utils.to_categorical(val_label)

train_data, train_label = shuffle(train_data, train_label)

Number of slices : 4, 2


# Define slices

In [4]:
# Human-readable slice names, listed in the same order the slices were built
# (gender outer loop, race inner loop).
slice_desc = []
a = ["White-Female", "Black-Female", "White-Male", "Black-Male"]

for i in range(num_class):
    slice_name = a[i]
    slice_desc.append('Slice: %s' % (slice_name))
    print('Slice: %s, Initial size: %s' % (slice_name, initial_data_array[i]))

# For each slice, record the column positions of its gender and race
# indicator features within the feature matrix.
feature_list = np.array(x_data.columns)
slice_index = []
for gender in gender_list:
    ind1 = np.where(feature_list == gender)[0][0]
    for race in race_list:
        ind2 = np.where(feature_list == race)[0][0]
        slice_index.append([ind1, ind2])

Slice: White-Female, Initial size: 150
Slice: Black-Female, Initial size: 150
Slice: White-Male, Initial size: 150
Slice: Black-Male, Initial size: 150


# Original (no data acquisition)

In [5]:
# One cost entry per slice, all equal to 1.
cost_func = [1 for _ in range(num_class)]
lr = 0.001

# Baseline evaluated with budget=0, i.e. on the initial training data only.
ori = Baseline(
    (train_data, train_label),
    (val_data, val_label),
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    slice_index,
    add_data_dict,
    method='Uniform',
)
ori.performance(
    budget=0,
    cost_func=cost_func,
    num_iter=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
)

Method: Uniform, Budget: 0
[0 0 0 0]
Loss: 0.26328 (0.00201), Average EER: 0.10336 (0.00163), Max EER: 0.16613 (0.00386)



# System T Demo on AdultCensus

## Use 300 budget, lambda=0.1, "Moderate" strategy

In [6]:
budget = 300
method = 'Moderate'

# Same train/validation/additional-data inputs as the baseline runs.
st = System_T(
    (train_data, train_label),
    (val_data, val_label),
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    slice_index,
    add_data_dict,
)
st.selective_collect(
    budget=budget,
    k=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
    cost_func=cost_func,
    Lambda=0.1,
    num_iter=5,
    slice_desc=slice_desc,
    strategy=method,
    show_figure=False,
)

[  0  75 131  94]
Total Cost: 300, Remaining Budget: 0

[  0.  75. 131.  94.]
Number of iteration: 1
Strategy: Moderate, C: 0.1, Budget: 300
Loss: 0.25341 (0.00100), Average EER: 0.09448 (0.00052), Max EER: 0.14806 (0.00081)



# Baseline: Uniform ( = Water filling )

## In the basic setting, the Uniform method is equivalent to the Water filling method

In [7]:
budget = 300

# Uniform baseline given the same budget as the System_T run above it.
uni = Baseline(
    (train_data, train_label),
    (val_data, val_label),
    val_data_dict,
    initial_data_array,
    num_class,
    num_label,
    slice_index,
    add_data_dict,
    method='Uniform',
)
uni.performance(
    budget=budget,
    cost_func=cost_func,
    num_iter=10,
    batch_size=32,
    lr=lr,
    epochs=2000,
)

Method: Uniform, Budget: 300
[75 75 75 75]
Loss: 0.25707 (0.00205), Average EER: 0.09862 (0.00086), Max EER: 0.15513 (0.00164)



# Summary of results

<!-- <div align="center"> -->
    
|<font size="5">Method</font>  | <font size="5"> Loss </font>| <font size="5"> Avg.EER </font>| 
|:---------------------:|:---------------------:|:---------------------:| 
|<font size=5> Original </font> | <font size=5> 0.26328  (&pm; 0.00101) </font> | <font size=5> 0.10336 (&pm; 0.00081)</font>|
|<font size=5> Uniform </font> | <font size=5> 0.25707  (&pm; 0.00103) </font> | <font size=5> 0.09862 (&pm; 0.00043)</font>|
|<font size=5> Water filling </font> | <font size=5> 0.25707  (&pm; 0.00103) </font> | <font size=5> 0.09862 (&pm; 0.00043)</font>|
|<font size=5> Moderate (ours) </font> | <font size=5> 0.25341  (&pm; 0.00050) </font> | <font size=5> 0.09448 (&pm; 0.00026)</font>|

<!-- </div> -->