# Training an unbiased Adversarial Classifier
This notebook is an implementation of the adversarially-unbiased classifier idea. In particular, this recreates the experiment from "[Censoring Representations With an Adversary](https://arxiv.org/pdf/1511.05897.pdf)", Edwards and Storkey, ICLR 2016.

The experiment uses the UCI 'Adult' dataset, which contains a bunch of individual information (including sex). Our goal is to predict the label class, in this case whether or not the individual made more than $50k a year. Simultaneously, we will train our classifier to be blind to the individual's sex. 

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib
import os
logdir = '/tmp/debiased_classifier/'

# Download the 'Adult' dataset from the UCI dataset archive
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.'
traindata_url = url + 'data'
testdata_url = url + 'test'
trainfile = urllib.request.urlopen(traindata_url)
testfile = urllib.request.urlopen(testdata_url)

In [2]:
# These are a few utility functions for the tensorflow part of this implementation

def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True,
                include_final_partial_batch=True, repeat=False):
    assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    inds = np.arange(n)
    first_time = True
    while first_time or repeat:
        if shuffle: np.random.shuffle(inds)
        sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
        for batch_inds in np.array_split(inds, sections):
            if include_final_partial_batch or len(batch_inds) == batch_size:
                yield tuple(a[batch_inds] for a in arrays)
        first_time = False

def _weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.get_variable("W", initializer=initial)

def _bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.get_variable("b", initializer=initial)

def _linear(x, output_size, name):
    with tf.variable_scope(name):
        W = _weight_variable([x.get_shape().as_list()[1], output_size])
        b = _bias_variable([output_size])
        output = tf.matmul(x, W) + b
    return output

def lrelu(x, leak=0.2, name="lrelu"):
    return tf.maximum(x, leak*x, name=name)

In [3]:
# The category names for the dataset
names = ["age","workclass","fnlwgt","education","education-num",
         "marital-status","occupation","relationship","race","sex",
         "capital-gain","capital-loss", "hours-per-week",
         "native-country","income>50k"]

In [4]:
# Loading and processing the data.

print("Downloading data...")
raw_train_df = pd.read_csv(trainfile, names=names, index_col=False, comment='|')
raw_test_df = pd.read_csv(testfile, names=names, index_col=False, comment='|')
print("Data downloaded!")

# We discard 'sex' (which is our protected variable) and 'income>50k' (which is our predicted variable)
# 'fnlwgt' is a scalar relating to the demographic importance of the individual, so we discard it too
train_df = pd.get_dummies(raw_train_df.drop(['fnlwgt', 'sex', 'income>50k'], axis=1))
test_df = pd.get_dummies(raw_test_df.drop(['fnlwgt', 'sex', 'income>50k'], axis=1))
extra_columns = list(set(train_df.columns) - set(test_df.columns))
for c in extra_columns:
    test_df[c] = 0
    
train_data = train_df.values
train_labels, _ = raw_train_df['income>50k'].factorize()
train_protected, _ = raw_train_df['sex'].factorize() # male = 0, female = 1
test_data = test_df.values
test_labels, _ = raw_test_df['income>50k'].factorize()
test_protected, _ = raw_test_df['sex'].factorize() # male = 0, female = 1

Downloading data...
Data downloaded!


In [5]:
print("fraction of women in training set: {:0.4}".format(train_labels.mean()))
print("fraction of women in test set: {:0.4}".format(test_labels.mean()))

fraction of women in training set: 0.2408
fraction of women in test set: 0.2362


In [6]:
lamda = 1 # relative weight between "classifying correctly" and "blinding the adversary"
stepsize = 1e-4
N_EPOCHS = 100
batchsize = 64

In [7]:
# Set up the training graph

tf.reset_default_graph()
sw = tf.summary.FileWriter(logdir)

# inputs
data_input = tf.placeholder(tf.float32, shape=[None, train_data.shape[1]]) # individual data
protected_input = tf.placeholder(tf.int32, shape=[None]) # gender data
label_input = tf.placeholder(tf.int32, shape=[None]) # over/under 50k salary data

# First we create the primary classifier network (referred to as the "business")
# It has 3 fully-connected layers with a sigmoid output.
with tf.variable_scope("business") as scope:
    x = data_input
    x = lrelu(_linear(x, 128, "fc1"))
    x = lrelu(_linear(x, 128, "fc2"))
    embedding = x
    y_logits_ = tf.squeeze(_linear(embedding, 1, "fc3"), axis=1)
    y_ = tf.sigmoid(y_logits_)
    # The label loss is the cross-entropy classifiction loss for whether the individual was over/under 50k
    label_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(label_input, tf.float32),
                                                                        logits=y_logits_))
    # Label accuracy is the percent of samples on which the predicted likelier outcome is the correct outcome
    label_accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(tf.round(y_), tf.int32), 
                                                     label_input), tf.float32))

# We also create an adversary "regulator", whose goal is to predict the protected attribute
# if she can, she knows the business is biased
with tf.variable_scope("regulator") as scope:
    s_logits_ = tf.squeeze(_linear(embedding, 1, "fc3"), axis=1)
    s_ = tf.sigmoid(s_logits_)
    # protected loss is the cross-entropy classification loss for gender prediction
    protected_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(protected_input, tf.float32),
                                                                            logits=s_logits_))
    protected_accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(tf.round(s_), tf.int32),
                                                         protected_input), tf.float32))
# The business can only update the primary classifier's weights, 
# and the regulator can only update the protected-attribute classifier weights
business_weights  = [var for var in tf.global_variables() if var.name[:8] == "business"]
regulator_weights = [var for var in tf.global_variables() if var.name[:9] == "regulator"]

#reg_loss = tf.reduce_sum([tf.nn.l2_loss(w) for w in weights])
total_loss = label_loss - lamda*protected_loss
# The business minimizes its normal label-prediction loss, while also maximizing the regulator's prediction loss
train_business_op  = tf.train.AdamOptimizer(stepsize).minimize(total_loss, var_list=business_weights)
# The regulator only wishes to minimize its own prediction loss
train_regulator_op = tf.train.AdamOptimizer(stepsize).minimize(protected_loss, var_list=regulator_weights)

In [8]:
y_male, y_female = tf.dynamic_partition(data=y_, partitions=protected_input, num_partitions=2)
# The discrimination metric, as defined by Edwards and Storkey, is sort of a measure of the average difference
# in prediction rates across gender - which they call 'discrimination'. This is of course a lacking definition, but
# this notebook is just about reimplementing their paper.
discrimination_metric = tf.abs(tf.reduce_mean(y_male) - tf.reduce_mean(y_female))

# Because the two networks (business and regulator) are competing with each other, we have to carefully balance their training.
# If the regulator gets too good, it will just force the business to purge all information so the regulator sees nothing.
# If the business gets too good, the regulator will never get a chance to start identifying sensitive attributes, 
# leading to a sexist classifier
# In the paper, they propose sometimes training the regulator 
# (if the regulator's classification accuracy is low, below 0.9),
# and sometimes training the business (if the regulator's classification accuracy is high, above 0.6)
# These two variabels capture the pct time spent training each
train_business_pct = tf.Variable(0.0)
train_regulator_pct = tf.Variable(0.0)

# training losses
train_summary_ops = []
val_summary_ops = []
losses = [discrimination_metric, label_accuracy, label_loss, protected_accuracy, 
          protected_loss, train_business_pct, train_regulator_pct]
loss_names = ["discrimination_metric", "label_acc", "label_loss", "protected_acc", "protected_loss",
              "train_business_pct", "train_regulator_pct"]
for pair in zip(losses, loss_names):
    train_summary_ops.append(tf.summary.scalar(tensor=pair[0], name="training/" + pair[1]))
    val_summary_ops.append(tf.summary.scalar(tensor=pair[0], name="validation/" + pair[1]))
train_summary_op = tf.summary.merge(train_summary_ops)
val_summary_op = tf.summary.merge(val_summary_ops)

global_step = tf.Variable(0.0)

In [None]:
# Here we actually run the training
# We recommend visualizing the results in tensorboard
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    should_train_business  = True
    should_train_regulator = True
    print("Starting training")
    print("To log, call\ntensorboard --logdir={}".format(logdir))
    print("open http://0.0.0.0:6006")
    
    for epoch in range(N_EPOCHS):
        train_business_time = train_regulator_time = 0
        for i, batch in enumerate(iterbatches(arrays=[train_data, train_labels, train_protected], 
                                              batch_size=batchsize, shuffle=True)):
            # load each data batch into the network
            data_batch, label_batch, protected_batch = batch
            feed_dict = {
                data_input: data_batch,
                label_input: label_batch,
                protected_input: protected_batch,
            }
            train_ops = []
            
            if should_train_business: train_ops.append(train_business_op)
            if should_train_regulator: train_ops.append(train_regulator_op)
            train_business_time += should_train_business
            train_regulator_time += should_train_regulator
            update_ops = [tf.assign(train_business_pct,  train_business_time/(i+1)), 
                         tf.assign(train_regulator_pct, train_regulator_time/(i+1)),
                         tf.assign_add(global_step, data_batch.shape[0])]
            
            # Train the network on a minibatch and update the weights of the network, plus bookkeeping.
            s_acc, train_summary, gs, *_ = sess.run(
                [protected_accuracy, train_summary_op, global_step] + update_ops + train_ops,
                feed_dict=feed_dict)
            sw.add_summary(train_summary, gs)
            # Pick whether, in the next timestep, to train the business, regulator, or both
            should_train_business  = s_acc.mean() > 0.6
            should_train_regulator = s_acc.mean() < 0.9
        
        # Evaluate the network on a validation set
        valbatch = next(iterbatches(arrays=[test_data, test_labels, test_protected], 
                                    batch_size=512, shuffle=True))
        data_batch, label_batch, protected_batch = valbatch
        feed_dict = {
            data_input: data_batch,
            label_input: label_batch,
            protected_input: protected_batch,
        }
        val_summary = sess.run(val_summary_op, feed_dict=feed_dict)
        sw.add_summary(val_summary, gs)
        sw.flush()
        # And finally, save the model weights (they'll be in /tmp/debiased_classifier/, unless you changed the logdir)
        saver.save(sess, os.path.join(logdir, 'model.ckpt'))

Starting training
To log, call
 tensorboard --logdir=/tmp/debiased_classifier/
open http://0.0.0.0:6006
