# Adult Dataset
This notebook trains a classifier on the Adult dataset using TensorFlow and computes self-influence scores for the training examples.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
import tensorflow_datasets as tfds
import os
import os.path
from os import path
import time
import datetime

In [None]:
# Training for many epochs is slow on CPU; lower EPOCHS or run on a GPU.
EPOCHS = 50
# Mini-batch size, used both for model.fit and for batching the self-influence pass.
BATCH_SIZE = 1000

## Load dataset

In [None]:
# load training, test and validation datasets
def _read_split(csv_path):
    """Read one one-hot encoded split and split off its 'salary' target column."""
    features = pd.read_csv(csv_path)
    target = features.pop('salary')
    return features, target


train_df, train_target_df = _read_split("data/train-one-hot.csv")
test_df, test_target_df = _read_split("data/test-one-hot.csv")
val_df, val_target_df = _read_split("data/val-one-hot.csv")

In [None]:
# preview the first five training rows
train_df.head(n=5)

## Define models and start training

In [None]:
def get_compiled_model(input_dim=63):
    """Build and compile the fully connected binary classifier.

    The network is three ReLU hidden layers (128, 64, 32 units) followed by a
    single linear output unit, i.e. the model emits a logit, not a probability.

    Args:
        input_dim: Number of input features per example. Defaults to 63, the
            width the notebook originally hard-coded.

    Returns:
        A compiled `tf.keras.Sequential` model using SGD (learning rate 0.01)
        and binary cross-entropy computed from logits.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(tf.keras.layers.Dense(128, activation="relu"))
    model.add(tf.keras.layers.Dense(64, activation="relu"))
    model.add(tf.keras.layers.Dense(32, activation="relu"))
    model.add(tf.keras.layers.Dense(1))

    # The output is a raw logit, so the decision boundary is logit == 0
    # (probability 0.5). The plain 'accuracy' string would threshold the logit
    # at 0.5 instead. The metric keeps the name "accuracy" so the
    # history keys ("accuracy" / "val_accuracy") stay the same.
    logit_accuracy = tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[logit_accuracy])

    return model

In [None]:
# checkpoint callback
# Path template for the per-epoch checkpoints; "{epoch:04d}" is a zero-padded
# epoch number (e.g. cp-0001).
checkpoint_path = "cp_training_adult/cp-{epoch:04d}"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that writes a checkpoint to `checkpoint_path`. `save_weights_only`
# and `save_freq` are left at their defaults, so the full model (not only the
# weights) is saved at the end of every epoch; the checkpoints are read back
# later with `tf.keras.models.load_model`.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 verbose=1)

In [None]:
# build a freshly initialised model and show its layer summary
model = get_compiled_model()
model.summary()

# store the untrained model as checkpoint 0, using the same naming template
initial_checkpoint = checkpoint_path.format(epoch=0)
model.save(initial_checkpoint)

### Training model

In [None]:
# stop after 20 epochs without improvement and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)

start = time.time()
history = model.fit(
    x=train_df.values,
    y=train_target_df.values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    shuffle=True,
    validation_data=(val_df.values, val_target_df.values),
    callbacks=[cp_callback, early_stopping],
)
end = time.time()

# wall-clock duration of the whole fit call
elapsed = datetime.timedelta(seconds=end - start)
print("Total time:", elapsed)

## Results
Summary of the model performance. The defined methods can be used to compare multiple models.

In [None]:
# model plotter
def plot_model_results(history, clr, i="_alt", axes=None):
    """Plot training/validation loss and accuracy curves of one model.

    Args:
        history: Object with a `.history` dict holding the keys "loss",
            "val_loss", "accuracy" and "val_accuracy" (as returned by
            `model.fit`).
        clr: Matplotlib colour code (e.g. "g"); training curves are drawn
            solid, validation curves dashed in the same colour.
        i: Model identifier used in the legend labels ("M<i> ...").
        axes: Indexable pair of axes, loss panel first and accuracy panel
            second. When omitted, the notebook-level `ax` variable is used,
            which must then have been created beforehand.

    Returns:
        None. The curves are drawn onto the given axes.
    """
    if axes is None:
        # backward-compatible fallback: the notebook-level axes pair
        axes = ax

    curves = history.history
    # (axes, history key, title, line width) for each of the two panels
    panels = (
        (axes[0], "loss", "Loss", 2),
        (axes[1], "accuracy", "Accuracy", 1.5),
    )
    for panel, key, title, width in panels:
        panel.plot(curves[key], "{}".format(clr),
                   label="M{} Train {}".format(i, key), linewidth=width)
        panel.plot(curves["val_" + key], "{}--".format(clr),
                   label="M{} Val {}".format(i, key), linewidth=width)
        panel.set_xlabel("$Epochs$", fontsize=16)
        panel.set_ylabel("${}$".format(title), fontsize=16)
        panel.set_title(title, fontsize=18)
        panel.legend(frameon=False, fontsize=14)
    
# print results
def return_results(model, test_features, test_labels, i="0"):
    """Evaluate `model` on the given data and print a one-line summary.

    Expects `model.evaluate` to return a (loss, accuracy) pair. Prints the
    accuracy as a percentage and the loss, both with two decimals, prefixed
    with the model identifier `i`. Returns nothing.
    """
    test_loss, test_accuracy = model.evaluate(test_features, test_labels, verbose=0)
    accuracy_pct = 100 * test_accuracy
    summary = "M{}|| Accuracy: {:.2f}% --- Loss: {:.2f}".format(i, accuracy_pct, test_loss)
    print(summary)

In [None]:
# report performance on the held-out test split
return_results(model, test_df.values, test_target_df.values)

# two side-by-side panels: loss on the left, accuracy on the right
f, ax = plt.subplots(ncols=2, figsize=(14, 6))

# draw the training curves of the model (green, labelled "M1")
plot_model_results(history, clr="g", i=1)

## Self Influence 
#### Incorporating the self-influence code from the [TracIn reference notebook](https://github.com/frederick0329/TracIn/blob/master/imagenet/resnet50_imagenet_self_influence.ipynb).

In [None]:
# method to calculate self-influence of batch members.
@tf.function
def run_self_influence(features, labels, models):
    """Compute the self-influence score of every example in a batch.

    For each model, the per-example loss is differentiated with respect to all
    trainable weights; an example's score is the sum of its squared gradient
    entries over all weights. Scores are then averaged across `models`.

    Args:
        features: Batch of input features fed to every model.
        labels: Batch of binary labels; reshaped to a column for the loss.
        models: Non-empty list of Keras models that output one logit per
            example.

    Returns:
        A tuple `(scores, labels, probs, predicted_labels)` where `scores` is
        the averaged self-influence per example, `labels` is passed through
        unchanged, and `probs` / `predicted_labels` are the probability and
        index (0 or 1) of the most likely class according to the last model.

    Raises:
        ValueError: If `models` is empty.
    """
    if not models:
        raise ValueError("run_self_influence needs at least one model")

    self_influences = []
    for m in models:
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(m.trainable_weights)
            logits = m(features, training=False)
            loss = tf.keras.losses.binary_crossentropy(
                tf.reshape(labels, shape=(-1, 1)), logits, from_logits=True)
        # jacobian keeps the batch axis, giving one gradient per example
        grads = tape.jacobian(loss, m.trainable_weights)
        # squared gradient norm: sum over every non-batch axis of every weight
        scores = tf.add_n([tf.math.reduce_sum(
            grad * grad, axis=tf.range(1, tf.rank(grad), 1))
            for grad in grads])
        self_influences.append(scores)

    # using logits from last checkpoint
    # The model emits a single logit, so top_k on it directly would always
    # return index 0. Build explicit [P(class 0), P(class 1)] columns first so
    # top_k yields the predicted class and its probability.
    positive_prob = tf.math.sigmoid(logits)
    class_probs = tf.concat([1.0 - positive_prob, positive_prob], axis=-1)
    probs, predicted_labels = tf.math.top_k(class_probs, k=1)
    return tf.math.reduce_mean(tf.stack(self_influences, axis=-1), axis=-1), labels, probs, predicted_labels

# method to concatenate all of the batch results together
def memorisation_results(memorisation, features, labels, probs, predicted_labels):
    """Join per-batch outputs into one array per quantity.

    Each argument is a sequence of per-batch arrays. The returned dictionary
    maps "memorisation", "features", "labels", "probs" and "predicted_labels"
    to the concatenation of the corresponding batches along the first axis.
    """
    per_batch = {
        "memorisation": memorisation,
        "features": features,
        "labels": labels,
        "probs": probs,
        "predicted_labels": predicted_labels,
    }
    return {key: np.concatenate(chunks) for key, chunks in per_batch.items()}

This method incorporates **run_self_influence** and **memorisation_results** to return results for any given model scenario. It is used to compare the memorisation scores obtained from different checkpoints (CPs).

In [None]:
def batch_self_influence(train_ds, models):
    """Run the self-influence computation over every batch of `train_ds`.

    Iterates over `(features, labels)` batches, calls `run_self_influence` on
    each with `models`, prints the elapsed wall-clock time, and returns the
    per-batch outputs merged by `memorisation_results`.
    """
    # one list per quantity, in the argument order of memorisation_results:
    # scores, features, labels, probs, predicted labels
    ds_columns = ([], [], [], [], [])

    started_at = time.time()
    for batch_features, batch_labels in train_ds:
        scores, out_labels, batch_probs, batch_predictions = run_self_influence(
            batch_features, batch_labels, models)
        batch_outputs = (scores, batch_features, out_labels, batch_probs, batch_predictions)
        for column, value in zip(ds_columns, batch_outputs):
            column.append(value)
    finished_at = time.time()
    print("Total time:", datetime.timedelta(seconds=finished_at - started_at))

    return memorisation_results(*ds_columns)

A method to load the saved models for a list of checkpoint epochs (pass a one-element list for a single epoch). Checkpoints that cannot be found are reported and skipped.

In [None]:
def return_models(epochs, checkpoint_dir="cp_training_adult"):
    """Load the saved models for the requested epochs.

    Checkpoint names follow the "cp-{epoch:04d}" pattern used when the
    checkpoints were written (e.g. cp-0007, cp-0123).

    Args:
        epochs: Iterable of integer epoch numbers to load.
        checkpoint_dir: Directory containing the checkpoints, relative to the
            current working directory unless absolute.

    Returns:
        List of loaded models in the order of `epochs`. Epochs whose
        checkpoint does not exist are reported on stdout and skipped, so the
        list can be shorter than `epochs`.
    """
    loaded_models = []
    for epoch in epochs:
        checkpoint_name = "cp-{:04d}".format(epoch)
        checkpoint = os.path.join(checkpoint_dir, checkpoint_name)
        if not os.path.exists(checkpoint):
            print("File not found: {}".format(checkpoint_name))
            continue
        loaded_models.append(tf.keras.models.load_model(checkpoint))
    return loaded_models

Method that uses **batch_self_influence** to generate self-influence results for all the models passed in *models*.
- When more than one model is given, there is one result per model, in the order given (so the first result is for the zeroth epoch when checkpoint 0 is loaded first)
- The last result is averaged across all the models except the first one (the zeroth epoch)
- Returns a list of results

In [None]:
def get_results(train_ds, models):
    """Compute self-influence results per model plus an averaged result.

    Args:
        train_ds: Batched dataset of `(features, labels)` pairs.
        models: Non-empty list of models, expected in checkpoint order.

    Returns:
        A list of result dictionaries. With more than one model it holds one
        result per model (in order) followed by the result averaged over all
        models except the first. With exactly one model it holds only the
        result for that model.

    Raises:
        ValueError: If `models` is empty.
    """
    if not models:
        raise ValueError("get_results requires at least one loaded model")

    results = []
    if len(models) > 1:
        results.extend(batch_self_influence(train_ds, [model]) for model in models)

    # ave. self-influence: skip the first model when others exist; with a
    # single model fall back to it, because averaging over an empty list fails
    averaged_over = models[1:] or models
    results.append(batch_self_influence(train_ds, averaged_over))
    return results

## Create influence-based ordered dataset
Running the self-influence method over the entire training dataset in batches. The memorisation scores, features, labels, probabilities and predicted labels of every batch are collected and concatenated into one result dictionary.

In [None]:
# slice the numpy feature/label arrays into a dataset and group it into batches
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df.values, train_target_df.values)
).batch(batch_size=BATCH_SIZE)

Results from SGD optimizer with fixed learning rate (0.01)

In [None]:
# checkpoint epochs to analyse; the model index is later used to label plots
# NOTE: return_models skips checkpoints it cannot find, so `models` can be
# shorter than `model_cps`
model_cps = list(range(6)) + [10, 20, 30, 40, 50]
models = return_models(model_cps)

In [None]:
# returns all model results in a list
#  > with more than one model: one result per checkpoint, in the order of `models`
#  > last index holds the result averaged over models[1:]
results = get_results(train_ds, models)

Save results into a pickle file to analyse in **adult_analysis.ipynb**.

In [None]:
import pickle

# save options
EXTENSION = "scan_results_shuffled_1000"

# create the output folder if needed, since open() does not create missing directories
os.makedirs("results", exist_ok=True)

# store data (serialize)
with open('results/{}.pickle'.format(EXTENSION), 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
