# Instructions

1. Clone the repository with `git clone` and upload the whole repository to Google Drive.
2. Use Google Colab to run the code (model training, inference, and applying VAD).

# Section 1: DLRM training and inference

In [None]:
# Mount Google Drive at /content/gdrive; the working-directory cells below
# expect the uploaded repository under /content/gdrive/MyDrive/.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# enter your working directory

# %cd /content/gdrive/MyDrive/your_folder/VAD/dlrm/

In [None]:
# install the required package

# !pip install "git+https://github.com/mlperf/logging.git@beaf26d"

In [None]:
# Commands to train the DLRM model and run inference on the test and validation splits.
# To fully reproduce the results, train the model on the 15M data instead of the 100k example data.


# %%shell

# for VAR in {1..1}
# do 
#     python dlrm_s_pytorch.py \
#         --arch-sparse-feature-size=16 \
#         --arch-mlp-bot="13-512-256-64-16" \
#         --arch-mlp-top="512-256-1" \
#         --data-generation=dataset \
#         --data-set=kaggle \
#         --raw-data-file=./input/train_last_100k_as_example.txt \
#         --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz \
#         --save-model=./model/model_example.pt \
#         --numpy-rand-seed=${VAR} \
#         --data-randomize=total \
#         --lr-num-warmup-steps=0 \
#         --lr-decay-start-step=0 \
#         --loss-function=bce \
#         --round-targets=True \
#         --mlperf-logging \
#         --nepochs=2 \
#         --learning-rate=0.1 \
#         --mini-batch-size=128 \
#         --print-freq=81920 \
#         --print-time \
#         --test-mini-batch-size=16384 \
#         --test-num-workers=4 \
#         --test-freq=655360

#     python dlrm_s_pytorch.py \
#         --arch-sparse-feature-size=16 \
#         --arch-mlp-bot="13-512-256-64-16" \
#         --arch-mlp-top="512-256-1" \
#         --data-generation=dataset \
#         --data-set=kaggle \
#         --raw-data-file=./input/train_last_100k_as_example.txt \
#         --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz \
#         --load-model=./model/model_example.pt \
#         --dump-json-file=./result/result_example.json \
#         --test-data-split=test \
#         --inference-only \
#         --data-randomize=total \
#         --lr-num-warmup-steps=0 \
#         --lr-decay-start-step=0 \
#         --loss-function=bce \
#         --round-targets=True \
#         --mlperf-logging \
#         --nepochs=1 \
#         --learning-rate=0.1 \
#         --mini-batch-size=128 \
#         --print-freq=81920 \
#         --print-time \
#         --test-mini-batch-size=16384 \
#         --test-num-workers=4 \
#         --test-freq=655360

#     python dlrm_s_pytorch.py \
#         --arch-sparse-feature-size=16 \
#         --arch-mlp-bot="13-512-256-64-16" \
#         --arch-mlp-top="512-256-1" \
#         --data-generation=dataset \
#         --data-set=kaggle \
#         --raw-data-file=./input/train_last_100k_as_example.txt \
#         --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz \
#         --load-model=./model/model_example.pt \
#         --dump-json-file=./result/result_example_val.json \
#         --test-data-split=val \
#         --inference-only \
#         --data-randomize=total \
#         --lr-num-warmup-steps=0 \
#         --lr-decay-start-step=0 \
#         --loss-function=bce \
#         --round-targets=True \
#         --mlperf-logging \
#         --nepochs=1 \
#         --learning-rate=0.1 \
#         --mini-batch-size=128 \
#         --print-freq=81920 \
#         --print-time \
#         --test-mini-batch-size=16384 \
#         --test-num-workers=4 \
#         --test-freq=655360
# done

# Section 2: Apply VAD

In [None]:
# %cd /content/gdrive/MyDrive/your_folder/VAD/

In [None]:
import codecs, json
import numpy as np
import tensorflow as tf

from sklearn.calibration import IsotonicRegression
import calibration_utils
import calibration_calibrator

def get_y_and_p(file_name):
    """Load the targets and scores stored in a JSON result file.

    Args:
        file_name: Path to a UTF-8 encoded JSON file whose top-level object
            has ``'targets'`` and ``'scores'`` entries.

    Returns:
        A ``(y, p)`` tuple of NumPy arrays built from ``'targets'`` and
        ``'scores'`` respectively, with all length-1 axes squeezed out.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``'targets'`` or ``'scores'`` is missing.
    """
    # The context manager guarantees the handle is closed; the previous
    # ``codecs.open(...).read()`` left it open until garbage collection.
    with open(file_name, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    y = np.squeeze(np.array(data['targets']))
    p = np.squeeze(np.array(data['scores']))
    return y, p

def assert_test_val_different(y_test, y_val):
    """Sanity-check that the test and validation labels are not the same data.

    Inputs of different lengths are accepted without further checks. For
    equal lengths, the number of positions where the two agree must be
    strictly smaller than the length; otherwise ``AssertionError`` is raised
    (which would point at a bug in the inference step).
    """
    if len(y_test) != len(y_val):
        return
    num_matching = np.sum(y_test == y_val)
    assert num_matching < len(y_val)

def read_data_from_json(
    json_name,
    json_name_val,
    num_group,
):
    """Load per-model predictions and calibrate them with four calibrators.

    For every group ``g`` in ``1..num_group`` the files ``json_name.format(g)``
    and ``json_name_val.format(g)`` are read, concatenated, and re-split so
    that the last tenth of the rows becomes the validation part. Each
    calibrator is fitted on the validation part and applied to the rest.

    Args:
        json_name: Format string for the first result file of a group.
        json_name_val: Format string for the second result file of a group.
        num_group: Number of groups (models) to load.

    Returns:
        A 5-tuple of
        * raw scores of the non-validation rows, one column per group;
        * a dict keyed by ``"hist_bin"``, ``"platt"``, ``"platt_hist"`` and
          ``"isotonic"`` holding the calibrated scores in the same layout;
        * raw scores of the validation rows, one column per group;
        * the labels of the non-validation rows (taken from the first group);
        * the labels of the validation rows (taken from the first group).

    Raises:
        AssertionError: If the labels differ between groups, or if the two
            files of a group contain identical labels.
    """

    def load_and_split(group_id):
        # Read both files of one group, then carve the last 10% off as validation.
        y_first, p_first = get_y_and_p(json_name.format(group_id))
        y_second, p_second = get_y_and_p(json_name_val.format(group_id))
        assert_test_val_different(y_first, y_second)
        y_all = np.hstack((y_first, y_second))
        p_all = np.hstack((p_first, p_second))
        total = y_all.shape[0]
        cut = total - total // 10
        return y_all[:cut], p_all[:cut], y_all[cut:], p_all[cut:]

    def fit_then_predict(calibrator, p_fit, y_fit, p_apply):
        # Fit on the validation part, then calibrate the remaining scores.
        calibrator.fit(p_fit, y_fit)
        return calibrator.predict(p_apply)

    # The first group is loaded once up front only to learn the array sizes.
    y, _, y_val, _ = load_and_split(1)
    num_examples = y.shape[0]
    num_examples_val = y_val.shape[0]

    test_shape = (num_examples, num_group)
    val_shape = (num_examples_val, num_group)

    p_predicted_subgroup = np.zeros(test_shape)
    p_predicted_subgroup_val = np.zeros(val_shape)
    # http://ethen8181.github.io/machine-learning/model_selection/prob_calibration/prob_calibration.html#Calibration-Model
    p_predicted_subgroup_calibrated = {
        method: np.zeros(test_shape)
        for method in ("hist_bin", "platt", "platt_hist", "isotonic")
    }
    y_predicted_subgroup = np.zeros(test_shape)
    y_predicted_subgroup_val = np.zeros(val_shape)

    for col in range(num_group):
        y, p, y_val, p_val = load_and_split(col + 1)
        y_predicted_subgroup[:, col] = y
        p_predicted_subgroup[:, col] = p
        y_predicted_subgroup_val[:, col] = y_val
        p_predicted_subgroup_val[:, col] = p_val

        p_predicted_subgroup_calibrated["hist_bin"][:, col] = fit_then_predict(
            calibration_calibrator.HistogramCalibrator(n_bins=50),
            p_val, y_val, p,
        )
        p_predicted_subgroup_calibrated["platt"][:, col] = fit_then_predict(
            calibration_calibrator.PlattCalibrator(log_odds=True),
            p_val, y_val, p,
        )
        p_predicted_subgroup_calibrated["platt_hist"][:, col] = fit_then_predict(
            calibration_calibrator.PlattHistogramCalibrator(n_bins=50, log_odds=True),
            p_val, y_val, p,
        )
        # The isotonic output range is bounded by the validation score range.
        p_predicted_subgroup_calibrated["isotonic"][:, col] = fit_then_predict(
            IsotonicRegression(
                out_of_bounds='clip',
                y_min=p_val.min(),
                y_max=p_val.max(),
            ),
            p_val, y_val, p,
        )

    # make sure data is correct: every group must carry the same labels
    reference_labels = y_predicted_subgroup[:, 0]
    reference_labels_val = y_predicted_subgroup_val[:, 0]
    for col in range(num_group):
        assert np.sum(reference_labels == y_predicted_subgroup[:, col]) == num_examples
        assert np.sum(reference_labels_val == y_predicted_subgroup_val[:, col]) == num_examples_val

    return (
        p_predicted_subgroup,
        p_predicted_subgroup_calibrated,
        p_predicted_subgroup_val,
        y_predicted_subgroup[:, 0],
        y_predicted_subgroup_val[:, 0],
    )

def get_ood_construct_model_predictions():
    """Load the targets and scores of the ``model_select`` result files.

    Both JSON files under ``./dlrm/result/`` are read, checked to hold
    different labels, and concatenated (first file, then the ``_val`` file).

    Returns:
        A ``(y, p)`` tuple with the concatenated targets and scores.
    """
    select_path = './dlrm/result/result_model_select.json'
    select_val_path = './dlrm/result/result_model_select_val.json'

    y_first, p_first = get_y_and_p(select_path)
    y_second, p_second = get_y_and_p(select_val_path)
    assert_test_val_different(y_first, y_second)

    return np.hstack((y_first, y_second)), np.hstack((p_first, p_second))

# Format strings for the per-model result files; "{}" is filled with the
# 1-based model index inside read_data_from_json.
json_name = "./dlrm/result/result_model_{}.json"
json_name_val = "./dlrm/result/result_model_{}_val.json"

# The paper replicates the experiment 40 times, so in order to reproduce the results,
# you should train 40 * 2 = 80 models and set num_group = 80
num_group = 4

# Binary cross-entropy on probabilities (not logits); used as a global by generate_report.
bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# Labels and scores of the "model_select" model; its scores drive the
# sub-sampling in preprocess_test_data.
ood_y, ood_p = get_ood_construct_model_predictions()

# Raw scores, calibrated scores, validation scores and the matching labels for all models.
p_predicted_subgroup_original, p_predicted_subgroup_calibrated_original, p_predicted_subgroup_original_val, y_original, y_original_val = read_data_from_json(
    json_name, 
    json_name_val, 
    num_group,
)

In [None]:
def preprocess_test_data(
    p_predicted_subgroup,
    y,
    do_p_selection,
    ood_p,
    ood_y,
):
    """Choose which test rows to keep, optionally sub-sampling by ``ood_p``.

    With ``do_p_selection`` enabled, row ``i`` is kept when a Bernoulli draw
    with success probability ``1 - ood_p[i]`` succeeds; the draws come from
    NumPy's global random state. Without it every row is kept. A short
    summary (count, percentage, positive ratio) is printed.

    Args:
        p_predicted_subgroup: Array whose first dimension gives the number of
            test rows; only its shape is used.
        y: Labels of the test rows.
        do_p_selection: Whether to sub-sample the rows.
        ood_p: Scores used as per-row drop probabilities.
        ood_y: Labels that must agree with ``y`` on the first
            ``len(p_predicted_subgroup)`` entries.

    Returns:
        Integer NumPy array with the indices of the kept rows, in ascending
        order. It is empty (and still integer-typed) when nothing is kept.

    Raises:
        AssertionError: If ``ood_y`` does not match ``y``.
        IndexError: If ``ood_p`` has fewer entries than there are test rows.
    """
    num_examples = p_predicted_subgroup.shape[0]
    assert np.sum(ood_y[:num_examples] == y) == len(y)

    if do_p_selection:
        drop_prob = np.asarray(ood_p)
        if drop_prob.shape[0] < num_examples:
            raise IndexError(
                f"ood_p has {drop_prob.shape[0]} entries but {num_examples} are required"
            )
        # One vectorised call instead of a Python-level loop of scalar draws.
        draws = np.random.binomial(1, 1 - drop_prob[:num_examples])
        selected_ind = np.flatnonzero(draws > 0.5)
    else:
        selected_ind = np.arange(num_examples)

    num_selected = len(selected_ind)
    # Guard the empty cases: an empty float index array used to raise
    # IndexError and the ratio below divided by zero.
    percent = int(100 * num_selected // num_examples) if num_examples else 0
    positive_ratio = np.sum(y[selected_ind]) / num_selected if num_selected else float("nan")

    print(f"Selected {num_selected} samples after test pre-process ({percent}% data)")
    print("After pre-process test positive ratio: ", positive_ratio)
    print()
    return selected_ind

# Sub-sample the test rows using the "model_select" scores (see preprocess_test_data).
do_p_selection = True
# Number of model columns combined per evaluated group; read as a global by
# generate_multiple_report.
num_group_bootstrap = 2
# Fix NumPy's global seed so the random row selection below is repeatable.
np.random.seed(1)

selected_ind = preprocess_test_data(
    p_predicted_subgroup_original, 
    y_original,
    do_p_selection,
    ood_p,
    ood_y,
)

# Restrict the raw scores and labels to the selected rows.
p_predicted_subgroup = p_predicted_subgroup_original[selected_ind]
y = y_original[selected_ind]

# Apply the same row selection to every calibrated score matrix.
p_predicted_subgroup_calibrated = {}
for calibration_method in ["hist_bin", "platt", "platt_hist", "isotonic"]:
    p_predicted_subgroup_calibrated[calibration_method] = p_predicted_subgroup_calibrated_original[calibration_method][selected_ind]

In [None]:
from VAD_util import calculate_lambda, prediction_transformation

def generate_report(
    y,
    p_predicted_subgroup,
    p_predicted_subgroup_pre_cali,
    alpha,
    print_individual_result,
    num_group_bootstrap,
    p_predicted_subgroup_val=None,
):
    """Compare vanilla and VAD-refined predictions on the top-scored examples.

    The first tenth of the rows is held out: it supplies the inputs to
    ``calculate_lambda`` and the mean prediction / mean logit. On the
    remaining rows, each evaluated model column is reduced to the
    ``int(num_examples * alpha)`` highest-scored examples, the predictions are
    refined with ``prediction_transformation`` in 'logit' and 'probability'
    mode, and calibration ratios, binned errors and relative log-loss changes
    are recorded. Per-key means are printed at the end.

    Relies on the module-level names ``result_keys``, ``bce``,
    ``calculate_lambda`` and ``prediction_transformation``.

    Args:
        y: Labels, one per row of ``p_predicted_subgroup``.
        p_predicted_subgroup: 2-D array of predictions, one column per model.
        p_predicted_subgroup_pre_cali: 2-D array of predictions whose first
            tenth of rows is passed to ``calculate_lambda``.
        alpha: Fraction of the non-held-out rows kept per model (highest scores).
        print_individual_result: Whether to print every metric (and the
            full-data log loss) for each evaluated model.
        num_group_bootstrap: Number of columns gathered per evaluated model
            for ``calculate_lambda``; only the first
            ``num_group // num_group_bootstrap`` columns are evaluated.
        p_predicted_subgroup_val: Optional 2-D array of predictions. When
            given, each lambda is divided by the lambda computed on it.

    Returns:
        Dict mapping every key in ``result_keys`` to a NumPy array with one
        entry per evaluated model.
    """

    def calculate_ECE_MCE_Brier(y_prob, y_true, n_bins):
        """Return (ECE, MCE, Brier) rounded to 4 decimals.

        Examples are sorted by ``y_prob`` and split into ``n_bins`` nearly
        equal chunks. Per chunk, the gap is the absolute difference between
        the mean prediction and the mean label: ECE averages the gaps over the
        bins, MCE is the largest gap, and the "Brier" value averages the
        squared gaps (a bin-level quantity, not a per-example score).
        """
        sorted_indices = np.argsort(y_prob)
        binned_y_true = np.array_split(y_true[sorted_indices], n_bins)
        binned_y_prob = np.array_split(y_prob[sorted_indices], n_bins)
        ece_errors = 0.0
        mce_errors = 0.0
        brier_errors = 0.0
        # NOTE(review): with fewer examples than bins, np.array_split yields
        # empty chunks whose mean is NaN, which propagates into ECE and Brier.
        for bin_y_true, bin_y_prob in zip(binned_y_true, binned_y_prob):
            avg_y_true = np.mean(bin_y_true)
            avg_y_score = np.mean(bin_y_prob)
            ce_error = np.abs(avg_y_score - avg_y_true)
            ece_errors += ce_error / n_bins
            mce_errors = max(mce_errors, ce_error)
            brier_errors += ((avg_y_score - avg_y_true) ** 2) / n_bins
        return round(ece_errors, 4), round(mce_errors, 4), round(brier_errors, 4)

    def gather_bootstrap_columns(predictions, num_rows, first_column):
        """Collect the ``num_group_bootstrap`` columns paired with ``first_column``."""
        chosen = np.zeros((num_rows, num_group_bootstrap))
        for j in range(num_group_bootstrap):
            chosen[:, j] = predictions[:, (first_column + j * num_group // num_group_bootstrap) % num_group]
        return chosen

    # Hold out the first tenth of the rows; evaluate on the remainder.
    num_examples, num_group = p_predicted_subgroup.shape
    num_examples_test_val = num_examples // 10
    num_examples -= num_examples_test_val
    p_predicted_subgroup_test_val = p_predicted_subgroup[:num_examples_test_val, :]
    p_predicted_subgroup = p_predicted_subgroup[num_examples_test_val:, :]
    y = y[num_examples_test_val:]
    assert num_examples == p_predicted_subgroup.shape[0]
    p_predicted_subgroup_pre_cali_test_val = p_predicted_subgroup_pre_cali[:num_examples_test_val, :]

    result = {result_key: [] for result_key in result_keys}

    # NOTE(review): when this is 0 the slice ``[-0:]`` below keeps every
    # example instead of none.
    num_top = int(num_examples * alpha)
    n_bins = 50

    for i in range(num_group // num_group_bootstrap):
        p = p_predicted_subgroup[:, i]
        # Indices of the num_top highest-scored examples (unordered).
        ind = np.argpartition(p, -num_top)[-num_top:]

        p_predicted_subgroup_test_val_choosen = gather_bootstrap_columns(
            p_predicted_subgroup_pre_cali_test_val, num_examples_test_val, i
        )
        lambda_p_logit = calculate_lambda(p_predicted_subgroup_test_val_choosen, p_predicted_subgroup_pre_cali_test_val[:, i], 'logit')
        lambda_p_prob = calculate_lambda(p_predicted_subgroup_test_val_choosen, p_predicted_subgroup_pre_cali_test_val[:, i], 'probability')
        if p_predicted_subgroup_val is not None:
            # Normalise by the lambdas obtained on the supplied validation predictions.
            p_predicted_subgroup_val_choosen = gather_bootstrap_columns(
                p_predicted_subgroup_val, p_predicted_subgroup_val.shape[0], i
            )
            lambda_p_logit_in_distribution = calculate_lambda(p_predicted_subgroup_val_choosen, p_predicted_subgroup_val[:, i], 'logit')
            lambda_p_prob_in_distribution = calculate_lambda(p_predicted_subgroup_val_choosen, p_predicted_subgroup_val[:, i], 'probability')
            lambda_p_logit /= lambda_p_logit_in_distribution
            lambda_p_prob /= lambda_p_prob_in_distribution

        held_out_p = p_predicted_subgroup_test_val[:, i]
        p_mean_test_val = np.mean(held_out_p)
        p_mean_logit_test_val = np.mean(np.log(held_out_p / (1 - held_out_p)))
        refined_prediction_logit = prediction_transformation(p, ind, lambda_p_logit, 'logit', p_mean_test_val, p_mean_logit_test_val)
        refined_prediction_prob = prediction_transformation(p, ind, lambda_p_prob, 'probability', p_mean_test_val, p_mean_logit_test_val)

        # Relative log-loss change in percent; negative means the refined loss is lower.
        log_loss_original = bce(y[ind], p[ind]).numpy()
        log_loss_logit = bce(y[ind], refined_prediction_logit).numpy()
        log_loss_prob = bce(y[ind], refined_prediction_prob).numpy()
        log_loss_logit_improve = (log_loss_logit - log_loss_original) / log_loss_original * 100
        log_loss_prob_improve = (log_loss_prob - log_loss_original) / log_loss_original * 100

        Vanilla_ECE, Vanilla_MCE, Vanilla_Brier = calculate_ECE_MCE_Brier(p[ind], y[ind], n_bins)
        VAD_ECE, VAD_MCE, VAD_Brier = calculate_ECE_MCE_Brier(refined_prediction_logit, y[ind], n_bins)
        VAD_Prob_ECE, VAD_Prob_MCE, VAD_Prob_Brier = calculate_ECE_MCE_Brier(refined_prediction_prob, y[ind], n_bins)

        curr = {
            'total_calibration': np.sum(p) / np.sum(y),
            'positive_ratio_after_selection': np.sum(y[ind]) / len(y[ind]),
            'Vanilla': np.sum(p[ind]) / np.sum(y[ind]),
            'Vanilla ECE': Vanilla_ECE,
            'Vanilla MCE': Vanilla_MCE,
            'Vanilla Brier': Vanilla_Brier,
            'VAD': np.sum(refined_prediction_logit) / np.sum(y[ind]),
            'VAD ECE': VAD_ECE,
            'VAD MCE': VAD_MCE,
            'VAD Brier': VAD_Brier,
            'VAD prob': np.sum(refined_prediction_prob) / np.sum(y[ind]),
            'VAD prob ECE': VAD_Prob_ECE,
            'VAD prob MCE': VAD_Prob_MCE,
            'VAD prob Brier': VAD_Prob_Brier,
            'Lambda logit': lambda_p_logit,
            'Lambda prob': lambda_p_prob,
            'Mu logit': p_mean_logit_test_val,
            'Mu prob': p_mean_test_val,
            'Log Loss Logit Improvement': log_loss_logit_improve,
            'Log Loss Prob Improvement': log_loss_prob_improve,
        }
        for result_key in result_keys:
            result[result_key].append(curr[result_key])

        if print_individual_result:
            # The full-data log loss is only needed for this printout.
            print("LogLoss: ", bce(y, p).numpy())
            for result_key in result_keys:
                print(result_key, ": ", curr[result_key])
            print()

    print("Num Group: ", num_group)
    print("Alpha: ", alpha)
    for result_key in result_keys:
        result[result_key] = np.array(result[result_key])
        print("mean of ", result_key, ": ", np.mean(result[result_key]))
    print()

    return result

def generate_multiple_report(
    p_predicted_subgroup,
    p_predicted_subgroup_pre_cali,
    y,
    p_predicted_subgroup_val=None,
    alphas=(0.02, 0.1),
):
    """Run ``generate_report`` for several ``alpha`` values and summarise them.

    Relies on the module-level names ``num_group_bootstrap`` and
    ``reported_result_keys``.

    Args:
        p_predicted_subgroup: Predictions to evaluate, one column per model.
        p_predicted_subgroup_pre_cali: Predictions forwarded to
            ``generate_report`` as its pre-calibration input.
        y: Labels, one per row of ``p_predicted_subgroup``.
        p_predicted_subgroup_val: Optional predictions forwarded to
            ``generate_report``.
        alphas: Fractions to evaluate; defaults to the two values that were
            previously hard-coded.

    Returns:
        Dict keyed by ``int(1000 * alpha)``. Each value maps every key in
        ``reported_result_keys`` to a ``(mean, standard error)`` tuple, where
        the standard error is ``np.std`` divided by the square root of the
        number of entries.
    """
    report = {}
    for alpha in alphas:
        result = generate_report(
            y,
            p_predicted_subgroup,
            p_predicted_subgroup_pre_cali,
            alpha,
            False,
            num_group_bootstrap,
            p_predicted_subgroup_val,
        )
        summary = {}
        for result_key in reported_result_keys:
            curr_result = result[result_key]
            standard_error = np.std(curr_result) / np.sqrt(curr_result.shape[0])
            summary[result_key] = (np.mean(curr_result), standard_error)
        report[int(1000 * alpha)] = summary
    return report


# Every metric recorded per evaluated model by generate_report; each entry
# must be a key of the `curr` dict built there.
result_keys = [
    'total_calibration',
    'positive_ratio_after_selection',
    'Vanilla',
    'Vanilla ECE',
    'Vanilla MCE',
    'Vanilla Brier',
    'VAD',
    'VAD ECE',
    'VAD MCE',
    'VAD Brier',
    'VAD prob',
    'VAD prob ECE',
    'VAD prob MCE',
    'VAD prob Brier',
    'Lambda logit',
    'Lambda prob',
    'Mu logit',
    'Mu prob',
    'Log Loss Logit Improvement',
    'Log Loss Prob Improvement',
]

# Subset of result_keys that generate_multiple_report summarises as
# (mean, standard error) pairs.
reported_result_keys = [
    'Vanilla',
    'Vanilla ECE',
    'Vanilla MCE',
    'VAD',
    'VAD ECE',
    'VAD MCE',
]

# Summaries keyed by calibration method; "none" is the uncalibrated baseline.
final_report = {}
# Baseline: the raw scores serve both as the evaluated predictions and as the
# pre-calibration input, and no validation predictions are supplied.
final_report["none"] = generate_multiple_report(
    p_predicted_subgroup, 
    p_predicted_subgroup,
    y,
)

print("Calibration Result (calibrated on all data)\n")

# Calibrated runs: evaluate the calibrated scores, keep the raw scores as the
# pre-calibration input, and pass the raw validation scores as well.
for calibration_method in ["hist_bin", "platt", "platt_hist", "isotonic"]:
    print(f"Calibration Method: {calibration_method}")
    final_report[calibration_method] = generate_multiple_report(
        p_predicted_subgroup_calibrated[calibration_method], 
        p_predicted_subgroup,
        y, 
        p_predicted_subgroup_original_val,
    )