In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from VAD_util import calculate_lambda, prediction_transformation

# Seed NumPy's global generator first so every run draws the same data.
np.random.seed(1234)

# Data-generating process: each example has `num_features` normal features
# with standard deviation `sigma` (see generate_data).
num_features = 20
sigma = 0.1

# Sample sizes: `num_examples_train` is used for the train and validation
# sets, `num_examples` for the test and test-validation sets.
num_examples_train = 3000
num_examples = 10000

# Experiment settings: number of repeated simulations, number of bootstrap
# refits per simulation, and the top fractions evaluated on the validation set.
num_simulation = 1000
S_GROUP = 2
alphas_val = [0.02, 0.05, 0.1]

# Names of every metric recorded per simulation by VAD_method, in the order
# in which their means are printed.
result_keys = [
    # calibration over the whole test set
    'total_calibration_p', 'total_calibration_y',
    # share of positive labels
    'positive_ratio_after_selection', 'positive_ratio_train', 'positive_ratio_test',
    # true probabilities vs. labels on the selected examples
    'Ground truth',
    # calibration when selecting by the true probability instead
    'calibration_y_true_p', 'calibration_p_true_p', 'calibration_p_y_true_p',
    # uncorrected predictions on the selected examples
    'Vanilla', 'Vanilla_p',
    # predictions rescaled by the validation-set calibration ratio
    'Validation', 'Validation 0.05', 'Validation 0.1',
    # VAD-corrected predictions (logit and probability variants)
    'VAD', 'VAD_p', 'VAD prob', 'VAD prob_p',
    # intermediate quantities of the correction
    'Lambda logit', 'Lambda prob', 'Mu logit', 'Mu prob',
    # relative change in log loss, in percent
    'Log Loss Logit Improvement', 'Log Loss Prob Improvement',
]

def generate_data(num_examples, mu):
    """Draw one synthetic data set from a logistic model.

    Every feature is sampled independently from a normal distribution with
    mean ``mu`` and standard deviation taken from the module-level ``sigma``;
    the number of columns comes from the module-level ``num_features``.  The
    success probability of a row is the sigmoid of the sum of its features,
    and the label is a single Bernoulli draw with that probability.

    Returns:
        Tuple ``(x, y, p)``: the feature matrix of shape
        ``(num_examples, num_features)``, the 0/1 labels and the true
        probabilities, one per row.
    """
    shape = (num_examples, num_features)
    features = np.random.normal(mu, sigma, shape)
    row_totals = features.sum(axis=1)
    probabilities = 1 / (1 + np.exp(-row_totals))
    labels = np.random.binomial(1, probabilities)
    return features, labels, probabilities

def bootstrap_data(x, y):
    """Return a bootstrap resample of the paired arrays ``x`` and ``y``.

    As many row indices as ``x`` has rows are drawn uniformly with
    replacement from NumPy's global generator, and the same indices are
    applied to both arrays so that rows stay paired.
    """
    sample_size = x.shape[0]
    # An integer population is sampled as if it were np.arange(sample_size).
    picked = np.random.choice(sample_size, size=sample_size, replace=True)
    return x[picked], y[picked]

def VAD_method(num_simulation, num_group, method, alpha, train_data, val_data, test_data, test_val_data):
    """Run repeated simulations and collect calibration metrics on the top-``alpha`` selection.

    For every simulation ``k`` an unpenalised logistic regression is fitted on
    the k-th training set and used to score the k-th validation, test and
    test-validation sets.  ``num_group`` further models are fitted on bootstrap
    resamples of the same training set; their predictions on the
    test-validation set are passed to ``calculate_lambda``.  The
    ``int(n_test * alpha)`` test examples with the highest predicted
    probability are then selected, the predictions on that selection are
    adjusted with ``prediction_transformation``, and one value per key of the
    module-level ``result_keys`` is recorded.

    After the last simulation ``alpha`` and the mean of every metric are
    printed.

    Args:
        num_simulation: Number of simulations; entries ``0 .. num_simulation - 1``
            of each data array are used.
        num_group: Number of bootstrap refits per simulation.
        method: Must be ``'bootstrap'`` or ``'seperate_data'`` (spelling as
            accepted by the check).  The value is only validated; it does not
            otherwise influence the computation.
        alpha: Fraction of the test set selected by predicted probability.
        train_data: ``(x_array, y_array, p_array)`` indexable by simulation
            number; only features and labels are used.
        val_data: Same layout; only features and labels are used.
        test_data: Same layout; features, labels and true probabilities are used.
        test_val_data: Same layout; only the features are used.

    Returns:
        Dict mapping every key of ``result_keys`` to a NumPy array holding one
        entry per simulation.

    Raises:
        AssertionError: If ``method`` is not one of the accepted values.
        ValueError: If a selection size truncates to zero (or is negative).
            Without this check ``np.argpartition(x, -0)[-0:]`` would silently
            select every example.
    """
    assert method in ['bootstrap', 'seperate_data']

    def calculate_logit(X, clf):
        # Linear score of the fitted model (the value before the sigmoid).
        return np.squeeze(X @ clf.coef_.T + clf.intercept_)

    def top_k_indices(scores, k):
        # Indices (in no particular order) of the k largest entries of `scores`.
        if k <= 0:
            raise ValueError(
                "selection size must be positive, got %d; "
                "the selection fraction is too small for the data size" % k
            )
        return np.argpartition(scores, -k)[-k:]

    def summed_log_loss(y_true, p_hat):
        # Binary cross-entropy summed (not averaged) over the examples.
        return np.sum(-(y_true * np.log(p_hat) + (1 - y_true) * np.log(1 - p_hat)))

    x_train_array, y_train_array, _ = train_data
    x_val_array, y_val_array, _ = val_data
    x_test_val_array, _, _ = test_val_data
    x_test_array, y_test_array, p_test_array = test_data

    result = {result_key: [] for result_key in result_keys}

    for k in range(num_simulation):
        x_train, y_train = x_train_array[k], y_train_array[k]
        x_val, y_val = x_val_array[k], y_val_array[k]
        x_test, y_test, p_test = x_test_array[k], y_test_array[k], p_test_array[k]
        x_test_val = x_test_val_array[k]

        # NOTE(review): penalty='none' is deprecated in recent scikit-learn
        # releases in favour of penalty=None; kept as-is so the code keeps
        # running on the version it was written for - confirm before upgrading.
        clf = LogisticRegression(penalty='none', random_state=0).fit(x_train, y_train)
        p_predicted = clf.predict_proba(x_test)[:, 1]
        p_predicted_test_val = clf.predict_proba(x_test_val)[:, 1]
        p_predicted_val = clf.predict_proba(x_val)[:, 1]
        logit_predicted = calculate_logit(x_test, clf)
        logit_predicted_test_val = calculate_logit(x_test_val, clf)
        num_examples_test_val = x_test_val.shape[0]
        num_examples_test = x_test.shape[0]

        # One column per bootstrap refit, evaluated on the test-validation set.
        p_predicted_subgroup = np.zeros((num_examples_test_val, num_group))
        logit_predicted_subgroup = np.zeros((num_examples_test_val, num_group))
        for i in range(num_group):
            x_train_bootstrap, y_train_bootstrap = bootstrap_data(x_train, y_train)
            clf_i = LogisticRegression(penalty='none', random_state=0).fit(x_train_bootstrap, y_train_bootstrap)
            p_predicted_subgroup[:, i] = clf_i.predict_proba(x_test_val)[:, 1]
            logit_predicted_subgroup[:, i] = calculate_logit(x_test_val, clf_i)

        # calculate_lambda comes from VAD_util; its exact definition is not
        # visible in this file.
        lambda_p_logit = calculate_lambda(p_predicted_subgroup, p_predicted_test_val, 'logit', logit_predicted_subgroup, logit_predicted_test_val)
        lambda_p_prob = calculate_lambda(p_predicted_subgroup, p_predicted_test_val, 'probability', logit_predicted_subgroup, logit_predicted_test_val)

        # Select the top-alpha share of the test set by predicted probability.
        choose_num_examples = int(num_examples_test * alpha)
        ind = top_k_indices(p_predicted, choose_num_examples)

        # Ratio of predicted to observed positives on the top of the
        # validation set, keyed by int(alpha_val * 1000).
        # NOTE(review): the selection size is derived from the size of the
        # test-validation set although it indexes the validation set - confirm
        # this is intended (the two sets have different sizes in this script).
        val_calibration_report = {}
        for alpha_val in alphas_val:
            choose_num_examples_val = int(num_examples_test_val * alpha_val)
            ind_val = top_k_indices(p_predicted_val, choose_num_examples_val)
            val_calibration_report[int(alpha_val * 1000)] = np.sum(p_predicted_val[ind_val]) / np.sum(y_val[ind_val])

        # Same-sized selection, but ranked by the true probability.
        ind_true_p = top_k_indices(p_test, choose_num_examples)

        y_selected = y_test[ind]
        mu_prob = np.mean(p_predicted_test_val)
        mu_logit = np.mean(logit_predicted_test_val)

        # prediction_transformation comes from VAD_util; presumably it returns
        # the adjusted predictions for the examples in `ind` - TODO confirm.
        refined_prediction_logit = prediction_transformation(p_predicted, ind, lambda_p_logit, 'logit', mu_prob, mu_logit, logit_predicted)
        refined_prediction_prob = prediction_transformation(p_predicted, ind, lambda_p_prob, 'probability', mu_prob, mu_logit, logit_predicted)

        # Named so that the imported sklearn `log_loss` is not shadowed.
        vanilla_log_loss = summed_log_loss(y_selected, p_predicted[ind])
        refined_log_loss_logit = summed_log_loss(y_selected, refined_prediction_logit)
        refined_log_loss_prob = summed_log_loss(y_selected, refined_prediction_prob)

        # Sums that appear in several metrics below.
        sum_pred_selected = np.sum(p_predicted[ind])
        sum_y_selected = np.sum(y_selected)
        sum_p_selected = np.sum(p_test[ind])
        sum_pred_true_p = np.sum(p_predicted[ind_true_p])
        sum_y_true_p = np.sum(y_test[ind_true_p])
        sum_p_true_p = np.sum(p_test[ind_true_p])
        vanilla = sum_pred_selected / sum_y_selected

        curr = {
            'total_calibration_p': np.sum(p_predicted) / np.sum(p_test),
            'total_calibration_y': np.sum(p_predicted) / np.sum(y_test),
            'positive_ratio_after_selection': sum_y_selected / len(y_selected),
            'positive_ratio_train': np.sum(y_train) / len(y_train),
            'positive_ratio_test': np.sum(y_test) / len(y_test),
            'Ground truth': sum_p_selected / sum_y_selected,
            'calibration_y_true_p': sum_pred_true_p / sum_y_true_p,
            'calibration_p_true_p': sum_pred_true_p / sum_p_true_p,
            'calibration_p_y_true_p': sum_y_true_p / sum_p_true_p,
            'Vanilla': vanilla,
            'Vanilla_p': sum_pred_selected / sum_p_selected,
            # The lookups below rely on alphas_val containing 0.02, 0.05 and 0.1.
            'Validation': vanilla / val_calibration_report[20],
            'Validation 0.05': vanilla / val_calibration_report[50],
            'Validation 0.1': vanilla / val_calibration_report[100],
            'VAD': np.sum(refined_prediction_logit) / sum_y_selected,
            'VAD_p': np.sum(refined_prediction_logit) / sum_p_selected,
            'VAD prob': np.sum(refined_prediction_prob) / sum_y_selected,
            'VAD prob_p': np.sum(refined_prediction_prob) / sum_p_selected,
            'Lambda logit': lambda_p_logit,
            'Lambda prob': lambda_p_prob,
            'Mu logit': mu_logit,
            'Mu prob': mu_prob,
            'Log Loss Logit Improvement': (refined_log_loss_logit - vanilla_log_loss) / vanilla_log_loss * 100,
            'Log Loss Prob Improvement': (refined_log_loss_prob - vanilla_log_loss) / vanilla_log_loss * 100,
        }
        for result_key in result_keys:
            result[result_key].append(curr[result_key])

    print("alpha: ", alpha)
    for result_key in result_keys:
        result[result_key] = np.array(result[result_key])
        print("mean of", result_key, ": ", np.mean(result[result_key]))
    print()

    return result

def generate_data_array(num_simulation, num_examples, mu):
    """Generate ``num_simulation`` independent data sets with ``generate_data``.

    The data sets are drawn one after another, each with ``num_examples``
    rows and feature mean ``mu``.

    Returns:
        Tuple ``(x_array, y_array, p_array)`` of three lists; position ``k``
        of each list holds the features, labels and true probabilities of the
        k-th data set.
    """
    draws = [generate_data(num_examples, mu) for _ in range(num_simulation)]
    features = [draw[0] for draw in draws]
    labels = [draw[1] for draw in draws]
    probabilities = [draw[2] for draw in draws]
    return (features, labels, probabilities)


# Feature means used for the test-side sets (mu) and the train-side sets
# (mu_train); both are equal here, so train and test share one distribution.
mu = -0.05
mu_train = -0.05
# The four collections are drawn in this order from NumPy's global generator,
# so reordering these calls changes the simulated data.
train_data = generate_data_array(num_simulation, num_examples_train, mu_train)
val_data = generate_data_array(num_simulation, num_examples_train, mu_train)
test_data = generate_data_array(num_simulation, num_examples, mu)
test_val_data = generate_data_array(num_simulation, num_examples, mu)

# report[int(alpha * 1000)][metric] -> (mean, standard error) over simulations.
report = {}
# Subset of the metrics returned by VAD_method that is summarised in `report`.
reported_result_keys = [
    'Vanilla',
    'Validation',
    'Validation 0.05',
    'Validation 0.1',
    'VAD',
    'VAD prob',
]
# Selection fractions evaluated; every run reuses the same simulated data.
alphas = [0.02, 0.05, 0.1]
for alpha in alphas:
    result = VAD_method(num_simulation, S_GROUP, 'bootstrap', alpha, train_data, val_data, test_data, test_val_data)
    # Integer key derived from the fraction (truncating conversion).
    alpha_key = int(alpha * 1000)
    report[alpha_key] = {}
    for result_key in reported_result_keys:
        curr_result = result[result_key]
        total_num_result = curr_result.shape[0]
        # Standard error of the mean, using np.std with its default ddof.
        report[alpha_key][result_key] = (np.mean(curr_result), np.std(curr_result) / np.sqrt(total_num_result))

alpha:  0.02
mean of total_calibration_p :  0.9983254908541483
mean of total_calibration_y :  0.9986705139579292
mean of positive_ratio_after_selection :  0.4988100000000001
mean of positive_ratio_train :  0.27695766666666666
mean of positive_ratio_test :  0.2774246
mean of Ground truth :  1.0080126141538204
mean of calibration_y_true_p :  1.0037009963251953
mean of calibration_p_true_p :  0.997052125766921
mean of calibration_p_y_true_p :  0.9981110446276411
mean of Vanilla :  1.0891093958203024
mean of Vanilla_p :  1.080432238506638
mean of Validation :  1.0193602336925756
mean of Validation 0.05 :  1.0310746711722194
mean of Validation 0.1 :  1.0463811687616262
mean of VAD :  0.9976130771770413
mean of VAD_p :  0.9896388876150911
mean of VAD prob :  1.0059283773511145
mean of VAD prob_p :  0.9979013031961572
mean of Lambda logit :  0.8412492314016639
mean of Lambda prob :  0.8410224843807282
mean of Mu logit :  -1.0106507392068786
mean of Mu prob :  0.2769414517450708
mean of Log Lo