In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'svg'
plt.rc('font', family='serif')
plt.rc('axes', grid=False)
plt.rc('ytick', direction='out', color='gray')
plt.rc('xtick', direction='out', color='gray')
plt.rcParams.update({'font.size': 12})

# Parameters for the dataset
n_samples = 10000
n_features = 2
n_informative = 2
n_redundant = 0
n_clusters_per_class = 2
weights = [0.5, 0.5]
class_sep = 1.2
flip_y = 0

# Generate the dataset
X, y = make_classification(n_samples=n_samples, 
                           n_features=n_features, 
                           n_informative=n_informative, 
                           n_redundant=n_redundant, 
                           n_clusters_per_class=n_clusters_per_class, 
                           weights=weights, 
                           class_sep=class_sep,
                           flip_y=0,
                           random_state=42)

X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
X_test, X_v, y_test, y_v = train_test_split(X_, y_, test_size=0.5, random_state=42, stratify=y_)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_v = sc.transform(X_v)

In [None]:
# Scatter plot of the standardized training split, one colour per class.
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
for class_value, class_colour in ((1, 'blue'), (0, 'red')):
    in_class = y_train == class_value
    ax.scatter(X_train[in_class, 0], X_train[in_class, 1],
               s=3, alpha=0.2, c=class_colour, label=f'$y = {class_value}$')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.legend()
fig.savefig('../outputs/figures/simulation_scatter.pdf', dpi=300, bbox_inches='tight')

In [None]:
import sys, os
# Make the parent of the current working directory importable so that the
# project-local `src` package resolves.
project_root = os.path.join(os.getcwd(), '..')
sys.path.append(project_root)

# NOTE(review): the star import hides which names this cell brings in;
# `compute_knn_shapley` is presumably one of them -- confirm, and prefer an
# explicit import once the module's public names are known.
from src.utils.data_valuation import *

# One value per training point (assumed from how `knn` is ranked and used to
# index X_train in the next cell -- TODO confirm the return type).
knn = compute_knn_shapley(X_train, y_train, X_test, y_test, k=5)

In [None]:
import pandas as pd

# Rank training points by ascending kNN value; the lowest-valued points are
# treated as the "hardest" ones.
ind_knn = pd.Series(knn).sort_values(ascending=True).index

# Hardest 5% of the training split. The size is derived from the data (250
# for the current 5000-row training split) instead of being hard-coded, so
# it stays at 5% if n_samples or the split ratio is changed.
n_hard = len(X_train) // 20
X_hard = X_train[ind_knn[:n_hard]]
y_hard = y_train[ind_knn[:n_hard]]

In [None]:
# Overlay the hard subset (more opaque) on the full training split (faint),
# class by class, keeping the drawing order hard -> train for each class.
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
for class_value, class_colour in ((1, 'blue'), (0, 'red')):
    hard_mask = y_hard == class_value
    train_mask = y_train == class_value
    ax.scatter(X_hard[hard_mask, 0], X_hard[hard_mask, 1], s=3, alpha=0.5, c=class_colour)
    ax.scatter(X_train[train_mask, 0], X_train[train_mask, 1], s=3, alpha=0.05, c=class_colour)
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
fig.savefig('../outputs/figures/simulation_scatter_hard.pdf', dpi=300, bbox_inches='tight');

In [None]:
import json
import xgboost as xgb

import sys, os
# Make the parent of the current working directory importable so that the
# project-local `src` package resolves.
sys.path.append(os.path.join(os.getcwd(), '..'))

# NOTE(review): star import; `amex_scorer` and `amex_metric` used below are
# presumably defined in this module -- confirm.
from src.utils.amex_metric import *

# Booster parameters are kept outside the notebook in a JSON config file.
with open('../config/xgboost.json', 'r') as config_file:
    params = json.load(config_file)

dtrain = xgb.DMatrix(X_train, y_train)
dv = xgb.DMatrix(X_v, y_v)
watchlist = [(dtrain, 'train'), (dv, 'v')]

# Train with early stopping on the custom metric (higher is better).
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=9999,
    evals=watchlist,
    custom_metric=amex_scorer,
    maximize=True,
    early_stopping_rounds=100,
    verbose_eval=0,
)

# Score the "v" split using only the trees up to the best iteration.
best_rounds = (0, bst.best_iteration + 1)
print(amex_metric(y_v, bst.predict(dv, iteration_range=best_rounds)))

In [None]:
# Number of importance slots (f0 .. f187) in the table.
# NOTE(review): 188 is far larger than the feature count of the simulated
# dataset built at the top of this notebook; it looks carried over from a
# different dataset. It is kept unchanged because `df_importance['gain']` is
# passed as `weights=` to the synthesizer and the length that code expects is
# not visible here -- confirm and replace with X_train.shape[1] if appropriate.
n_importance_slots = 188
feature_keys = [f'f{ix}' for ix in range(n_importance_slots)]

# get_score only reports features that the booster actually used; fill the
# rest with 0. The dicts are rebuilt in feature-index order so that row i of
# the table always describes feature f{i}. (Previously missing keys were
# appended after the reported ones, so positional use of the columns could be
# misaligned whenever a feature was unused.)
raw_gain = bst.get_score(importance_type='gain')
raw_weight = bst.get_score(importance_type='weight')
dict_gain = {key: raw_gain.get(key, 0) for key in feature_keys}
dict_weight = {key: raw_weight.get(key, 0) for key in feature_keys}

df_importance = pd.DataFrame({'feature': feature_keys,
                              'gain': list(dict_gain.values()),
                              'weight': list(dict_weight.values())})

df_importance.head(2)

In [None]:
import copy
import pickle

from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer, CTGANSynthesizer
from ctgan import TVAE, CTGAN
from baytune import BTBSession
from baytune.tuning import Tunable
from baytune.tuning import hyperparams as hp

from src.utils.train_xgb import train_xgb

def append_to_pickle_file(pickle_file, new_data):
    """Append ``new_data`` to the list stored in ``pickle_file``.

    The file is expected to hold a single pickled object. If that object is
    not a list it is wrapped in one before appending. A missing or empty file
    is treated as "no entries yet", so the file ends up containing
    ``[new_data]``.

    Parameters
    ----------
    pickle_file : str or path-like
        Path of the pickle file to read and rewrite.
    new_data : object
        Any picklable object to append as one list entry.

    Notes
    -----
    Only the read is guarded: errors raised while writing (for example a
    missing parent directory) propagate to the caller instead of triggering
    a second write attempt. Unpickling runs arbitrary code, so this must only
    be pointed at files produced by this notebook.
    """
    try:
        with open(pickle_file, 'rb') as f:
            payload = f.read()
    except FileNotFoundError:
        payload = b''

    # An empty payload covers both "file missing" and "file exists but empty".
    existing_data = pickle.loads(payload) if payload else []
    if not isinstance(existing_data, list):
        existing_data = [existing_data]
    existing_data.append(new_data)

    with open(pickle_file, 'wb') as f:
        pickle.dump(existing_data, f)

# Tabular view of the hard subset: feature columns plus a 'target' column.
X_hard_df = pd.DataFrame(X_hard).assign(target=y_hard)

# Infer metadata: categorical and numeric features
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(X_hard_df)

# Baseline: score obtained from the un-augmented training split, evaluated on
# the "v" split. The second return value of train_xgb is not used here.
tot_score, _ = train_xgb(X_train, y_train, X_v, y_v, n=10)

def get_xgboost_score(X_train, y_train):
    """Return the score change relative to the un-augmented baseline.

    Calls ``train_xgb`` on the given training data, evaluated on the
    notebook-level ``X_v`` / ``y_v`` split, and subtracts the notebook-level
    baseline ``tot_score``. A positive value means the given training data
    scored higher than the baseline.

    Note that the parameters deliberately shadow the notebook-level
    ``X_train`` / ``y_train``; callers pass the augmented arrays.
    """
    candidate_score, _ = train_xgb(X_train, y_train, X_v, y_v, n=10)
    delta = candidate_score - tot_score
    return delta

# Synthesizer wrapper classes, keyed by model name; instantiated and fitted
# inside scoring_function.
mods = dict(TVAE=TVAESynthesizer)

# Matching low-level classes whose .load() is used to read a saved
# synthesizer back from disk.
synths = dict(TVAE=TVAE)

def transform_dict(mod_name, hyperparams):
    """Convert flat tuner hyperparameters into synthesizer keyword arguments.

    For ``mod_name == 'TVAE'`` the per-layer entries ``compress_dims_0/1`` and
    ``decompress_dims_0/1`` are packed into 2-tuples and ``embedding_dim`` is
    passed through. Any other model name yields an empty dict.

    Raises ``KeyError`` if a required entry is missing for 'TVAE'.
    """
    if mod_name != 'TVAE':
        return {}
    return {
        'embedding_dim': hyperparams['embedding_dim'],
        'compress_dims': (hyperparams['compress_dims_0'],
                          hyperparams['compress_dims_1']),
        'decompress_dims': (hyperparams['decompress_dims_0'],
                            hyperparams['decompress_dims_1']),
    }

def scoring_function(mod_name, hyperparams):
    '''
    Scorer used by the tuning session for the hard-subset experiment.

    Fits a synthesizer on X_hard_df, reloads it from its save path, then
    repeats 10 times: sample len(y_hard) synthetic rows, append them to the
    training split, and record the score change from get_xgboost_score.
    The hyperparameters and the 10 raw scores are appended to pickle logs.

    Returns the mean of the 10 score changes.

    Uses and increments the notebook-level counter `ind` to name the saved
    synthesizer file, so `ind` must be initialised before the session runs.
    '''
    global ind
    synth_cls = mods[mod_name]
    extra_kwargs = transform_dict(mod_name, hyperparams)
    checkpoint_path = f'../outputs/synthesizers/tuning_sim/{ind}.pkl'
    # NOTE(review): save_path / cuda / patience / weights are not inspected
    # here; their exact semantics depend on the installed synthesizer -- confirm.
    synthesizer = synth_cls(
        metadata,
        save_path=checkpoint_path,
        epochs=500,
        batch_size=100,
        cuda='cuda:1',
        patience=50,
        weights=df_importance['gain'].values,
        **extra_kwargs,
    )
    synthesizer.fit(X_hard_df)
    # restore best model (reload whatever was written to the save path)
    synthesizer = synths[mod_name].load(checkpoint_path)
    ind += 1

    scores = []
    for _ in range(10):  # Repeat 10 times to mitigate randomness
        sampled = synthesizer.sample(len(y_hard))
        synth_features = sampled.drop('target', axis=1).values
        synth_labels = np.array(sampled['target'])
        augmented_X = np.vstack((X_train, synth_features))
        augmented_y = np.concatenate((y_train, synth_labels))
        scores.append(get_xgboost_score(augmented_X, augmented_y))

    # Log a copy so the tuner's own hyperparameter dict is left untouched.
    logged_hyperparams = copy.deepcopy(hyperparams)
    logged_hyperparams['mod_name'] = mod_name
    append_to_pickle_file('../outputs/synthesizers/hyperparams_hard_sim.pkl', logged_hyperparams)
    append_to_pickle_file('../outputs/synthesizers/scores_hard_sim.pkl', scores)
    return np.mean(scores)

# Candidate models and their hyperparameter sets
def _width_param(default):
    """Integer hyperparameter with min=16, max=256, step=1 and the given default."""
    return hp.IntHyperParam(min=16, max=256, default=default, step=1)

tunables = {
    'TVAE': Tunable({
        'embedding_dim': _width_param(64),
        'compress_dims_0': _width_param(128),
        'compress_dims_1': _width_param(128),
        'decompress_dims_0': _width_param(128),
        'decompress_dims_1': _width_param(128),
    })
}

session = BTBSession(tunables=tunables, scorer=scoring_function, verbose=True)

# `ind` numbers the synthesizer files written by scoring_function; reset it
# before each tuning run.
ind = 0
best_prop = session.run(10)
print(best_prop)

# Dump session results
with open('../outputs/synthesizers/session_hard_sim.pkl', "wb") as session_file:
    pickle.dump(session, session_file)

In [None]:
# Tabular view of the full training split: feature columns plus 'target'.
X_train_df = pd.DataFrame(X_train).assign(target=y_train)

def scoring_function(mod_name, hyperparams):
    '''
    Scorer used by the tuning session for the full-training-set experiment.

    This rebinds the name `scoring_function`; the earlier definition in this
    notebook (which fits on the hard subset) is shadowed from here on.

    Fits a synthesizer on X_train_df, reloads it from its save path, then
    repeats 10 times: sample len(y_train) // 20 synthetic rows, append them
    to the training split, and record the score change from
    get_xgboost_score. The hyperparameters and the 10 raw scores are appended
    to pickle logs.

    Returns the mean of the 10 score changes.

    Uses and increments the notebook-level counter `ind` to name the saved
    synthesizer file, so `ind` must be initialised before the session runs.
    '''
    global ind
    synth_cls = mods[mod_name]
    extra_kwargs = transform_dict(mod_name, hyperparams)
    checkpoint_path = f'../outputs/synthesizers/tuning_tot_sim/{ind}.pkl'
    # NOTE(review): `metadata` was detected from X_hard_df in an earlier cell
    # and is reused for X_train_df; this assumes both frames share the same
    # schema -- confirm.
    synthesizer = synth_cls(
        metadata,
        save_path=checkpoint_path,
        epochs=500,
        batch_size=2000,
        cuda='cuda:1',
        patience=50,
        weights=df_importance['gain'].values,
        **extra_kwargs,
    )
    synthesizer.fit(X_train_df)
    # restore best model (reload whatever was written to the save path)
    synthesizer = synths[mod_name].load(checkpoint_path)
    ind += 1

    scores = []
    for _ in range(10):  # Repeat 10 times to mitigate randomness
        sampled = synthesizer.sample(len(y_train) // 20)
        synth_features = sampled.drop('target', axis=1).values
        synth_labels = np.array(sampled['target'])
        augmented_X = np.vstack((X_train, synth_features))
        augmented_y = np.concatenate((y_train, synth_labels))
        scores.append(get_xgboost_score(augmented_X, augmented_y))

    # Log a copy so the tuner's own hyperparameter dict is left untouched.
    logged_hyperparams = copy.deepcopy(hyperparams)
    logged_hyperparams['mod_name'] = mod_name
    append_to_pickle_file('../outputs/synthesizers/hyperparams_tot_sim.pkl', logged_hyperparams)
    append_to_pickle_file('../outputs/synthesizers/scores_tot_sim.pkl', scores)
    return np.mean(scores)

# Fresh tuning session over the same search space, now driven by the
# scoring_function defined in this cell.
session = BTBSession(tunables=tunables, scorer=scoring_function, verbose=True)

# Restart the file counter used by scoring_function for this run.
ind = 0
best_prop = session.run(10)
print(best_prop)

# Dump session results
with open('../outputs/synthesizers/session_tot_sim.pkl', "wb") as session_file:
    pickle.dump(session, session_file)

In [None]:
# Load the per-trial score logs written by scoring_function for the
# hard-subset experiment. Each row holds the repeated scores of one trial.
# These pickles are produced by this notebook; do not point this at
# untrusted files.
with open('../outputs/synthesizers/scores_hard_sim.pkl', 'rb') as f:
    existing_data = np.array(pickle.load(f))
with open('../outputs/synthesizers/hyperparams_hard_sim.pkl', 'rb') as f:
    existing_params = pickle.load(f)

# Number of repeats per trial, taken from the data rather than hard-coded.
n_repeats = existing_data.shape[1]

# Keep only the last 10 logged trials (the log file is appended to across
# runs). `stds` is the 1.96 * standard-error half-width used for error bars.
scores = np.mean(existing_data, axis=1)[-10:]
stds = 1.96 * np.std(existing_data, axis=1)[-10:] / np.sqrt(n_repeats)

from matplotlib.ticker import MaxNLocator

color = 'yellowgreen'
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
ax.errorbar(range(len(scores)), scores, yerr=stds, fmt='o', color=color, ecolor=color, capsize=3, markersize=3, alpha=1.)
ax.plot(range(len(scores)), scores, 'o', color=color, markersize=3, alpha=1)
ax.axhline(0, ls='--', lw=1, color='lightgray')  # zero line = no change vs. baseline
ax.set_xlim((-1, len(scores)))
ax.set_ylim((-0.0035, 0.004))
ax.set_xlabel('ID in Bayesian optimisation')
ax.set_ylabel('Variation in Gini after augmentation')
# Raw string: '\%' is not a valid Python escape and triggers a warning in a
# normal string literal; the rendered text is unchanged.
ax.set_title(r'Augment hardest $5\%$ by $100\%$')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.grid(axis='x')
fig.savefig('../outputs/figures/simulation_hard.pdf', dpi=300, bbox_inches='tight');

In [None]:
# Imported here as well so this cell does not depend on another plotting
# cell having been executed first.
from matplotlib.ticker import MaxNLocator

# Load the per-trial score logs written by scoring_function for the
# full-training-set experiment. Each row holds the repeated scores of one
# trial. These pickles are produced by this notebook; do not point this at
# untrusted files.
with open('../outputs/synthesizers/scores_tot_sim.pkl', 'rb') as f:
    existing_data_tot = np.array(pickle.load(f))
with open('../outputs/synthesizers/hyperparams_tot_sim.pkl', 'rb') as f:
    existing_params_tot = pickle.load(f)

# Number of repeats per trial, taken from the data rather than hard-coded.
n_repeats = existing_data_tot.shape[1]

# `stds` is the 1.96 * standard-error half-width used for the error bars.
scores = np.mean(existing_data_tot, axis=1)
stds = 1.96 * np.std(existing_data_tot, axis=1) / np.sqrt(n_repeats)

color = 'yellowgreen'
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
ax.errorbar(range(len(scores)), scores, yerr=stds, fmt='o', color=color, ecolor=color, capsize=3, markersize=3, alpha=1.)
ax.plot(range(len(scores)), scores, 'o', color=color, markersize=3, alpha=1)
ax.axhline(0, ls='--', lw=1, color='lightgray')  # zero line = no change vs. baseline
ax.set_xlim((-1, len(scores)))
ax.set_ylim((-0.0035, 0.004))
ax.set_xlabel('ID in Bayesian optimisation')
ax.set_ylabel('Variation in Gini after augmentation')
# Raw string: '\%' is not a valid Python escape and triggers a warning in a
# normal string literal; the rendered text is unchanged.
ax.set_title(r'Augment by $5\%$')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.grid(axis='x')
fig.savefig('../outputs/figures/simulation_tot.pdf', dpi=300, bbox_inches='tight');

In [None]:
# Saved synthesizers to compare.
# NOTE(review): the file indices 4 and 7 are hard-coded; presumably they are
# the best trials of the two tuning runs above -- confirm after re-tuning.
tvae_hard = TVAE.load('../outputs/synthesizers/tuning_sim/4.pkl')
tvae_tot = TVAE.load('../outputs/synthesizers/tuning_tot_sim/7.pkl')

def get_score_augment(mod, n_synth=None):
    """Score change after augmenting the training split with synthetic rows.

    Parameters
    ----------
    mod : loaded synthesizer
        Object exposing ``sample(n)`` that returns a frame with a 'target'
        column.
    n_synth : int, optional
        Number of synthetic rows to draw. When omitted, falls back to the
        notebook-level variable ``amount`` (the previous implicit behaviour),
        so existing one-argument calls keep working.

    Returns
    -------
    The value of ``get_xgboost_score`` for the augmented training data.
    """
    if n_synth is None:
        n_synth = amount  # legacy fallback: loop variable read from notebook scope
    synth = mod.sample(n_synth)
    X_synth = synth.drop('target', axis=1).values
    y_synth = np.array(synth['target'])
    X_train_aug = np.vstack((X_train, X_synth))
    y_train_aug = np.concatenate((y_train, y_synth))
    return get_xgboost_score(X_train_aug, y_train_aug)

agg_scores_tvae_hard = []
agg_scores_tvae_tot = []
# Augmentation sizes: 2%, 4%, 6%, 8% and 10% of the training split.
two_perc = len(y_train) // 50
augment_amounts = [two_perc * multiple for multiple in range(1, 6)]
# NOTE(review): `tqdm` is not imported in any visible cell; presumably it
# arrives through one of the star imports -- confirm or import it explicitly.
for amount in augment_amounts:
    scores_tvae_hard = []
    scores_tvae_tot = []
    for _ in tqdm(range(30)): # Repeat n times to mitigate randomness
        # The amount is passed explicitly instead of being picked up from the
        # enclosing scope inside the function.
        scores_tvae_hard.append(get_score_augment(tvae_hard, amount))
        scores_tvae_tot.append(get_score_augment(tvae_tot, amount))
    agg_scores_tvae_hard.append(scores_tvae_hard)
    agg_scores_tvae_tot.append(scores_tvae_tot)

# Persist one list of repeated scores per augmentation size.
with open('../outputs/results/aug_tvae_hard_sim.pkl', 'wb') as f:
    pickle.dump(agg_scores_tvae_hard, f)
with open('../outputs/results/aug_tvae_tot_sim.pkl', 'wb') as f:
    pickle.dump(agg_scores_tvae_tot, f)

In [None]:
# Load the augmentation sweep results: one row per augmentation size, one
# column per repeat. These pickles are produced by this notebook; do not
# point this at untrusted files.
with open('../outputs/results/aug_tvae_hard_sim.pkl', 'rb') as f:
    aug_tvae_hard = np.array(pickle.load(f))
with open('../outputs/results/aug_tvae_tot_sim.pkl', 'rb') as f:
    aug_tvae_tot = np.array(pickle.load(f))

# Mean and 1.96 * standard-error half-width per augmentation size. The number
# of repeats is read from each array instead of being hard-coded.
means_tvae_hard = aug_tvae_hard.mean(axis=1)
stds_tvae_hard = 1.96 * aug_tvae_hard.std(axis=1) / np.sqrt(aug_tvae_hard.shape[1])
means_tvae_tot = aug_tvae_tot.mean(axis=1)
stds_tvae_tot = 1.96 * aug_tvae_tot.std(axis=1) / np.sqrt(aug_tvae_tot.shape[1])

# Note: this changes the global matplotlib style, so figures drawn after this
# cell (including re-runs of earlier cells) will have a grid.
plt.rc('axes', grid=True)
plt.rcParams['grid.color'] = (0.5, 0.5, 0.5, 0.2)

# x positions in percent; the leading 0 is the un-augmented baseline, which
# by construction has zero variation and zero error.
perc_aug = [0, 2, 4, 6, 8, 10]
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
ax.errorbar(perc_aug, [0] + list(means_tvae_hard), yerr=[0] + list(stds_tvae_hard), fmt='-o', capsize=3, markersize=4, alpha=1., label='Best TVAE - Hard',
            color='yellowgreen', ecolor='yellowgreen')
ax.errorbar(perc_aug, [0] + list(means_tvae_tot), yerr=[0] + list(stds_tvae_tot), fmt='-o', capsize=3, markersize=4, alpha=1., label='Best TVAE',
            color='sandybrown', ecolor='sandybrown')
ax.set_xticks(perc_aug)
ax.set_xlabel('Percentage of data generated')
ax.set_ylabel('Variation in Gini after augmentation')
ax.legend()
fig.savefig('../outputs/figures/simulation_tvae_vs_tvae_hard.pdf', dpi=300, bbox_inches='tight');