# Learning Pareto sets in high dimensions: How can regularization help?

In [None]:
%load_ext autoreload
%autoreload 2
from multiple_regression import *
from fairness import *
from plotting import *

## Multiple linear regression

### The effect of unlabeled data

In [None]:
# Sweep the multiple-regression experiment over ground-truth sparsity levels
# and unlabeled sample sizes; the result is kept in `errors_matrix`.
sparsity_levels = np.arange(5, 50, 5)
unlabeled_sizes = np.arange(15, 55, 5)
errors_matrix = run_multiple_regression_experiment_unlabeled_sparsity(
    random_seed=42,                  # fix random seed
    num_experiments=10,              # number of experiments to run
    d=50,                            # dimension
    s_array=sparsity_levels,         # sparsity of ground truths
    n=15,                            # number of labeled samples
    N_array=unlabeled_sizes,         # number of unlabeled samples
    noise_var=0.5,                   # variance of the noise
    # Stabilizes the condition number of the random covariance matrix
    # (A^T A plus a stabilizer term; presumably stab * I -- TODO confirm in
    # multiple_regression). The smaller, the more adversarial for directly
    # regularized methods.
    cov_stabilizer=0.5,
    lambda_fixed=0.5,                # fixed preference weight lambda (first component only)
    # Regularization strengths for the two-stage ensemble: a flat array of
    # ones with two entries per sparsity level.
    ts_ensemble_reg_params=np.array([1] * (2 * len(sparsity_levels))),
)

In [None]:
# Plot the error matrix. The two ranges repeat the N_array and s_array values
# passed to the experiment in the previous cell and must be kept in sync.
plot_sparsity_unlabeled_matrix(
    errors_matrix,
    np.arange(15, 55, 5),  # unlabeled sample sizes (N_array)
    np.arange(5, 50, 5),   # ground-truth sparsity levels (s_array)
    save=False,
)

### Pareto fronts

In [None]:
# Regularization strengths for the directly regularized ensemble: `regs`
# followed by its mirror image gives 10 values, the same count as `lambdas`.
regs = [1, 5, 7, 8, 8]
errors_PF, av_cond_number = run_multiple_regression_experiment_PF(
    random_seed=42,          # fix random seed
    num_experiments=10,      # number of experiments
    d=100,                   # dimension
    s=1,                     # sparsity level
    n=20,                    # number of labeled datapoints
    N=200,                   # number of unlabeled datapoints
    noise_var=0.5,           # variance of the additive noise
    # The covariances are randomly selected. The stabilizer ensures a minimum
    # eigenvalue of the covariance matrix.
    cov_stabilizer=0.2,
    lambdas=np.linspace(0, 1, 10),                         # preference weights of the ensemble
    dr_ensemble_reg_params=np.array(regs + regs[::-1]),    # directly regularized ensemble
    dr_hypernetwork_regfun=lambda x: 1 + 0.5 * x * (1 - x),  # directly regularized hypernetwork
    ts_ensemble_reg_params=np.array([1, 1]),               # two-stage ensemble
    ts_hypernetwork_regfun=lambda x: 0,                    # two-stage hypernetwork
    num_epochs=3000,         # number of training epochs for the hypernetworks
    verbose=False,           # verbosity of training the ensembles
    hn_verbose=False,        # verbosity of training the hypernetworks
)

In [None]:
# Plot the Pareto-front errors from the experiment above; the title is a single
# space and the figure is not saved.
plot_Pareto_fronts(errors_PF, title=' ', save=False)

## Fairness datasets

In [None]:
# Per-dataset experiment configurations. Every configuration has the same keys:
#   name                    name of the dataset; one of 'communities', 'adult', 'hsls', 'enem'
#   protected_index         index of the protected attribute
#   train_size              training set size -> split into labeled and unlabeled
#   test_size               test set size
#   labeled_size            number of labeled training samples; must be <= train_size
#   num_noisy_feats         number of artificial noisy features to add
#   dr_ensemble_reg_params  regularization parameters for the directly regularized ensemble
#   ts_ensemble_reg_params  regularization parameters for the two-stage ensemble
#   eval_metric             one of 'error_rate' and 'square_loss'
communities = dict(
    name="communities",
    protected_index=7,
    train_size=1000,
    test_size=1000,
    labeled_size=150,
    num_noisy_feats=0,
    dr_ensemble_reg_params=[0.04] * 10,
    ts_ensemble_reg_params=[0.04, 0.04],
    eval_metric="square_loss",
)
adult = dict(
    name="adult",
    protected_index=9,
    train_size=10_000,
    test_size=30_000,
    labeled_size=500,
    num_noisy_feats=100,
    dr_ensemble_reg_params=[0.01] * 10,
    ts_ensemble_reg_params=[0.01, 0.01],
    eval_metric="error_rate",
)
hsls = dict(
    name="hsls",
    protected_index=57,
    train_size=5000,
    test_size=5000,
    labeled_size=1000,
    num_noisy_feats=0,
    dr_ensemble_reg_params=[0.01] * 10,
    ts_ensemble_reg_params=[0.01, 0.01],
    eval_metric="error_rate",
)
enem = dict(
    name="enem",
    protected_index=1,
    train_size=10_000,
    test_size=5_000,
    labeled_size=2000,
    num_noisy_feats=0,
    dr_ensemble_reg_params=[0.01] * 10,
    ts_ensemble_reg_params=[0.01, 0.01],
    eval_metric="error_rate",
)

# Datasets handed to the fairness experiment, in this order.
dataset_list = [communities, adult, hsls, enem]

In [None]:
# Run the fairness Pareto-front experiments on the datasets configured in
# `dataset_list`; the result is kept in `errors`.
errors = run_experiments_fairness_PF(
    random_seed=42,                  # fix the random seed
    num_experiments=20,              # number of experiments
    dataset_list=dataset_list,       # datasets included in the experiment (parameters specified above)
    lambdas=np.linspace(0, 1, 10),   # preference weights of the ensembles
    verbose=False,                   # verbosity
)

In [None]:
# Plot the fairness Pareto fronts. The per-dataset settings (xlabels, xmax,
# xmin, ymax) carry one entry per dataset, presumably in the order of
# dataset_list (communities, adult, hsls, enem) -- the first x-label matches
# the 'square_loss' metric configured for communities.
# The original xlabels list had a fifth, surplus 'error rate' entry although
# only four datasets (and four axis limits) exist; it is dropped so all
# per-dataset lists have the same length.
plot_fairness_Pareto_fronts(
    errordict=errors,
    title='fairness',
    xlabels=[
        'square loss on test data',  # communities
        'error rate on test data',   # adult
        'error rate on test data',   # hsls
        'error rate on test data',   # enem
    ],
    save=False,
    xmax=[0.4, 0.23, 0.34, 0.45],
    xmin=[0, 0.2, 0.25, 0.28],
    ymax=[0.2, 0.013, 0.0025, 0.01],
)