# Overview

In this notebook, we generate synthetic low-rank outcome data and test different ways of learning the utility model:
- fit indepdendent GPs to each outcome
- learn a PCA decomposition, fit independent GPs to the top principal components that explain most variance
- learn a PCA decomposition, select top PCs that explain the utility, fit independent GPs to them

We look at a few test cases here:
- outcome dimensionality = 10, rank = 1; linear utility function is different from (but not orthogonal to) the outcome axis
- outcome dimensionality = 20, rank = 1; linear utility function points in the same direction as outcome axis
- outcome dimensionality = 20, rank = 2; utility depends on both axes

For each of these test cases, we try the following different embeddings:
- the ground truth low-rank subspace
- the low-rank subspace learned by doing PCA
- the low-rank subspace given by the utility function
- (later) the low=rank subspace learned by doing PCR

Test case 0 helps us test the hypothesis: learning utility-informed embedding is better than utility-agnostic embedding, especially when utility coefficient is not well-aligned with major axes of variation in the outcomes

Test case 1 helps us answer the question: Does PCA do better than Indep? Is it because it's easier to learn the PCA matrix than the independent GP hyperparameters?

Test case 2 helps us compare PCA and PCR. We expect PCR to do better than PCA in this case.

In [2]:
%load_ext autoreload
%autoreload 2

import os, sys
# file_dir = os.path.dirname(__file__)
# sys.path.append(file_dir)
sys.path.append('/home/yz685/low_rank_BOPE')
sys.path.append('/home/yz685/low_rank_BOPE/low_rank_BOPE')
import warnings
import math
import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import torch
from torch import Tensor

from test_problems.synthetic_problem import generate_principal_axes, PCATestProblem
from src.transforms import (
    generate_random_projection,
    InputCenter,
    LinearProjectionInputTransform,
    LinearProjectionOutcomeTransform,
    PCAInputTransform,
    PCAOutcomeTransform,
    SubsetOutcomeTransform,
)
from src.pref_learning_helpers import gen_initial_real_data, fit_pref_model
from src.diagnostics import check_util_model_fit
from src.models import make_modified_kernel

# import botorch, gpytorch functions
from botorch import fit_gpytorch_model, fit_gpytorch_mll
from botorch.optim.fit import fit_gpytorch_scipy
from botorch.optim.utils import _filter_kwargs
from botorch.utils.sampling import draw_sobol_samples
from botorch.models import SingleTaskGP
from botorch.models.transforms.outcome import ChainedOutcomeTransform, Standardize
from botorch.models.transforms.input import (
    ChainedInputTransform,
    FilterFeatures,
    Normalize,
)

from gpytorch.kernels import MaternKernel
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.priors import GammaPrior

warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tkwargs = {
    "dtype": torch.double,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# Helper functions and classes

In [4]:
def make_problem(**kwargs):
    """
    Create test problem with specified low-rank structure.
    """

    # default config
    config = {
        "input_dim": 1,
        # "outcome_dim": 20,
        "latent_dim": 1,
        "PC_noise_level": 0,
        "noise_std": 0.1,
        "num_initial_samples": 20,
        "num_sample_points": 30,
        # "ground_truth_principal_axes": torch.Tensor([1]*20),
        "PC_lengthscales": [0.1],
        "PC_scaling_factors": [2],
        "variance_explained_threshold": 0.99,
    }

    # overwrite config settings with kwargs
    for key, val in kwargs.items():
        config[key] = val

    torch.manual_seed(1234)
    initial_X = torch.randn((config["num_initial_samples"], config["input_dim"]), **tkwargs)

    obj_indices = list(range(config["outcome_dim"]))
    cons_indices = []

    if len(config['ground_truth_principal_axes'].shape) == 1:
        config['ground_truth_principal_axes'] = config['ground_truth_principal_axes'].unsqueeze(0)

    problem = PCATestProblem(
        opt_config=(obj_indices, cons_indices),
        initial_X=initial_X,
        bounds=torch.Tensor([[0, 1]] * config["input_dim"]),
        ground_truth_principal_axes=config['ground_truth_principal_axes'],
        noise_std=config["noise_std"],
        PC_lengthscales=Tensor(config["PC_lengthscales"]),
        PC_scaling_factors=Tensor(config["PC_scaling_factors"]),
        dtype=torch.double,
    )

    return problem

In [5]:
class LinearUtil(torch.nn.Module):
    """ 
    Create linear utility function modulew with specified coefficient beta.
    f(y) = beta_1 * y_1 + ... + beta_k * y_k
    """
    def __init__(self, beta: torch.Tensor):
        """
        Args:
            beta: size `output_dim` tensor
        """
        super().__init__()
        self.register_buffer("beta", beta)

    def calc_raw_util_per_dim(self, Y):
        return Y * self.beta.to(Y)

    def forward(self, Y, X=None):
        return Y @ self.beta.to(Y)

class SumOfSquaresUtil(torch.nn.Module):
    """ 
    Create sum of squares utility function modulew with specified coefficient beta.
    f(y) = beta_1 * y_1^2 + ... + beta_k * y_k^2
    """
    def __init__(self, beta: torch.Tensor):
        """
        Args:
            beta: size `output_dim` tensor
        """
        super().__init__()
        self.register_buffer("beta", beta)

    def calc_raw_util_per_dim(self, Y):
        return torch.square(Y) * self.beta.to(Y)

    def forward(self, Y, X=None):
        return torch.square(Y) @ self.beta.to(Y)

In [6]:
def RMSE_helper(num_test_points, models_dict, outcome_idxs):
    """ 
    (Not used in this NB) Compute root mean squared error of GP posterior predictions.
    """

    test_X = torch.linspace(0, 1, num_test_points).unsqueeze(1).to(**tkwargs)
    test_Y = problem.eval_metrics_true(test_X).detach()

    for outcome_idx in outcome_idxs:

        for model_name, model in models_dict.items():

            test_Y_posterior_mean = model.posterior(test_X).mean[:, outcome_idx]
           
            print(
                model_name, 
                'outcome', outcome_idx, 
                'RMSE', torch.sqrt(torch.mean(
                    torch.square(test_Y_posterior_mean - test_Y[:,outcome_idx])
                )).item()
            )



In [7]:
def fit_util_models(train_Y, comps, util_vals, input_transform, covar_module):
    """ 
    Fit utility model given (1) comparisons (2) ground truth utility values.
    Return the two fitted GPs.
    (In practice, fitting model (2) is usually not feasible.)
    """
    util_model_rel = fit_pref_model(
        train_Y, 
        comps, 
        input_transform = input_transform, 
        covar_module = covar_module
    )
    util_model_abs = SingleTaskGP(
        train_Y, 
        util_vals.unsqueeze(1), 
        input_transform = input_transform, 
        covar_module = covar_module
    )
    mll = ExactMarginalLogLikelihood(util_model_abs.likelihood, util_model_abs)
    fit_gpytorch_mll(mll)

    return util_model_rel, util_model_abs

def fit_models_helper(
    train_Y, comps, util_vals, method, axes_dict = None, 
    modify_kernel = False, a=0.2, b=5
):
    """ 
    Fit utility models for different methods (st, pca, pcr) and potentially a 
    set of different axes in `axes_dict`. If specified, also modify the hyperpriors 
    of the input covar_module based on the supplied parameter value `a` and `b`.
    Return the fitted models in a dictionary. The suffix '_rel' means the model
    is fit on pairwise comparisons; the suffix '_abs' means the model is fit on 
    ground truth utility values.
    """
    input_transform = None
    models_dict = {}
    if method in ("pca", "pcr"):
        for axes_label, axes in axes_dict.items():
            latent_dim = axes.shape[0]
            input_transform = ChainedInputTransform(
                        **{
                            "center": InputCenter(train_Y.shape[-1]),
                            "pca": PCAInputTransform(axes.to(torch.double)),
                        }
                    )
            covar_module = make_modified_kernel(
                ard_num_dims=latent_dim, a=a, b=b) if modify_kernel else None

            util_model_rel, util_model_abs = fit_util_models(
                train_Y, comps, util_vals, input_transform, covar_module)
            models_dict[method+'_'+axes_label+'_rel'] = util_model_rel
            models_dict[method+'_'+axes_label+'_abs'] = util_model_abs
    
    elif method == "st":
        covar_module = make_modified_kernel(ard_num_dims=train_Y.shape[-1]) if modify_kernel else None
        input_transform = None
        util_model_rel, util_model_abs = fit_util_models(
                train_Y, comps, util_vals, input_transform, covar_module)
        models_dict[method+'_rel'] = util_model_rel
        models_dict[method+'_abs'] = util_model_abs
    
    return models_dict

In [9]:
def check_util_model_fit_wrapper(problem, util_func, models_dict, seed = 0, n_test = 1000):
    """ 
    Check the accuracy of preference prediction of the models in `models_dict` 
    on a separate test set. Return the accuracy in a dictionary. 
    """
    torch.manual_seed(seed)
    acc_dict = {}
    for model_key, model in models_dict.items():
        print(f'checking fit of {model_key}')
        acc = check_util_model_fit(
            pref_model = model,
            problem = problem,
            util_func = util_func,
            n_test = n_test,
            batch_eval = True,
            return_util_vals = False
        )
        acc_dict[model_key] = acc
    
    return acc_dict

# Test case 0: outcome dimensionality = 10, rank = 1, axis = (1,0,...,0), linear utility coeff = (1,1,0,...,0)

In [243]:
ground_truth_principal_axes = torch.Tensor([1]+[0]*9)

problem = make_problem(
    outcome_dim = 10,
    ground_truth_principal_axes = ground_truth_principal_axes
)

torch.manual_seed(123)
beta = torch.tensor([1,1]+[0]*8, **tkwargs)
util_func = LinearUtil(beta=beta)

train_X, train_Y, util_vals, comps = gen_initial_real_data(n=100, problem=problem, util_func=util_func)
print('train_X, train_Y shape: ', train_X.shape, train_Y.shape)


train_X, train_Y shape:  torch.Size([100, 1]) torch.Size([100, 10])


In [244]:
st_model = SingleTaskGP(
    train_X,
    train_Y,
    covar_module = MaternKernel(lengthscale_prior = GammaPrior(3,6))
)
st_mll = ExactMarginalLogLikelihood(st_model.likelihood, st_model)
fit_gpytorch_model(st_mll)

pca_model = SingleTaskGP(
    train_X,
    train_Y,
    outcome_transform=ChainedOutcomeTransform(
        **{
            "standardize": Standardize(10, min_stdv=100),
            "pca": PCAOutcomeTransform(num_axes=1),
        }
    ),
    likelihood=GaussianLikelihood(noise_prior=GammaPrior(0.9, 10)),
)
pca_mll = ExactMarginalLogLikelihood(pca_model.likelihood, pca_model)

fit_gpytorch_mll(pca_mll)

print(pca_model.outcome_transform['pca'].axes_learned)

tensor([[ 9.9980e-01, -2.5584e-03, -6.8475e-03, -2.4620e-03, -6.7253e-03,
         -7.7932e-04, -2.3186e-03, -4.3601e-03,  1.6223e-02, -1.4621e-03]],
       dtype=torch.float64)


In [256]:
# create dict of axes

pca_axes_dict = {
    "learned": pca_model.outcome_transform['pca'].axes_learned,
    "true": ground_truth_principal_axes.unsqueeze(0),
    "oracle": beta.unsqueeze(0) # utility coefficient
}

print(pca_axes_dict)

{'learned': tensor([[ 9.9980e-01, -2.5584e-03, -6.8475e-03, -2.4620e-03, -6.7253e-03,
         -7.7932e-04, -2.3186e-03, -4.3601e-03,  1.6223e-02, -1.4621e-03]],
       dtype=torch.float64), 'true': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), 'oracle': tensor([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=torch.float64)}


In [254]:
pca_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="pca", 
    axes_dict = pca_axes_dict, 
    modify_kernel = True
)

print(pca_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, pca_models_dict)

dict_keys(['pca_learned_rel', 'pca_learned_abs', 'pca_true_rel', 'pca_true_abs', 'pca_oracle_rel', 'pca_oracle_abs'])
checking fit of pca_learned_rel
checking fit of pca_learned_abs
checking fit of pca_true_rel
checking fit of pca_true_abs
checking fit of pca_oracle_rel
checking fit of pca_oracle_abs


{'pca_learned_rel': 0.9800000190734863,
 'pca_learned_abs': 0.9860000014305115,
 'pca_true_rel': 0.972000002861023,
 'pca_true_abs': 0.9760000109672546,
 'pca_oracle_rel': 1.0,
 'pca_oracle_abs': 1.0}

In [255]:
st_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="st"
)

print(st_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, st_models_dict)

dict_keys(['st_rel', 'st_abs'])
checking fit of st_rel
checking fit of st_abs


{'st_rel': 0.9380000233650208, 'st_abs': 0.9980000257492065}

# Test case 1: outcome dimensionality = 20, rank = 1, axis = linear utility coeff = (1,1,...,1)

In [40]:
ground_truth_principal_axes = torch.Tensor([1]*20)

problem = make_problem(
    outcome_dim = 20,
    ground_truth_principal_axes = ground_truth_principal_axes,
)

torch.manual_seed(123)
beta = torch.tensor([1]*20, **tkwargs)
util_func = LinearUtil(beta=beta)

train_X, train_Y, util_vals, comps = gen_initial_real_data(n=150, problem=problem, util_func=util_func)


In [41]:
st_model = SingleTaskGP(
    train_X,
    train_Y,
    covar_module = MaternKernel(lengthscale_prior = GammaPrior(3,6))
)
st_mll = ExactMarginalLogLikelihood(st_model.likelihood, st_model)
fit_gpytorch_model(st_mll)

ExactMarginalLogLikelihood(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (noise_prior): GammaPrior()
      (raw_noise_constraint): GreaterThan(1.000E-04)
    )
  )
  (model): SingleTaskGP(
    (likelihood): GaussianLikelihood(
      (noise_covar): HomoskedasticNoise(
        (noise_prior): GammaPrior()
        (raw_noise_constraint): GreaterThan(1.000E-04)
      )
    )
    (mean_module): ConstantMean()
    (covar_module): MaternKernel(
      (lengthscale_prior): GammaPrior()
      (raw_lengthscale_constraint): Positive()
    )
  )
)

In [42]:
pca_model = SingleTaskGP(
    train_X,
    train_Y,
    outcome_transform=ChainedOutcomeTransform(
        **{
            "standardize": Standardize(20, min_stdv=100),
            "pca": PCAOutcomeTransform(num_axes=1),
        }
    ),
    likelihood=GaussianLikelihood(noise_prior=GammaPrior(0.9, 10)),
)
pca_mll = ExactMarginalLogLikelihood(pca_model.likelihood, pca_model)

fit_gpytorch_mll(pca_mll)

print(pca_model.outcome_transform['pca'].axes_learned)

tensor([[0.2245, 0.2223, 0.2241, 0.2232, 0.2243, 0.2223, 0.2223, 0.2232, 0.2235,
         0.2245, 0.2240, 0.2218, 0.2223, 0.2246, 0.2233, 0.2239, 0.2233, 0.2234,
         0.2250, 0.2264]], dtype=torch.float64)


In [43]:
pca_axes_dict = {
    "learned": pca_model.outcome_transform['pca'].axes_learned,
    "true": ground_truth_principal_axes.unsqueeze(0),
    "oracle": beta.unsqueeze(0) # utility coefficient
}
# TODO: later, run regression and add PCR_axes_dict for the rank-2 test case

In [45]:
pca_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="pca", 
    axes_dict = pca_axes_dict, 
    modify_kernel = True
)

print(pca_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, pca_models_dict, n_test=30)

dict_keys(['pca_learned_rel', 'pca_learned_abs', 'pca_true_rel', 'pca_true_abs', 'pca_oracle_rel', 'pca_oracle_abs'])
checking fit of pca_learned_rel
checking fit of pca_learned_abs
checking fit of pca_true_rel
checking fit of pca_true_abs
checking fit of pca_oracle_rel
checking fit of pca_oracle_abs


{'pca_learned_rel': 1.0,
 'pca_learned_abs': 0.8666666746139526,
 'pca_true_rel': 0.8666666746139526,
 'pca_true_abs': 1.0,
 'pca_oracle_rel': 1.0,
 'pca_oracle_abs': 0.8666666746139526}

In [46]:
st_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="st"
)

print(st_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, st_models_dict)

dict_keys(['st_rel', 'st_abs'])
checking fit of st_rel
checking fit of st_abs


{'st_rel': 0.9300000071525574, 'st_abs': 0.9959999918937683}

In [None]:
# Next TODO:

# look at nonlinear utility
# math derivation
# PCR

# Test case 2: outcome dimensionality = 20, rank = 2, axis_1 = [1]x10+[0]x10, axis_2 = [0]x10+[1]x10, linear util coeff = [1]x20

In [47]:
ground_truth_principal_axes = torch.Tensor([[1]*10+[0]*10, [0]*10+[1]*10])

problem = make_problem(
    outcome_dim = 20,
    ground_truth_principal_axes = ground_truth_principal_axes,
    noise_std = 1,
    PC_lengthscales = [0.1, 0.1],
    PC_scaling_factors = [2, 0.2]
)

torch.manual_seed(123)
beta = torch.tensor([1]*20, **tkwargs)
util_func = LinearUtil(beta=beta)

train_X, train_Y, util_vals, comps = gen_initial_real_data(n=200, problem=problem, util_func=util_func)

In [48]:
st_model_low = SingleTaskGP(
    train_X,
    train_Y,
    covar_module = MaternKernel(lengthscale_prior = GammaPrior(3,6))
)
st_mll_low = ExactMarginalLogLikelihood(st_model_low.likelihood, st_model_low)
fit_gpytorch_mll(st_mll_low)

ExactMarginalLogLikelihood(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (noise_prior): GammaPrior()
      (raw_noise_constraint): GreaterThan(1.000E-04)
    )
  )
  (model): SingleTaskGP(
    (likelihood): GaussianLikelihood(
      (noise_covar): HomoskedasticNoise(
        (noise_prior): GammaPrior()
        (raw_noise_constraint): GreaterThan(1.000E-04)
      )
    )
    (mean_module): ConstantMean()
    (covar_module): MaternKernel(
      (lengthscale_prior): GammaPrior()
      (raw_lengthscale_constraint): Positive()
    )
  )
)

In [49]:
options = {"maxiter": 1000}

pca_model = SingleTaskGP(
    train_X,
    train_Y,
    outcome_transform=ChainedOutcomeTransform(
        **{
            "standardize": Standardize(20, min_stdv=100),
            "pca": PCAOutcomeTransform(num_axes=1),
        }
    ),
    likelihood=GaussianLikelihood(noise_prior=GammaPrior(0.9, 10)),
)
pca_mll = ExactMarginalLogLikelihood(pca_model.likelihood, pca_model)

fit_gpytorch_mll(pca_mll)

print(pca_model.outcome_transform['pca'].axes_learned)

tensor([[ 0.3200,  0.3147,  0.3040,  0.2990,  0.3065,  0.3177,  0.3301,  0.3432,
          0.3067,  0.3132,  0.0366,  0.0190,  0.0253,  0.0167,  0.0158,  0.0032,
         -0.0013, -0.0071,  0.0004,  0.0122]], dtype=torch.float64)


In [50]:
pca_axes_dict = {
    "learned": pca_model.outcome_transform['pca'].axes_learned,
    "true": ground_truth_principal_axes,
    "oracle": beta.unsqueeze(0) # utility coefficient
}

In [51]:
pca_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="pca", 
    axes_dict = pca_axes_dict, 
    modify_kernel = True
)

print(pca_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, pca_models_dict, n_test = 100)

# nonPSD because you'd get the same util for data points that differ only in the outcome dimensions that don't matter

dict_keys(['pca_learned_rel', 'pca_learned_abs', 'pca_true_rel', 'pca_true_abs', 'pca_oracle_rel', 'pca_oracle_abs'])
checking fit of pca_learned_rel
checking fit of pca_learned_abs
checking fit of pca_true_rel
checking fit of pca_true_abs
checking fit of pca_oracle_rel
checking fit of pca_oracle_abs


{'pca_learned_rel': 0.9399999976158142,
 'pca_learned_abs': 0.9599999785423279,
 'pca_true_rel': 0.9599999785423279,
 'pca_true_abs': 1.0,
 'pca_oracle_rel': 1.0,
 'pca_oracle_abs': 1.0}

In [52]:
st_models_dict = fit_models_helper(
    train_Y, comps, util_vals, 
    method="st"
)

print(st_models_dict.keys())

check_util_model_fit_wrapper(problem, util_func, st_models_dict)

dict_keys(['st_rel', 'st_abs'])
checking fit of st_rel
checking fit of st_abs


{'st_rel': 0.6000000238418579, 'st_abs': 0.765999972820282}