In [14]:
import numpy as np
import pandas as pd
import sys
import re
import os
from sklearn.model_selection import LeaveOneOut
from tqdm import tqdm
import scipy.stats as st
import multiprocessing as mp
from scipy.optimize import minimize

automl_path = '../../../oboe/automl/'
sys.path.append(automl_path)

import linalg
import convex_opt

In [19]:
def indloc(indices, ind):
    """Return the position (along the first axis) of the first entry of
    `indices` that equals `ind`.

    Raises:
        IndexError: if no entry of `indices` equals `ind`.
    """
    is_match = np.array(indices) == ind
    hits_on_first_axis = np.where(is_match)[0]
    return hits_on_first_axis[0]

# Leave-one-out splitter used by the experiment loops below.
loo = LeaveOneOut()

# Default matrices shipped with oboe; the first CSV column is used as the row index.
errmtx_df = pd.read_csv(os.path.join(automl_path, 'defaults/error_matrix.csv'), index_col=0, header=0)
errmtx = errmtx_df.values
runtime_df = pd.read_csv(os.path.join(automl_path, 'defaults/runtime_matrix.csv'), index_col=0, header=0)
runtime = runtime_df.values
dataset_sizes_df = pd.read_csv(os.path.join(automl_path, 'defaults/dataset_sizes.csv'), index_col=0, header=0)

# Row labels present in both the error matrix and the meta-feature table.
ind_errmtx = errmtx_df.index.astype(int)
ind_metafeatures = pd.read_csv('../collect_pmf_performance/metafeatures.csv', index_col=0, header=0).index
# Sorted so that the row order of the *_common matrices (and therefore the fold
# order and the index of the saved regrets) does not depend on set iteration order.
ind_common = sorted(set(ind_errmtx).intersection(set(ind_metafeatures)))

# Restrict both matrices to the common rows, in the order given by ind_common.
errmtx_common_df = errmtx_df.loc[ind_common]
errmtx_common = errmtx_common_df.values
runtime_common_df = runtime_df.loc[ind_common]
runtime_common = runtime_common_df.values
# Buffer that the experiment loops fill with imputed rows, one held-out row at a time.
errmtx_pred = np.zeros(errmtx_common.shape)

In [22]:
# Standard PCA (oboe's linalg.pca) to get latent features of datasets and models;
# presumably X_pca describes the datasets and Y_pca the models -- the cells below
# use the columns of Y_pca as the candidate entries of an error-matrix row.
# The third return value is discarded. The meaning of threshold=0.03 is defined in
# oboe's linalg module, which is not visible here.
# NOTE(review): this factorizes the full `errmtx`, not `errmtx_common`, so the
# rows later held out by the leave-one-out loops also contribute to Y_pca --
# confirm this is intended.
X_pca, Y_pca, _ = linalg.pca(errmtx, threshold=0.03)

In [24]:
# Directory (relative to the notebook's working directory) for the regret CSVs.
result_path = 'results'

# exist_ok=True makes re-running the cell a no-op and avoids the race between
# checking for the directory and creating it.
os.makedirs(result_path, exist_ok=True)

Experimental settings (rank range, design criterion, and number of meta-feature-initialized entries):

In [23]:
# The experiment loops sweep rank over range(initial_rank, final_rank + 1).
# NOTE(review): initial_rank < n_init, so `rank - n_init` is negative at the
# first rank of the meta-feature experiment.
initial_rank = 4
final_rank = 40
# True: sample the entries with the largest v_opt; False: sample entries with v_opt > 0.9.
pick_largest_v_opt = True
# Scalarization passed to number_of_entries_solve; that function handles 'D', 'A' and 'E'.
scalarization = 'D'
n_init = 5 # number of entries inferred by nearest neighbors

# Number-of-entries-constrained experiment design, without meta-features

In [26]:
def number_of_entries_solve(N, Y, scalarization='D', maxiter=50):
    """Solve the relaxed, number-constrained experiment-design problem.

    Minimizes a scalarization of the matrix ``Y diag(v) Y^T`` over ``v`` with
    ``0 <= v_i <= 1`` and ``sum(v) <= N``, using SLSQP started from ``v = 0.5``.

    Args:
        N: upper bound on ``sum(v)``.
        Y: 2-D array with one column per candidate entry.
        scalarization: objective to minimize --
            'D': negative log-determinant of ``Y diag(v) Y^T``;
            'A': trace of its pseudo-inverse;
            'E': spectral norm of its pseudo-inverse.
        maxiter: iteration cap handed to the solver. The default of 50 reflects
            the observation that the solver usually converges within 50
            iterations on this problem.

    Returns:
        1-D array of length ``Y.shape[1]``: the ``v`` at which the solver
        stopped. Convergence is not checked.

    Raises:
        ValueError: if `scalarization` is not 'D', 'A' or 'E'.
    """
    Y = np.asarray(Y)
    n = Y.shape[1]

    def information_matrix(v):
        # Same product as Y @ np.diag(v) @ Y.T, without building the n-by-n diagonal.
        return (Y * v) @ Y.T

    def d_objective(v):
        sign, log_det = np.linalg.slogdet(information_matrix(v))
        # A non-positive determinant has no log; report it as infinitely bad
        # rather than nan (sign == 0) or a sign-flipped value (sign < 0).
        return -log_det if sign > 0 else np.inf

    def a_objective(v):
        return np.trace(np.linalg.pinv(information_matrix(v)))

    def e_objective(v):
        return np.linalg.norm(np.linalg.pinv(information_matrix(v)), ord=2)

    objectives = {'D': d_objective, 'A': a_objective, 'E': e_objective}
    if scalarization not in objectives:
        raise ValueError(
            "scalarization must be one of 'D', 'A' or 'E'; got {!r}".format(scalarization))
    objective = objectives[scalarization]

    def constraint(v):
        # 'ineq' constraints are satisfied when the function is non-negative.
        return N - np.sum(v)

    v0 = np.full((n, ), 0.5)
    constraints = {'type': 'ineq', 'fun': constraint}
    v_opt = minimize(objective, v0, method='SLSQP', bounds=[(0, 1)] * n,
                     options={'maxiter': maxiter}, constraints=constraints)
    return v_opt.x

In [None]:
pick_largest_v_opt = True

ranks = range(initial_rank, final_rank + 1)
rank_columns = ['rank {}'.format(rank) for rank in ranks]

# The entries to sample depend only on the rank and on Y_pca, not on the held-out
# row, so each design problem is solved once here instead of once per fold.
to_sample_by_rank = {}
for rank in ranks:
    v_opt = number_of_entries_solve(rank, Y_pca, scalarization)
    if pick_largest_v_opt:
        to_sample_by_rank[rank] = np.argsort(-v_opt)[:rank]
    else:
        to_sample_by_rank[rank] = np.where(v_opt > 0.9)[0]

errors_encountered_all = []
# One list of regrets (one value per rank) and one row label per held-out row.
regret_rows = []
regret_index = []

# NOTE(review): train_index is unused -- linalg.impute receives all of
# errmtx_common, including the held-out row; confirm this is intended.
for train_index, test_index in tqdm(loo.split(errmtx_common), total=errmtx_common.shape[0]):
    test_row = test_index[0]

    regret = []
    # Observed entries of the held-out row; kept across ranks, so it accumulates.
    new_row = np.zeros((1, errmtx_common.shape[1]))

    # true best
    y_best_true = min(errmtx_common[test_row, :])
    # predicted best; initialize to be 1 (max achievable error)
    y_best_pred = 1
    for rank in ranks:
        to_sample = to_sample_by_rank[rank]
        new_row[:, to_sample] = errmtx_common[test_index, to_sample]
        errmtx_pred[test_index, :] = linalg.impute(errmtx_common, new_row, to_sample, rank=rank)
        # predicted best; only update when the new best is better (i.e., has lower error).
        # Candidates: the best sampled entry, and the true error of the entry
        # that the imputed row predicts to be best.
        y_best_pred = min(y_best_pred,
                          min(errmtx_common[test_row, to_sample]),
                          errmtx_common[test_row, np.argmin(errmtx_pred[test_row, :])])
        # collect regret
        regret.append(y_best_pred - y_best_true)
    regret_rows.append(regret)
    regret_index.append(ind_common[test_row])

# Build the frame once: DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row is quadratic.
regret_all = pd.DataFrame(regret_rows, index=regret_index, columns=rank_columns)

save results:

In [22]:
# regret_all.to_csv(os.path.join(result_path, 'regrets_oboe_ed_incremental_number_constrained.csv'), index=True, header=True)

# Number-of-entries-constrained experiment design, with meta-features

In [None]:
fn_data_feats = 'metafeatures.csv'

# Meta-feature table; the first CSV column becomes the row index.
metafeatures_path = os.path.join('../collect_pmf_performance/', fn_data_feats)
metafeatures_df = pd.read_csv(metafeatures_path, header=0, index_col=0)

def init_l1(Ytrain, Ftrain, ftest, n_init=5):
    """Pick `n_init` rows of `Ytrain` that rank highest on the nearest columns.

    Args:
        Ytrain: 2-D array of scores; larger values rank higher. Its columns
            line up with the rows of `Ftrain`.
        Ftrain: 2-D array with one feature row per column of `Ytrain`.
        ftest: feature vector compared against every row of `Ftrain`
            (broadcast against `Ftrain`).
        n_init: used both as the number of nearest neighbours and as the
            number of row indices returned.

    Returns:
        Array of at most `n_init` row indices of `Ytrain`, highest average
        rank first. Rows with a NaN in any of the nearest columns are excluded.
    """
    # L1 distance from every row of Ftrain to ftest; keep the n_init closest.
    l1_dist = np.abs(Ftrain - ftest).sum(axis=1)
    nearest = np.argsort(l1_dist)[:n_init]

    # A NaN anywhere in a row makes that row's sum NaN.
    has_nan = np.isnan(Ytrain[:, nearest].sum(axis=1))
    complete_rows = np.where(~has_nan)[0]

    # Rank the complete rows within each nearest column, then average per row.
    scores = Ytrain[complete_rows][:, nearest]
    per_column_ranks = np.apply_along_axis(st.rankdata, 0, scores)
    mean_rank = per_column_ranks.mean(axis=1)

    best_first = np.argsort(mean_rank)[::-1]
    return complete_rows[best_first][:n_init]

ranks = range(initial_rank, final_rank + 1)
rank_columns = ['rank {}'.format(rank) for rank in ranks]

# Entries chosen by experiment design, per rank. They depend only on the rank and
# on Y_pca, so they are computed once here instead of once per fold.
# n_init of the `rank` entries come from meta-features; the design supplies the
# rest. The count is clamped at 0: a negative rank - n_init would give the solver
# an infeasible budget, and the slice [:negative] would select all but the last
# few entries instead of none.
design_by_rank = {}
for rank in ranks:
    n_design = max(rank - n_init, 0)
    if n_design == 0:
        design_by_rank[rank] = np.array([], dtype=int)
        continue
    v_opt = number_of_entries_solve(n_design, Y_pca, scalarization)
    if pick_largest_v_opt:
        design_by_rank[rank] = np.argsort(-v_opt)[:n_design]
    else:
        design_by_rank[rank] = np.where(v_opt > 0.9)[0]

errors_encountered_all = []
# One list of regrets (one value per rank) and one row label per held-out row.
regret_rows = []
regret_index = []

for train_index, test_index in tqdm(loo.split(errmtx_common), total=errmtx_common.shape[0]):
    test_row = test_index[0]

    # Meta-features of the training rows / the held-out row, and the negated
    # training errors (so that larger is better) transposed to entries x rows.
    Ftrain = metafeatures_df.loc[errmtx_common_df.index[train_index], :].values
    Ftest = metafeatures_df.loc[errmtx_common_df.index[test_index], :].values
    Ytrain = -errmtx_common_df.loc[errmtx_common_df.index[train_index], :].T.values
    ix_init = init_l1(Ytrain, Ftrain, Ftest, n_init=n_init).tolist()

    regret = []
    # Observed entries of the held-out row; kept across ranks, so it accumulates.
    new_row = np.zeros((1, errmtx_common.shape[1]))

    # true best
    y_best_true = min(errmtx_common[test_row, :])

    # predicted best
    y_best_pred = min(errmtx_common[test_row, ix_init])

    for rank in ranks:
        # Union of the designed entries and the meta-feature entries, sorted so
        # the order passed to linalg.impute is deterministic.
        to_sample = sorted(set(design_by_rank[rank].tolist()).union(ix_init))
        new_row[:, to_sample] = errmtx_common[test_index, to_sample]
        errmtx_pred[test_index, :] = linalg.impute(errmtx_common, new_row, to_sample, rank=rank)
        # predicted best; only update when the new best is better (i.e., has lower error)
        y_best_pred = min(y_best_pred,
                          min(errmtx_common[test_row, to_sample]),
                          errmtx_common[test_row, np.argmin(errmtx_pred[test_row, :])])
        # collect regret
        regret.append(y_best_pred - y_best_true)
    regret_rows.append(regret)
    regret_index.append(ind_common[test_row])

# Build the frame once: DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row is quadratic.
regret_all_with_mf = pd.DataFrame(regret_rows, index=regret_index, columns=rank_columns)


save results:

In [None]:
# regret_all_with_mf.to_csv(os.path.join(result_path, 'regrets_oboe_ed_incremental_with_mf_number_constrained.csv'), index=True, header=True)