In [7]:
import numpy as np
import pandas as pd
import sys
import re
import os
from sklearn.model_selection import LeaveOneOut
from tqdm import tqdm
import scipy.stats as st
import multiprocessing as mp
from scipy.optimize import minimize

automl_path = '../../../oboe/automl/'
sys.path.append(automl_path)

import linalg
import convex_opt

In [8]:
def indloc(indices, ind):
    """Return the position of the first element of `indices` that equals `ind`.

    `indices` is converted to a NumPy array before the comparison. An
    IndexError propagates when no element matches.
    """
    is_match = np.array(indices) == ind
    hits_along_first_axis = np.where(is_match)[0]
    return hits_along_first_axis[0]

def _read_indexed_csv(path):
    """Read a CSV using its first column as the row index and its first row as the header."""
    return pd.read_csv(path, index_col=0, header=0)


# Leave-one-out splitter used by the experiment loops further down.
loo = LeaveOneOut()

# Tables stored under <automl_path>/defaults, each kept both as a DataFrame and as a raw array.
errmtx_df = _read_indexed_csv(os.path.join(automl_path, 'defaults/error_matrix.csv'))
runtime_df = _read_indexed_csv(os.path.join(automl_path, 'defaults/runtime_matrix.csv'))
dataset_sizes_df = _read_indexed_csv(os.path.join(automl_path, 'defaults/dataset_sizes.csv'))
errmtx = errmtx_df.values
runtime = runtime_df.values

# Row labels present in both the error matrix and the metafeature table.
ind_errmtx = errmtx_df.index.astype(int)
ind_metafeatures = _read_indexed_csv('../collect_pmf_performance/metafeatures.csv').index
ind_common = list(set(ind_errmtx).intersection(set(ind_metafeatures)))

# Restrict both matrices to the shared rows; row order follows ind_common.
errmtx_common_df = errmtx_df.loc[ind_common]
runtime_common_df = runtime_df.loc[ind_common]
errmtx_common = errmtx_common_df.values
runtime_common = runtime_common_df.values

# Buffer for imputed rows, filled one held-out row at a time by the experiment loops.
errmtx_pred = np.zeros(errmtx_common.shape)

In [9]:
# Standard PCA of the error matrix to get latent features of datasets and models.
# NOTE(review): this factorizes the full `errmtx`, not the `errmtx_common` subset used by
# the experiment loops, and X_pca / Y_pca are not referenced again in the cells visible
# here -- confirm this cell is still needed.
# `threshold=0.03` is handed straight to linalg.pca; presumably a rank-selection
# tolerance -- TODO confirm against linalg.
X_pca, Y_pca, _ = linalg.pca(errmtx, threshold=0.03)

In [10]:
# Directory that receives the regret CSV files written by the "save results" cells.
result_path = 'results'

# exist_ok=True makes the cell idempotent without a separate existence check, which
# removes the check-then-create race. It also fails loudly here (FileExistsError) if
# 'results' exists but is a regular file, instead of failing later inside to_csv.
os.makedirs(result_path, exist_ok=True)

experimental settings:

In [11]:
# Smallest and largest rank swept by the experiment loops; both ends are inclusive
# (the loops iterate over range(initial_rank, final_rank+1)).
# NOTE: the "with metafeatures" loop cell further down reassigns initial_rank to 5.
initial_rank = 4
final_rank = 40
# NOTE(review): pick_largest_v_opt and scalarization are not read by any cell visible
# in this part of the notebook -- presumably options for an experimental-design
# variant; TODO confirm or remove.
pick_largest_v_opt = True
scalarization = 'D'
n_init = 5 # number of entries picked via nearest-neighbor datasets (passed to init_l1 as n_init)

# without metafeatures

In [None]:
# Leave-one-out evaluation without metafeatures.
# For every held-out dataset and every rank in [initial_rank, final_rank], sample the
# pivot columns, impute the rest of the row, and record the regret
# (best error found so far minus the true best error of that dataset).
rank_columns = ['rank {}'.format(rank) for rank in range(initial_rank, final_rank+1)]
regret_rows = []  # one single-row DataFrame per held-out dataset, concatenated once at the end
errors_encountered_all = []  # not filled by this loop; kept so the name exists as before

for train_index, test_index in tqdm(loo.split(errmtx_common), total=errmtx_common.shape[0]):
    test_row = test_index[0]
    # Fit the pivot selection and the low-rank model on the training rows only.
    # Passing the full matrix would let the held-out row (whose entries are all known)
    # leak into the model that is then used to predict that same row.
    errmtx_train = errmtx_common[train_index, :]

    regret = []
    # Observed entries of the held-out row; unobserved entries stay at zero.
    new_row = np.zeros((1, errmtx_common.shape[1]))
    # true best
    y_best_true = min(errmtx_common[test_row, :])
    # predicted best; initialize to be 1 (max achievable error)
    y_best_pred = 1
    for rank in range(initial_rank, final_rank+1):
        to_sample = linalg.pivot_columns(errmtx_train, rank=rank)
        new_row[:, to_sample] = errmtx_common[test_index, to_sample]
        errmtx_pred[test_index, :] = linalg.impute(errmtx_train, new_row, to_sample, rank=rank)
        # predicted best: the best among everything actually sampled so far and the
        # true error of the model that the imputed row ranks first
        y_best_pred = min(y_best_pred,
                          min(errmtx_common[test_row, to_sample]),
                          errmtx_common[test_row, np.argmin(errmtx_pred[test_row, :])])
        # collect regret
        regret.append(y_best_pred - y_best_true)
    regret_rows.append(pd.DataFrame(np.array(regret).reshape(1, -1),
                                    index=[ind_common[test_row]],
                                    columns=rank_columns))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; build the result with a single concat instead.
regret_all = pd.concat(regret_rows) if regret_rows else pd.DataFrame(columns=rank_columns)

save results:

In [14]:
# Write the regrets of the run without metafeatures, keeping row index and column header.
regrets_csv_path = os.path.join(result_path, 'regrets_oboe_qr_incremental.csv')
regret_all.to_csv(regrets_csv_path, header=True, index=True)

# with metafeatures

In [13]:
fn_data_feats = 'metafeatures.csv'

# Metafeature table indexed by its first CSV column; the loop below looks rows up
# by the labels of errmtx_common_df.index.
metafeatures_path = os.path.join('../collect_pmf_performance/', fn_data_feats)
metafeatures_df = pd.read_csv(metafeatures_path, header=0, index_col=0)

In [14]:
def init_l1(Ytrain, Ftrain, ftest, n_init=5):
    """Pick up to `n_init` row indices of `Ytrain` using the nearest columns in feature space.

    The `n_init` rows of `Ftrain` with the smallest L1 distance to `ftest` select the
    columns of `Ytrain` to look at. Rows of `Ytrain` whose sum over those columns is
    NaN are dropped; the remaining rows are ranked within each selected column
    (scipy.stats.rankdata, larger value -> larger rank), the ranks are averaged per
    row, and the rows with the highest average rank are returned first.

    Returns an integer array with at most `n_init` row indices of `Ytrain`.
    """
    l1_distance = np.abs(Ftrain - ftest).sum(axis=1)
    nearest_cols = np.argsort(l1_distance)[:n_init]

    # Keep only the rows that have a non-NaN sum across the selected columns.
    sums_over_nearest = Ytrain[:, nearest_cols].sum(axis=1)
    usable_rows = np.where(~np.isnan(sums_over_nearest))[0]

    # Rank the usable rows inside each selected column, then average per row.
    scores = Ytrain[usable_rows[:, None], nearest_cols]
    column_ranks = np.apply_along_axis(st.rankdata, 0, scores)
    mean_rank = column_ranks.mean(axis=1)

    # Reverse of the ascending argsort: highest average rank first.
    best_first = np.argsort(mean_rank)[::-1]
    return usable_rows[best_first][:n_init]

In [None]:
# Leave-one-out evaluation with metafeatures.
# Same protocol as the loop without metafeatures, except that the first n_init sampled
# models come from init_l1 (nearest training datasets in metafeature space) and the
# pivot columns contribute the remaining rank - n_init samples.
n_init = 5
initial_rank = 5
final_rank = 40

# Build the column labels only AFTER initial_rank has been set for this experiment.
# Building them earlier used the previous cell's initial_rank (4) and left a spurious
# all-NaN 'rank 4' column in the saved table.
rank_columns_mf = ['rank {}'.format(rank) for rank in range(initial_rank, final_rank+1)]
regret_rows_mf = []  # one single-row DataFrame per held-out dataset, concatenated once at the end
errors_encountered_all = []  # dataset ids whose evaluation raised

for train_index, test_index in tqdm(loo.split(errmtx_common), total=errmtx_common.shape[0]):
    test_row = test_index[0]
    dataset_id = ind_common[test_row]
    try:
        # Fit the pivot selection and the low-rank model on the training rows only, so
        # the held-out row cannot leak into the model used to predict it (Ftrain and
        # Ytrain below were already restricted to train_index).
        errmtx_train = errmtx_common[train_index, :]

        regret = []
        # Observed entries of the held-out row; unobserved entries stay at zero.
        new_row = np.zeros((1, errmtx_common.shape[1]))
        # true best
        y_best_true = min(errmtx_common[test_row, :])

        Ftrain = metafeatures_df.loc[errmtx_common_df.index[train_index], :].values
        Ftest = metafeatures_df.loc[errmtx_common_df.index[test_index], :].values
        # Negated and transposed: rows are models, columns are training datasets,
        # and larger values mean lower error, which is what init_l1 ranks on.
        Ytrain = -errmtx_common_df.loc[errmtx_common_df.index[train_index], :].T.values
        ix_init = init_l1(Ytrain, Ftrain, Ftest, n_init=n_init).tolist()
        # predicted best
        y_best_pred = min(errmtx_common[test_row, ix_init])

        for rank in range(initial_rank, final_rank+1):
            to_sample = list(set(linalg.pivot_columns(errmtx_train, rank=rank-n_init)).union(set(ix_init)))
            new_row[:, to_sample] = errmtx_common[test_index, to_sample]
            errmtx_pred[test_index, :] = linalg.impute(errmtx_train, new_row, to_sample, rank=rank)
            # predicted best: the best among everything actually sampled so far and the
            # true error of the model that the imputed row ranks first
            y_best_pred = min(y_best_pred,
                              min(errmtx_common[test_row, to_sample]),
                              errmtx_common[test_row, np.argmin(errmtx_pred[test_row, :])])
            # collect regret
            regret.append(y_best_pred - y_best_true)
        regret_rows_mf.append(pd.DataFrame(np.array(regret).reshape(1, -1),
                                           index=[dataset_id],
                                           columns=rank_columns_mf))
    except Exception as exc:
        # Best-effort: skip the dataset but report why. Catching Exception (not a bare
        # except) keeps KeyboardInterrupt / SystemExit able to stop the run.
        print("error encountered on dataset {}: {!r}".format(dataset_id, exc))
        errors_encountered_all.append(dataset_id)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; build the result with a single concat instead.
regret_all_with_mf = (pd.concat(regret_rows_mf) if regret_rows_mf
                      else pd.DataFrame(columns=rank_columns_mf))

save results:

In [149]:
# Write the regrets of the run with metafeatures, keeping row index and column header.
regrets_mf_csv_path = os.path.join(result_path, 'regrets_oboe_qr_incremental_with_mf.csv')
regret_all_with_mf.to_csv(regrets_mf_csv_path, header=True, index=True)