In [None]:
# Folder holding the per-dataset metafeature CSV files (relative to this notebook).
metafeature_folder_path = 'selected_OpenML_dataset_metafeatures'
# Location of the oboe `automl` package (relative to this notebook); it is later
# appended to sys.path and used to locate the default error matrix.
oboe_automl_path = '../../../oboe/automl'

In [None]:
import numpy as np
import pandas as pd
import sys
import re
import os
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut

In [None]:
# Make the oboe `automl` folder importable; this must run before the import below,
# because `linalg` is resolved through the path that was just appended.
sys.path.append(oboe_automl_path)
import linalg

In [None]:
# Indices of the datasets whose metafeatures have already been calculated.
# Each index is the first run of digits in a CSV file name found in
# metafeature_folder_path; CSV files with 'sizes' in their name are skipped.
ind_metafeatures = [
    int(re.findall("\\d+", csv_name)[0])
    for csv_name in os.listdir(metafeature_folder_path)
    if csv_name.endswith('.csv') and 'sizes' not in csv_name
]

In [None]:
# Load the default error matrix that ships with oboe. The first CSV column is
# used as the row index and the first row as the column header.
errmtx_df = pd.read_csv(
    os.path.join(oboe_automl_path, 'defaults/error_matrix.csv'),
    header=0,
    index_col=0,
)
# Row labels cast to int; they are later intersected with the metafeature dataset indices.
ind_errmtx = errmtx_df.index.astype(int)
# Plain ndarray view of the matrix entries (rows follow ind_errmtx).
errmtx = errmtx_df.values

In [None]:
# Result of linalg.approx_rank on the full error matrix with threshold 0.01.
# NOTE(review): the meaning of `threshold` is defined inside linalg - confirm there.
# This value is not referenced again in the cells shown here.
approximate_rank = linalg.approx_rank(errmtx, threshold=0.01)

In [None]:
# Dataset indices that appear both as rows of the error matrix and among the
# datasets with computed metafeatures; every later per-dataset array follows this order.
ind_common = list(set(ind_errmtx) & set(ind_metafeatures))

In [None]:
# Regressor that is fitted below to map dataset metafeatures (X) to the
# low-dimensional dataset factors (U).
# random_state is fixed so that a Restart & Run All reproduces the same forest
# and therefore the same leave-one-out predictions.
cold_start_mapping = RandomForestRegressor(random_state=0)

In [None]:
# Metafeature matrix: the contents of one 'metafeatures_<ind>.csv' file per dataset,
# stacked in ind_common order (first CSV column is the index, first row the header).
# np.vstack needs a real sequence: passing a generator has been deprecated since
# NumPy 1.16 and is rejected by newer releases, so a list is built first.
X = np.vstack([
    pd.read_csv(
        os.path.join(metafeature_folder_path, 'metafeatures_{}.csv'.format(ind)),
        index_col=0,
        header=0,
    ).values
    for ind in ind_common
])

In [None]:
def indloc(indices, ind):
    """Return the position of the first entry of `indices` that equals `ind`.

    Args:
        indices: array-like of values to search.
        ind: value to look for.

    Returns:
        The first index along axis 0 at which `indices` equals `ind`.

    Raises:
        IndexError: if `ind` does not occur in `indices`.
    """
    is_match = np.array(indices) == ind
    match_positions = np.nonzero(is_match)[0]
    return match_positions[0]

In [None]:
# Leave-one-out splitter: every split holds out exactly one sample as the test set.
loo = LeaveOneOut()

In [None]:
centering = False  # whether to subtract each column's mean from the error matrix before the PCA

if centering:
    # Keep the column means so they can be added back to the prediction below.
    column_means = np.average(errmtx, axis=0).reshape(1, -1)
    errmtx_in_use = errmtx - column_means
else:
    column_means = None
    errmtx_in_use = errmtx

# NOTE(review): assumes linalg.pca returns factors such that
# np.dot(X_pca.T, Y_pca) approximates its input - confirm against linalg.
X_pca, Y_pca, _ = linalg.pca(errmtx_in_use, threshold=0.01)
# Factor rows of the datasets in ind_common, in that order.
# np.vstack is given a list: generators are deprecated/rejected by NumPy.
U = np.vstack([X_pca.T[indloc(ind_errmtx, ind), :] for ind in ind_common])
U_pred = np.zeros(U.shape)
# leave-one-out cross-validation across datasets
# (loo.split is a generator, so tqdm needs the number of splits - one per row of X)
for train_index, test_index in tqdm(loo.split(X), total=X.shape[0]):
    cold_start_mapping.fit(X[train_index, :], U[train_index, :])
    # test_index holds a single row, so take the single predicted factor vector
    U_pred[test_index, :] = cold_start_mapping.predict(X[test_index, :])[0]

errmtx_pred = np.dot(U_pred, Y_pca)
if centering:
    # The factors were fitted on the centered matrix, so undo the centering;
    # otherwise the prediction would be compared against uncentered errors.
    errmtx_pred = errmtx_pred + column_means
# Actual error-matrix rows of the datasets in ind_common, in that order.
errmtx_common = np.vstack([errmtx[indloc(ind_errmtx, ind), :] for ind in ind_common])
# Per-dataset relative error: row norm of the residual over row norm of the actual errors.
percentage_of_error = np.linalg.norm(errmtx_pred - errmtx_common, axis=1) / np.linalg.norm(errmtx_common, axis=1)


In [None]:
# Persist the dataset order first, so the rows of the result files can be mapped back.
ind_table = pd.DataFrame(ind_common, index=None, columns=None)
ind_table.to_csv("ind_alors.csv", index=None, header=None)

# The three result arrays are written without index or header; only the file
# names depend on whether centering was used.
results_to_save = (errmtx_common, errmtx_pred, percentage_of_error)
if centering:
    result_files = (
        "errmtx_common_alors_centering.csv",
        "errmtx_pred_alors_centering.csv",
        "percentage_of_error_alors_centering.csv",
    )
else:
    result_files = (
        "errmtx_common_alors_non_centering.csv",
        "errmtx_pred_alors_non_centering.csv",
        "percentage_of_error_alors_non_centering.csv",
    )

for result, result_file in zip(results_to_save, result_files):
    pd.DataFrame(result, index=None, columns=None).to_csv(result_file, index=None, header=None)

In [None]:
# Mean of the per-dataset relative errors (a ratio of norms, not multiplied by 100).
np.average(percentage_of_error)

In [None]:
# Standard deviation of the per-dataset relative errors.
np.std(percentage_of_error)

In [None]:
# For every dataset (row), the column indices of the five smallest entries.
# A single argsort gives column indices ordered by ascending value; applying
# argsort twice would instead give each column's rank, and slicing [:, :5] would
# then pick the ranks of the first five columns rather than the five best columns.
ind_actual_best_five = np.argsort(errmtx_common, axis=1)[:, :5]

ind_pred_best_five = np.argsort(errmtx_pred, axis=1)[:, :5]

# Per dataset: the columns that are among the best five both actually and as predicted.
ind_overlap = [set(ind_actual_best_five[i, :]).intersection(set(ind_pred_best_five[i, :])) for i in range(ind_pred_best_five.shape[0])]

# Mean overlap size across datasets (a count between 0 and 5, not a fraction).
average_accuracy = np.average([len(item) for item in ind_overlap])

In [None]:
# Mean size of the per-dataset overlap between the actual and the predicted
# best-five sets, averaged across the datasets in LOOCV (a count, not a fraction).
average_accuracy