In [None]:
%load_ext autoreload

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ConstantKernel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, OneHotEncoder

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import pickle

In [None]:
import warnings
#warnings.filterwarnings('error', category=UserWarning)
warnings.filterwarnings('ignore')

In [None]:
# Use seaborn's "whitegrid" style for every figure drawn in this notebook.
sns.set_style(style="whitegrid")

In [None]:
%autoreload
import utils_2019
import mfgpc_opt as mfgpc
from utilities_new import SSMF, MajorClassClassifier, safe_roc_auc_score, get_binary_dataset
import utils_hetmogp

In [None]:
from pmlb import fetch_data

In [None]:
def _score_rocauc(clf, X, y):
    """Return safe_roc_auc_score of y against column 1 of clf.predict_proba(X)."""
    return safe_roc_auc_score(y, clf.predict_proba(X)[:, 1])


def _score_accuracy(clf, X, y):
    """Return accuracy_score of y against clf.predict(X)."""
    return accuracy_score(y, clf.predict(X))


# Scorer callables keyed by metric name; each takes (clf, X, y).
scoring = {
    'ROCAUC': _score_rocauc,
    'Accuracy': _score_accuracy,
}

In [None]:
# (method, mode) pairs requested from the result logs.
# The three baseline classifiers are crossed with the three data-usage modes;
# the remaining methods each come with a single mode.
_baseline_methods = ('xgb', 'ss_gpc', 'ss_logit')
_baseline_modes = ('stacking', 'concatenation', 'high-fidelity')

method_modes_to_plot = [
    (method_name, mode_name)
    for method_name in _baseline_methods
    for mode_name in _baseline_modes
]
method_modes_to_plot += [
    ('major_vote', 'high-fidelity'),
    ('ss_mf_gpc', 'multi-fidelity'),
    ('hetmogp', 'multi-fidelity'),
]

# Artificial datasets

In [None]:
# Number of artificial datasets generated for each input dimensionality.
sets_per_dim = 10
# dims[ID] is the dimensionality of artificial dataset ID:
# 10 entries of 2, then 10 of 5, 10 of 10 and 10 of 20.
dims = np.repeat([2, 5, 10, 20], sets_per_dim)

In [None]:
# For each noise level: load the logged results of the 40 artificial datasets
# (basic baselines joined with the hetmogp log), save the ROC AUC profile
# figure as a PDF, and build `all_dm_df` with one row per (noise, method) and
# one column per dataset tag.
per_noise_tables = []
for target_noise in [0.2, 0.4]:
    for target_hf in [75]:
        sub_frames = []
        for ID in range(0, 40):
            log_prefix = 'loggers/artifitial_baselines/2019_df_' + str(ID)
            sub_dm_df = utils_2019.get_sub_dm_df(
                log_prefix + '_basic.csv',
                {'noise': target_noise, 'hf': target_hf},
                method_modes_to_plot)
            sub_dm_df_hetmogp = utils_2019.get_sub_dm_df(
                log_prefix + '_hetmogp.csv',
                {'noise': target_noise, 'hf': target_hf},
                method_modes_to_plot)
            sub_dm_df = sub_dm_df.join(sub_dm_df_hetmogp)
            # The tag (spelling included) is the column key the LaTeX table cell selects on.
            sub_dm_df[('tag', )] = 'artifitial_' + str(dims[ID]) + 'D'
            sub_frames.append(sub_dm_df)

        # DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
        dm_df = pd.concat(sub_frames, ignore_index=True)
        # Flatten the tuple column keys, e.g. ('xgb', 'stacking') -> 'xgb:stacking', ('tag',) -> 'tag'.
        dm_df.columns = [':'.join(x) for x in dm_df.columns.values]

        dmz, accs, num_solvers = utils_2019.make_roc_auc_profile(dm_df[dm_df.columns.drop('tag')])

        print(target_hf, target_noise)

        # The context manager closes the PDF even if plotting raises.
        with PdfPages('figures/artifitial_ROCAUCprofile_hf_' + str(target_hf)
                      + '_noise_' + str(target_noise) + '.pdf') as pp:
            utils_2019.plot_roc_auc_profile(dm_df, dmz, accs, num_solvers, title='', plot_legend=False)
            pp.savefig(bbox_inches='tight')

    # NOTE: this summarises dm_df of the last target_hf only; with a single
    # hf value per noise level that is the only frame there is.
    t = 1 - dm_df.groupby('tag').mean().transpose()
    t.index = pd.MultiIndex.from_tuples(
        [(target_noise, method_key) for method_key in t.index.values.tolist()])
    per_noise_tables.append(t)

all_dm_df = pd.concat(per_noise_tables)

In [None]:
# LaTeX macro printed in the result tables for each 'method:mode' column key.
# Raw strings keep the single leading backslash of every macro readable.
map_methods = {
    'major_vote:high-fidelity': 'major vote',
    # Gaussian process classifier
    'ss_gpc:concatenation': r'\concMF{} \gpc{}',
    'ss_gpc:high-fidelity': r'\gpc{}',
    'ss_gpc:stacking': r'\stackedMF{} \gpc{}',
    # logistic regression
    'ss_logit:concatenation': r'\concMF{} \logit{}',
    'ss_logit:high-fidelity': r'\logit{}',
    'ss_logit:stacking': r'\stackedMF{} \logit{}',
    'ss_mf_gpc:multi-fidelity': r'\mfgpc{}',
    # XGBoost
    'xgb:concatenation': r'\concMF{} \xgb{}',
    'xgb:high-fidelity': r'\xgb{}',
    'xgb:stacking': r'\stackedMF{} \xgb{}',
    'hetmogp:multi-fidelity': r'\hetmogp{}',
    'GPMA': r'\gpma{}',
}

In [None]:
# Row order of the result tables: the multi-fidelity GPC first, then the three
# baseline classifiers grouped by mode, then hetmogp and GPMA.
_table_models = ('ss_gpc', 'ss_logit', 'xgb')
_table_modes = ('high-fidelity', 'concatenation', 'stacking')

methods_order = (
    ['ss_mf_gpc:multi-fidelity']
    + [model + ':' + mode for mode in _table_modes for model in _table_models]
    + ['hetmogp:multi-fidelity', 'GPMA']
)


In [None]:
# Print one LaTeX table row per method: the four artificial-dataset columns of
# every noise level side by side. A cell is set in bold unless it is below
# (1 - margin) times the best value of its column.
margin = 0.01
cols = ['artifitial_2D', 'artifitial_5D', 'artifitial_10D', 'artifitial_20D']

noise_levels = all_dm_df.index.levels[0]
available_methods = all_dm_df.index.levels[1]

# Column-wise maxima per noise level, flattened in the same order as the row
# cells. They do not depend on the method, so compute them once.
best_results = []
for noise in noise_levels:
    best_results.extend(all_dm_df.loc[noise].max(axis=0)[cols].values.tolist())

for method in methods_order:
    if method not in available_methods:
        continue
    row = []
    for noise in noise_levels:
        row.extend(all_dm_df.loc[(noise, method)][cols].values.tolist())

    cells = []
    for value, best in zip(row, best_results):
        if value < best * (1 - margin):
            cells.append('%.3f' % value)
        else:
            # '\\mathbf' (escaped backslash): a bare '\m' is an invalid escape sequence.
            cells.append('$\\mathbf{%.3f}$' % value)
    # Each row ends with the LaTeX line break ' \\'.
    print(map_methods[method] + ' & ' + ' & '.join(cells) + ' \\\\')


# Real datasets

In [None]:
# Names of the real datasets; used both in the log file names and as table columns.
aliases = [
    'diabetes',
    'german',
    'satimage-1',
    'mushroom',
    'splice',
    'spambase',
    'hypothyroid',
    'waveform-40',
]

In [None]:
# For each noise level: load the logged results of every real dataset in
# `aliases` (basic baselines joined with the hetmogp log), save the ROC AUC
# profile figure as a PDF, and rebuild `all_dm_df` with one row per
# (noise, method) and one column per dataset.
per_noise_tables = []
for target_noise in [0.2, 0.4]:
    for target_hf in [75]:
        sub_frames = []
        for ID in range(len(aliases)):
            log_prefix = 'loggers/artifitial_baselines/2019_df_' + aliases[ID]
            sub_dm_df = utils_2019.get_sub_dm_df(
                log_prefix + '_basic.csv',
                {'noise': target_noise, 'hf': target_hf},
                method_modes_to_plot)
            sub_dm_df_hetmogp = utils_2019.get_sub_dm_df(
                log_prefix + '_hetmogp.csv',
                {'noise': target_noise, 'hf': target_hf},
                method_modes_to_plot)
            sub_dm_df = sub_dm_df.join(sub_dm_df_hetmogp)
            sub_dm_df[('tag', )] = aliases[ID]
            sub_frames.append(sub_dm_df)

        # DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
        dm_df = pd.concat(sub_frames, ignore_index=True)
        # Flatten the tuple column keys, e.g. ('xgb', 'stacking') -> 'xgb:stacking', ('tag',) -> 'tag'.
        dm_df.columns = [':'.join(x) for x in dm_df.columns.values]

        dmz, accs, num_solvers = utils_2019.make_roc_auc_profile(dm_df[dm_df.columns.drop('tag')])

        print(target_hf, target_noise)

        # The context manager closes the PDF even if plotting raises.
        with PdfPages('figures/UCI_ROCAUCprofile_hf_' + str(target_hf)
                      + '_noise_' + str(target_noise) + '.pdf') as pp:
            utils_2019.plot_roc_auc_profile(dm_df, dmz, accs, num_solvers, title='', plot_legend=False)
            pp.savefig(bbox_inches='tight')

    # NOTE: this summarises dm_df of the last target_hf only; with a single
    # hf value per noise level that is the only frame there is.
    t = 1 - dm_df.groupby('tag').mean().transpose()
    t.index = pd.MultiIndex.from_tuples(
        [(target_noise, method_key) for method_key in t.index.values.tolist()])
    per_noise_tables.append(t)

all_dm_df = pd.concat(per_noise_tables)

In [None]:
# LaTeX macro for each real dataset alias (raw strings: one leading backslash each).
map_real_datasets = dict([
    ('diabetes', r'\diabetes{}'),
    ('german', r'\german{}'),
    ('satimage-1', r'\satimage{}'),
    ('mushroom', r'\mushroom{}'),
    ('splice', r'\splice{}'),
    ('spambase', r'\spambase{}'),
    ('hypothyroid', r'\hypothyroid{}'),
    ('waveform-40', r'\waveform{}'),
])

In [None]:
# Print, per noise level, one LaTeX table row per method over the real
# datasets. A cell is set in bold unless it is below (1 - margin) times the
# best value of its column.
margin = 0.01

available_methods = all_dm_df.index.levels[1]
for t in all_dm_df.index.levels[0]:
    print('\n' + str(t) + '\n')
    best_results = all_dm_df.loc[t].max(axis=0)[aliases].values.tolist()
    for method in methods_order:
        if method not in available_methods:
            continue
        row = all_dm_df.loc[(t, method)][aliases].values.tolist()

        cells = []
        for value, best in zip(row, best_results):
            if value < best * (1 - margin):
                cells.append('%.3f' % value)
            else:
                # '\\mathbf' (escaped backslash): a bare '\m' is an invalid escape sequence.
                cells.append('$\\mathbf{%.3f}$' % value)
        # Each row ends with the LaTeX line break ' \\'.
        print(map_methods[method] + ' & ' + ' & '.join(cells) + ' \\\\')


# Music genre classification

In [None]:
# Read the music genre "gold" CSV; its 'class' column is used to enumerate the genres.
data = pd.read_csv(
    'Datasets/mturk-datasets/music_genre_classification/music_genre_gold.csv',
    sep=',',
)

In [None]:
# Map every class label to an integer code, following the order of
# value_counts() (most frequent label first), and build the inverse mapping.
classes = data['class'].value_counts()
class_encoder = {label: code for code, label in enumerate(classes.index)}
class_decoder = {code: label for label, code in class_encoder.items()}

In [None]:

# Load the logged results of every music genre task, attach the GPMA value
# read from the separate GPMA log, and save the ROC AUC profile figure.
genre_frames = []
for target_genre in class_encoder.values():
    genre_name = class_decoder[target_genre]
    sub_dm_df = utils_2019.get_sub_dm_df(
        'loggers/artifitial_baselines/2019_df_musicgenre_' + genre_name + '.csv',
        {},
        method_modes_to_plot)
    tmp = pd.read_csv('loggers/artifitial_baselines/df_musicgenre_' + genre_name + '_gpma.csv', header=None)
    # Column 1 holds the metric name (possibly padded with spaces), column 2 its value.
    tmp[1] = tmp[1].str.strip()
    tmp = tmp[tmp[1] == 'ROCAUC']
    # NOTE(review): .iloc[len(sub_dm_df)] selects ONE logged value (the one at
    # position len(sub_dm_df)) and broadcasts it to every row - confirm that
    # this position is the intended GPMA result.
    sub_dm_df[('GPMA', )] = 1 - tmp[2].iloc[len(sub_dm_df)]
    genre_frames.append(sub_dm_df)

# DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
dm_df = pd.concat(genre_frames, ignore_index=True)
# Flatten the tuple column keys, e.g. ('xgb', 'stacking') -> 'xgb:stacking', ('GPMA',) -> 'GPMA'.
dm_df.columns = [':'.join(x) for x in dm_df.columns.values]

dmz, accs, num_solvers = utils_2019.make_roc_auc_profile(dm_df)

# NOTE(review): target_hf / target_noise are leftovers of the loops in the
# earlier cells; they are not used to load this data and only end up in the
# printed line and in the output file name.
print(target_hf, target_noise)

# The context manager closes the PDF even if plotting raises.
with PdfPages('figures/musicgenre_ROCAUCprofile_hf_' + str(target_hf) + '.pdf') as pp:
    utils_2019.plot_roc_auc_profile(dm_df, dmz, accs, num_solvers, title='', plot_legend=False)
    pp.savefig(bbox_inches='tight')

In [None]:
# Print the music genre row of the LaTeX table: 1 - column mean per method,
# bold unless below (1 - margin) times the best value.
# `margin` comes from the table cells above.
t = 1 - dm_df.mean().transpose()

best_result = t.max()

print('\\musicgenre{}', end=' & ')
for method in methods_order:
    if method not in t.index:
        continue
    if t[method] < best_result * (1 - margin):
        cell = '%.3f' % t[method]
    else:
        # '\\mathbf' (escaped backslash): a bare '\m' is an invalid escape sequence.
        cell = '$\\mathbf{%.3f}$' % t[method]
    print(cell, end=' & ')

# Sentiment polarity

In [None]:

# Load the logged results of the sentiment polarity task (no setting filter).
sentiment_log = 'loggers/artifitial_baselines/2019_df_sentimentpolarity.csv'
dm_df = utils_2019.get_sub_dm_df(sentiment_log, {}, method_modes_to_plot)

In [None]:
# Attach the GPMA value for the sentiment polarity task from its separate log.
tmp = pd.read_csv('loggers/artifitial_baselines/df_sentimentpolarity_gpma.csv', header=None)
# Column 1 holds the metric name (possibly padded with spaces), column 2 its value.
tmp[1] = tmp[1].str.strip()
tmp = tmp[tmp[1] == 'ROCAUC']
# Index by the length of the frame being filled (dm_df). The previous
# len(sub_dm_df) referred to the last music genre frame, a stale variable from
# another section that only exists if that section was run first.
# NOTE(review): .iloc[...] selects ONE logged value and broadcasts it to every
# row - confirm that this position is the intended GPMA result.
dm_df[('GPMA', )] = 1 - tmp[2].iloc[len(dm_df)]

In [None]:
# Flatten the tuple column keys, e.g. ('xgb', 'stacking') -> 'xgb:stacking', ('GPMA',) -> 'GPMA'.
dm_df.columns = [':'.join(x) for x in dm_df.columns.values]

dmz, accs, num_solvers = utils_2019.make_roc_auc_profile(dm_df)

# NOTE(review): target_hf / target_noise are leftovers of the loops in the
# earlier cells; they are not used to load this data and only end up in the
# printed line and in the output file name.
print(target_hf, target_noise)

# The context manager closes the PDF even if plotting raises.
with PdfPages('figures/polarity_ROCAUCprofile_hf_' + str(target_hf) + '_' + 'full' + 'D.pdf') as pp:
    utils_2019.plot_roc_auc_profile(dm_df, dmz, accs, num_solvers, title='', plot_legend=False)
    pp.savefig(bbox_inches='tight')

In [None]:
# Print the sentiment polarity row of the LaTeX table: 1 - column mean per
# method, bold unless below (1 - margin) times the best value.
# `margin` comes from the table cells above.
t = 1 - dm_df.mean().transpose()

best_result = t.max()

print('\\sentimentpolarity{}', end=' & ')
for method in methods_order:
    if method not in t.index:
        continue
    if t[method] < best_result * (1 - margin):
        cell = '%.3f' % t[method]
    else:
        # '\\mathbf' (escaped backslash): a bare '\m' is an invalid escape sequence.
        cell = '$\\mathbf{%.3f}$' % t[method]
    print(cell, end=' & ')

In [None]:
# Gather the budget-experiment logs of datasets 0, 5, ..., 35 and 1, 6, ..., 36.
# IDs divisible by 5 also have an "_additional" log, placed before the main one.
budget_frames = []
for ID in list(range(0, 40, 5)) + list(range(1, 40, 5)):
    budget_log = 'loggers/artifitial_baselines/2019_df_' + str(ID) + '_budget'
    if ID % 5 == 0:
        budget_frames.append(pd.read_csv(budget_log + '_additional.csv'))
    budget_frames.append(pd.read_csv(budget_log + '.csv'))

# DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
all_full_dfs = pd.concat(budget_frames, ignore_index=True)
    

In [None]:
# Mean ROCAUC per (method, hf_budget_ratio, noise, lf_cost), ordered by
# hf_budget_ratio so the curves below are drawn left to right.
budget_keys = ['method', 'hf_budget_ratio', 'noise', 'lf_cost']
x = (
    all_full_dfs[budget_keys + ['ROCAUC']]
    .groupby(budget_keys)
    .mean()
    .reset_index()
    .sort_values(by='hf_budget_ratio')
)

In [None]:
# One figure (and one PDF) per low-fidelity cost, with one ROCAUC-vs-budget
# curve per noise level. Labels are set on the curves, but no legend is drawn.
for lf_cost in [1, 2, 4]:
    plt.figure()
    # The context manager closes the PDF even if plotting raises.
    with PdfPages('figures/artificial_budget_lf_cost_' + str(lf_cost) + '.pdf') as pp:
        for n in [0, 0.2, 0.3, 0.4]:
            sx = x[(x['noise'] == n) & (x['lf_cost'] == lf_cost)]
            plt.plot(sx['hf_budget_ratio'].values, sx['ROCAUC'].values, label='noise_level=%.1f' % n)
        pp.savefig(bbox_inches='tight')

In [None]:
# Gather the GPC budget logs of datasets 0, 5, ..., 35 and 1, 6, ..., 36 and
# average ROCAUC per hf_budget_ratio.
# DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
gpc_dfs = pd.concat(
    [pd.read_csv('loggers/artifitial_baselines/2019_df_' + str(ID) + '_budget_gpc.csv')
     for ID in list(range(0, 40, 5)) + list(range(1, 40, 5))],
    ignore_index=True)

gpc_baseline = gpc_dfs[['hf_budget_ratio', 'ROCAUC']].groupby('hf_budget_ratio').mean().reset_index()

# Prepend a point at hf_budget_ratio = 0 with ROCAUC = 0.5 so the curve starts at zero budget.
anchor = pd.DataFrame([[0., 0.5]], columns=['hf_budget_ratio', 'ROCAUC'])
gpc_baseline = pd.concat([anchor, gpc_baseline])

gpc_baseline

In [None]:
# One figure (and one PDF) per noise level: a black ROCAUC-vs-budget curve per
# low-fidelity cost (distinguished by line style) plus the GPC baseline in red.
# Labels are set on the black curves, but no legend is drawn.
linestyles = ['-', '--', ':']
for n in [0, 0.2, 0.3, 0.4]:
    plt.figure()
    # The context manager closes the PDF even if plotting raises.
    with PdfPages('figures/artificial_budget_noise_' + str(n).replace('.', '_') + '.pdf') as pp:
        for lf_cost, linestyle in zip([1, 2, 4], linestyles):
            sx = x[(x['noise'] == n) & (x['lf_cost'] == lf_cost)]
            plt.plot(sx['hf_budget_ratio'].values, sx['ROCAUC'].values, label='lf_cost=%.1f' % lf_cost,
                     linestyle=linestyle, color='k')
        # The baseline does not depend on lf_cost: draw it once, last, so it
        # stays on top exactly as when it was redrawn after every curve.
        plt.plot(gpc_baseline['hf_budget_ratio'].values, gpc_baseline['ROCAUC'].values, color='r')
        pp.savefig(bbox_inches='tight')