In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ConstantKernel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, OneHotEncoder

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib.backends.backend_pdf import PdfPages
import pickle

In [None]:
import warnings
#warnings.filterwarnings('error', category=UserWarning)
warnings.filterwarnings('ignore')

In [None]:
# Use seaborn's "whitegrid" look for every figure produced by this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Load the IPython autoreload extension so the %autoreload magic used below is available.
%load_ext autoreload

In [None]:
# Trigger autoreload before importing the helper modules used by the experiment cells.
# NOTE(review): bare `%autoreload` looks like a one-off reload rather than the
# persistent `%autoreload 2` mode — confirm this is what is intended.
%autoreload
import utils_2019
import mfgpc_opt as mfgpc
from utilities_new import SSMF, MajorClassClassifier, safe_roc_auc_score, get_binary_dataset
import utils_hetmogp

In [None]:
from pmlb import fetch_data

In [None]:
def _rocauc_scorer(clf, X, y):
    """Score `clf` with safe_roc_auc_score on the second column of predict_proba(X)."""
    second_column_proba = clf.predict_proba(X)[:, 1]
    return safe_roc_auc_score(y, second_column_proba)


def _accuracy_scorer(clf, X, y):
    """Score `clf` with accuracy_score on its hard predictions for X."""
    predicted = clf.predict(X)
    return accuracy_score(y, predicted)


# Metric name -> callable(clf, X, y); passed as `scoring` to the test runner.
scoring = {'ROCAUC': _rocauc_scorer, 'Accuracy': _accuracy_scorer}

In [None]:
# Kernel shared by the GP classifiers in this notebook: a bounded constant factor
# multiplied by a bounded RBF term.
constant_factor = ConstantKernel(constant_value=1, constant_value_bounds=(0.1, 10.0))
rbf_factor = RBF(length_scale=1, length_scale_bounds=(0.01, 10))
kernel = constant_factor * rbf_factor

# Multi-fidelity GP classifier built on that kernel.
mf_gpc = mfgpc.MultiFidelityGaussianProcessClassifier(
    kernel=kernel,
    rho=0.0,
    n_restarts_optimizer=10,
    eval_gradient=True,
)

In [None]:
# Multi-fidelity methods, keyed by name; used for the 'stacking' runs.
methods_mf = {'ss_mf_gpc': SSMF(mf_gpc)}

In [None]:
# Single-fidelity baseline: standardise the features, then fit a plain GP classifier
# with the same kernel. Used for the 'high-fidelity' runs.
ss_gpc_pipeline = make_pipeline(
    StandardScaler(),
    GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10),
)
methods_sf = {'ss_gpc': ss_gpc_pipeline}

In [None]:
import time

In [None]:
# Total labelling budget in cost units; the cells below split it between high- and
# low-fidelity labels and divide each share by the per-label cost to get sample counts.
BUDGET = 800

In [None]:
# Preview the (hf, lf) sample counts implied by every cost pair / budget split.
cost_pairs = [(1, 8), (2, 8), (4, 8)]            # (lf_cost, hf_cost)
hf_ratios = [0., 0.2, 0.4, 0.6, 0.8, 1.0]        # share of BUDGET spent on HF labels
for lf_cost, hf_cost in cost_pairs:
    for hf_budget_ratio in hf_ratios:
        hf_spend = BUDGET * hf_budget_ratio
        lf_spend = BUDGET * (1 - hf_budget_ratio)
        hf = int(np.round(hf_spend / hf_cost))
        lf = int(np.round(lf_spend / lf_cost))
        print(hf_cost, lf_cost, hf_budget_ratio, hf, lf)

In [None]:
# Budget sweep on the artificial datasets.
# For each dataset ID, each noisy-label column, each (lf_cost, hf_cost) pair and each
# share of BUDGET spent on high-fidelity labels, run the classifiers through
# utils_2019.run_tests_all_clfs and write all results for that ID to one CSV.
start_time = time.time()

COST_PAIRS = [(1, 8), (2, 8), (4, 8)]               # (lf_cost, hf_cost)
HF_BUDGET_RATIOS = [0., 0.2, 0.4, 0.6, 0.8, 1.0]    # share of BUDGET spent on HF labels
NOISE_LEVELS = [0.0, 0.2, 0.3, 0.4]                 # suffixes of the noisy-label columns

for ID in range(1, 40, 5):
    artdf = pd.read_csv('Datasets/artifitial/df_' + str(ID) + '.csv')
    # Every column whose name contains 'feature' is an input feature.
    feature_cols = [col for col in artdf.columns if 'feature' in col]
    X = artdf[feature_cols].values
    y_gold = artdf['target_gold'].values
    full_dfs = []
    for c in NOISE_LEVELS:
        # NOTE(review): 'taget_noisy_' (sic) presumably matches the CSV header
        # spelling — verify against the data files before renaming.
        y_corrupted = artdf['taget_noisy_' + str(c)].values
        hf_dfs = []
        for lf_cost, hf_cost in COST_PAIRS:
            for hf_budget_ratio in HF_BUDGET_RATIOS:
                # Round to nearest instead of truncating: `1 - 0.8` evaluates to
                # 0.19999999999999996, so a bare int() would give 159/79/39 low-fidelity
                # samples instead of 160/80/40. This also matches the preview cell.
                hf = int(np.round(BUDGET * hf_budget_ratio / hf_cost))
                lf = int(np.round(BUDGET * (1 - hf_budget_ratio) / lf_cost))

                print(ID, c, lf_cost, hf_cost, hf_budget_ratio, hf, lf)
                kwargs = {
                  'X':X,
                  'y_lf':y_corrupted,
                  'y_hf':y_gold,
                  'y_groundtruth':y_gold,
                  'scoring':scoring,
                  'test_size':2500,
                  'train_lf_size':lf,
                  'train_hf_size':hf,
                  'runs':3,
                  'verbose':True
                }

                if hf_budget_ratio == 0:
                    # No budget for gold labels: pass the noisy labels through the
                    # "high-fidelity" slot so the single-fidelity run trains on them.
                    kwargs['train_hf_size'] = lf
                    kwargs['y_hf'] = y_corrupted # low fidelity instead of high fidelity

                if hf_budget_ratio == 0 or hf_budget_ratio == 1:
                    # Only one label source is available: single-fidelity methods.
                    kwargs['modes'] = ['high-fidelity']
                    df = utils_2019.run_tests_all_clfs(methods_sf, **kwargs)
                else:
                    # Both label sources are available: multi-fidelity methods.
                    kwargs['modes'] = ['stacking']
                    df = utils_2019.run_tests_all_clfs(methods_mf, **kwargs)

                # Tag the results with the configuration that produced them.
                df['hf_cost'] = hf_cost
                df['lf_cost'] = lf_cost
                df['hf_budget_ratio'] = hf_budget_ratio
                df['hf'] = hf
                df['lf'] = lf

                hf_dfs.append(df)
        dfs = pd.concat(hf_dfs, ignore_index=True)
        dfs['noise'] = c
        full_dfs.append(dfs)
    full_dfs = pd.concat(full_dfs, ignore_index=True)
    full_dfs.to_csv('loggers/artifitial_baselines/2019_df_' + str(ID) + '_budget.csv', index=False)
    print(ID, ('%.1f' % ((time.time() - start_time)/60)) + ' min passed')