In [None]:
%load_ext autoreload

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ConstantKernel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, OneHotEncoder

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import pickle

In [None]:
import warnings
#warnings.filterwarnings('error', category=UserWarning)
warnings.filterwarnings('ignore')

In [None]:
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set_style("whitegrid")

In [None]:
%autoreload
import utils_2019
import mfgpc_opt as mfgpc
from utilities_new import SSMF, MajorClassClassifier, safe_roc_auc_score, get_binary_dataset
import utils_hetmogp

In [None]:
from pmlb import fetch_data

In [None]:
# Metric name -> scorer. Every scorer takes (clf, X, y):
#   ROCAUC   scores the second column of clf.predict_proba(X) against y,
#   Accuracy scores clf.predict(X) against y.
scoring = dict(
    ROCAUC=lambda clf, X, y: safe_roc_auc_score(y, clf.predict_proba(X)[:, 1]),
    Accuracy=lambda clf, X, y: accuracy_score(y, clf.predict(X)),
)

In [None]:
# GP kernel: bounded constant factor times a bounded RBF. The same kernel
# object is handed to the multi-fidelity GP classifier below.
kernel = (
    ConstantKernel(constant_value=1, constant_value_bounds=(0.1, 10.0))
    * RBF(length_scale=1, length_scale_bounds=(0.01, 10))
)
mf_gpc = mfgpc.MultiFidelityGaussianProcessClassifier(
    kernel=kernel,
    rho=0.0,
    n_restarts_optimizer=10,
    eval_gradient=True,
)

In [None]:
# Baseline classifiers keyed by a short method name.
methods = {
    'ss_gpc': make_pipeline(
        StandardScaler(),
        GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10),
    ),
    'ss_mf_gpc': SSMF(mf_gpc),
    'ss_logit': make_pipeline(StandardScaler(), LogisticRegression()),
    'xgb': XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05, subsample=0.85),
    'major_vote': MajorClassClassifier(),
    # 'hetmogp': SSMF(utils_hetmogp.HetmogpWrapeper()),  # disabled
}

In [None]:
# NOTE(review): rebinding `methods` here discards every baseline registered in
# the previous cell, so the experiment loops below evaluate only 'hetmogp'.
# The result files still use the '*_basic.csv' names - confirm this does not
# overwrite earlier baseline results.
methods = {'hetmogp': SSMF(utils_hetmogp.HetmogpWrapeper(M=20))}

# Artificial datasets

In [None]:
import time

In [None]:
# Run every registered method on artificial datasets 0..39, once per noise
# level of the low-fidelity labels, and write one CSV of results per dataset.
start_time = time.time()
for ID in range(40):
    artdf = pd.read_csv('Datasets/artifitial/df_' + str(ID) + '.csv')
    feature_columns = [col for col in artdf.columns if 'feature' in col]
    X = artdf[feature_columns].values
    y_gold = artdf['target_gold'].values

    full_dfs = []
    for c in (0.2, 0.4):
        # Column name spelling ('taget') must match the CSV header.
        y_corrupted = artdf['taget_noisy_' + str(c)].values

        hf_dfs = []
        for hf in (75,):
            print(ID, c, hf)
            # Gold labels act both as the high-fidelity labels and as ground truth;
            # the low-fidelity training set is three times the high-fidelity one.
            kwargs = dict(
                X=X,
                y_lf=y_corrupted,
                y_hf=y_gold,
                y_groundtruth=y_gold,
                scoring=scoring,
                test_size=2500,
                train_lf_size=hf*3,
                train_hf_size=hf,
                runs=3,
                verbose=True,
            )
            df = utils_2019.run_tests_all_clfs(methods, **kwargs)
            df['hf'] = hf
            hf_dfs.append(df)

        dfs = pd.concat(hf_dfs, ignore_index=True)
        dfs['noise'] = c
        full_dfs.append(dfs)

    full_dfs = pd.concat(full_dfs, ignore_index=True)
    full_dfs.to_csv('loggers/artifitial_baselines/2019_df_' + str(ID) + '_basic.csv', index=False)

    elapsed_minutes = (time.time() - start_time) / 60
    print(ID, ('%.1f' % elapsed_minutes) + ' min passed')

# Real datasets

In [None]:
# Names of the real benchmark datasets passed to get_binary_dataset.
aliases = [
    'diabetes',
    'german',
    'satimage-1',
    'mushroom',
    'splice',
    'spambase',
    'hypothyroid',
    'waveform-40',
]

In [None]:
# Sanity check: load each dataset and print len(X) (its number of samples,
# assuming X holds one row per sample).
for alias in aliases:
    X, y = get_binary_dataset(alias)
    print(len(X))

In [None]:
# Run every registered method on each real dataset. Low-fidelity labels are
# the true labels with a fraction `c` of entries flipped (mod-2 arithmetic,
# so `y` is assumed to be 0/1 - TODO confirm against get_binary_dataset).
start_time = time.time()
for alias in aliases:
    X, y = get_binary_dataset(alias)

    full_dfs = []
    for c in (0.2, 0.4):
        # Re-seeding for each noise level makes both levels threshold the same
        # uniform draw.
        np.random.seed(0)
        flip_mask = np.random.rand(len(y)) < c
        y_corrupted = (y + flip_mask.astype(int)) % 2

        hf_dfs = []
        for hf in tqdm([75]):
            # The test set takes whatever remains after the low-fidelity
            # training samples, minus one.
            kwargs = dict(
                X=X,
                y_lf=y_corrupted,
                y_hf=y,
                y_groundtruth=y,
                scoring=scoring,
                test_size=len(X) - hf*3 - 1,
                train_lf_size=hf*3,
                train_hf_size=hf,
                runs=3,
                verbose=True,
            )
            df = utils_2019.run_tests_all_clfs(methods, **kwargs)
            df['hf'] = hf
            hf_dfs.append(df)

        dfs = pd.concat(hf_dfs, ignore_index=True)
        dfs['noise'] = c
        full_dfs.append(dfs)

    full_dfs = pd.concat(full_dfs, ignore_index=True)
    full_dfs.to_csv('loggers/artifitial_baselines/2019_df_' + alias + '_basic.csv', index=False)

    elapsed_minutes = (time.time() - start_time) / 60
    print(alias, ('%.1f' % elapsed_minutes) + ' min passed')

# Music genre classification (MTurk labels)

In [None]:
def major_vote(series):
    """Return the most frequent value of ``series``.

    Takes the first label of ``value_counts()``, which orders values by
    descending count.
    """
    counts = series.value_counts()
    return counts.index[0]

def random_vote(series):
    """Return a single value of ``series`` picked at random.

    No ``random_state`` is passed to ``sample``, so reproducibility depends on
    the caller seeding NumPy's global RNG beforehand.
    """
    picked = series.sample(n=1)
    return picked.iloc[0]

In [None]:
# Music-genre "gold" table; later cells read its `class` column and every
# column whose name contains 'feature'.
data = pd.read_csv(
    'Datasets/mturk-datasets/music_genre_classification/music_genre_gold.csv',
    sep=',',
)

In [None]:
# Genre label <-> integer code. Codes follow the order of value_counts(),
# i.e. the most frequent genre in the gold table gets code 0.
classes = data['class'].value_counts()
class_encoder = {label: code for code, label in enumerate(classes.index)}
class_decoder = {code: label for label, code in class_encoder.items()}

In [None]:
# Crowd-sourced (MTurk) music-genre answers; later cells group them by `id`.
data_mturk = pd.read_csv(
    'Datasets/mturk-datasets/music_genre_classification/music_genre_mturk.csv',
    sep=',',
)

In [None]:
# Add the integer genre code of each crowd answer, using the encoder built
# from the gold table (labels missing from the encoder map to NaN).
data_mturk['class_code'] = data_mturk['class'].map(class_encoder)

In [None]:
# Per-id majority vote over the crowd answers.
data_mturk_majority = (
    data_mturk
    .groupby('id')['class']
    .agg(major_vote)
    .reset_index()
)

In [None]:
# Rename the aggregated column: the majority vote is used as the
# high-fidelity label.
data_mturk_majority.columns = ['id', 'hf_class']

In [None]:
# Attach the majority-vote label to the gold rows; the inner join keeps only
# ids present in both tables.
hf_data = pd.merge(data, data_mturk_majority, on='id', how='inner')

In [None]:
# Low-fidelity label: one randomly chosen crowd answer per id. The seed is
# fixed so the draw is repeatable.
np.random.seed(1)
data_mturk_random = (
    data_mturk[['id', 'class']]
    .groupby('id')
    .agg(random_vote)
    .reset_index()
    .rename(columns={'class': 'lf_class'})
)

In [None]:
# Combine gold, majority-vote and random-vote labels per id, then add the
# integer-coded version of each label column.
mf_data = pd.merge(hf_data, data_mturk_random, on='id', how='inner').assign(
    class_code=lambda frame: frame['class'].map(class_encoder),
    hf_class_code=lambda frame: frame['hf_class'].map(class_encoder),
    lf_class_code=lambda frame: frame['lf_class'].map(class_encoder),
)

In [None]:
# Feature columns are those whose name contains the substring 'feature'.
features = [column for column in data.columns if 'feature' in column]

In [None]:
# One-vs-rest experiment per genre: each label source is binarised to
# "is this the target genre?" and results are written to one CSV per genre.
for target_genre in class_encoder.values():
    is_target = lambda code: int(code == target_genre)

    X = mf_data[features].values
    y = mf_data['hf_class_code'].apply(is_target).values            # majority vote
    y_corrupted = mf_data['lf_class_code'].apply(is_target).values  # random vote
    y_gold = mf_data['class_code'].apply(is_target).values          # gold label

    hf_dfs = []
    for hf in (75,):
        print(target_genre, hf)
        kwargs = dict(
            X=X,
            y_lf=y_corrupted,
            y_hf=y,
            y_groundtruth=y_gold,
            scoring=scoring,
            test_size=len(X) - hf*3 - 1,
            train_lf_size=hf*3,
            train_hf_size=hf,
            runs=3,
            verbose=True,
        )
        df = utils_2019.run_tests_all_clfs(methods, **kwargs)
        df['hf'] = hf
        hf_dfs.append(df)

    dfs = pd.concat(hf_dfs, ignore_index=True)
    dfs.to_csv('loggers/artifitial_baselines/2019_df_' + 'musicgenre_' + class_decoder[target_genre] + '.csv', index=False)

# Sentiment polarity (MTurk labels)

In [None]:
# Crowd-sourced (MTurk) sentiment-polarity answers. This rebinds `data_mturk`,
# replacing the music-genre table loaded earlier.
data_mturk = pd.read_csv(
    'Datasets/mturk-datasets/sentiment_polarity/polarity_mturk_lsa_topics.csv',
    sep=',',
)

In [None]:
# High-fidelity labels: per-id majority vote over the MTurk answers.
hf_data = data_mturk[['id', 'class']].groupby('id').agg(major_vote)
hf_data.columns = ['class_hf']

# Low-fidelity labels: one randomly chosen MTurk answer per id.
# Seed NumPy's global RNG first (as the music-genre section does before its
# random vote) so the draw is reproducible no matter which cells ran before.
np.random.seed(1)
lf_data = data_mturk[['id', 'class']].groupby('id').agg(random_vote)
lf_data.columns = ['class_lf']

# Both frames are indexed by id, so join on the index.
tmp = pd.merge(hf_data, lf_data, left_index=True, right_index=True)

# Displayed result: fraction of ids where the random vote agrees with the
# majority vote.
(tmp['class_hf'] == tmp['class_lf']).mean()

In [None]:
# Sentiment-polarity "gold" table. This rebinds `data`, replacing the
# music-genre gold table loaded earlier.
data = pd.read_csv(
    'Datasets/mturk-datasets/sentiment_polarity/polarity_gold_lsa_topics.csv',
    sep=',',
)

In [None]:
# Attach the majority/random vote columns (indexed by id) to the gold rows.
# NOTE(review): `data` is rebound from itself, so re-running this cell merges
# the vote columns a second time - reload `data` first when re-executing.
data = data.merge(tmp, left_on='id', right_index=True, how='inner')

In [None]:
# Features are the columns whose name contains 'TOPIC'; every label source is
# binarised to 1 for 'pos' and 0 otherwise.
topic_columns = [column for column in data.columns if 'TOPIC' in column]
X = data[topic_columns].values
y = data['class_hf'].eq('pos').values.astype(int)            # majority vote
y_corrupted = data['class_lf'].eq('pos').values.astype(int)  # random vote
y_gold = data['class'].eq('pos').values.astype(int)          # gold label

In [None]:
# Sentiment-polarity experiment: random-vote labels are the low-fidelity
# source, majority-vote labels the high-fidelity source, and gold labels the
# ground truth. Uses 15 runs (3*5) where the other sections use 3.
hf_dfs = []
for hf in [75]:
    # Log the dataset name; the previous `target_genre` was a leftover from
    # the music-genre loop and is unrelated to this experiment.
    print('sentimentpolarity', hf)
    kwargs = {
          'X':X, 
          'y_lf':y_corrupted, 
          'y_hf':y, 
          'y_groundtruth':y_gold, 
          'scoring':scoring,
          'test_size':len(X) - hf*3 - 1, 
          'train_lf_size':hf*3, 
          'train_hf_size':hf, 
          'runs':3*5, 
          'verbose':True
        }
    df = utils_2019.run_tests_all_clfs(methods, **kwargs)
    df['hf'] = hf
    hf_dfs.append(df)
dfs = pd.concat(hf_dfs, ignore_index=True)
dfs.to_csv('loggers/artifitial_baselines/2019_df_sentimentpolarity.csv', index=False)