In [1]:
import os
from collections import Counter

import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from optuna.integration import OptunaSearchCV
from optuna.distributions import (IntDistribution,
                                    FloatDistribution)
from joblib import load, dump

from local_utils import load_stacking_probs, build_clf_beans

# Configs Optuna
import warnings
from optuna.exceptions import ExperimentalWarning
from optuna.logging import set_verbosity, WARNING

set_verbosity(WARNING)
warnings.filterwarnings("ignore", category=ExperimentalWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Datasets available to the notebook.
DATASETS = ["webkb"]

# Identifiers of the base classifiers handled by the notebook.
CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk",
    "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr",
    "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert",
]

# NOTE(review): machine-specific absolute path; consider making it configurable.
ORACLE_DIR = "/home/welton/data/oracle"

# Name of the estimator family (matches the "xgboost" key understood by get_clf).
ESTIMATOR_NAME = "xgboost"

In [33]:
def agreement_mfs(probas, clf_target, fold, train_test):
    """Compare the target classifier with the majority vote of the other classifiers.

    Args:
        probas: Nested mapping indexed as ``probas[clf][fold][train_test]``; each
            leaf must support ``.argmax(axis=1)`` (one row per document).
        clf_target: Key of the classifier under evaluation; it is left out of the vote.
        fold: Fold key used to index ``probas``.
        train_test: Split key used to index ``probas``.

    Returns:
        Two arrays with one entry per document: ``div`` holds 0 when the target's
        predicted class equals the most voted class among the other classifiers
        and 1 otherwise; ``agree_sizes`` holds the number of votes that most
        voted class received.
    """
    target_preds = probas[clf_target][fold][train_test].argmax(axis=1)

    # One row per document, one column per non-target classifier.
    votes = np.vstack([
        probas[name][fold][train_test].argmax(axis=1)
        for name in probas
        if name != clf_target
    ]).T

    div, agree_sizes = [], []
    for target_class, doc_votes in zip(target_preds, votes):
        # Ties follow the ordering returned by Counter.most_common().
        voted_class, n_votes = Counter(doc_votes).most_common()[0]
        div.append(int(voted_class != target_class))
        agree_sizes.append(n_votes)

    return np.array(div), np.array(agree_sizes)

def confidence_rate(probas, labels):
    """Give each document the hit rate of the confidence bin of its prediction.

    Bins are 0.1 wide and keyed by their lower bound (0.0, 0.1, ..., 0.9).
    The per-bin statistics come from ``build_clf_beans``.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: Labels forwarded to ``build_clf_beans``.

    Returns:
        Array with one hit rate per document.
    """
    # NOTE(review): presumably `conf_freq` counts predictions per bin and `hits`
    # counts the correct ones -- confirm against local_utils.build_clf_beans.
    conf_freq, hits = build_clf_beans(probas, labels)

    # Hit rate for every bin; bins missing from `hits` get a rate of 0.
    hits_rate = {}
    for bean in np.arange(0, 1, 0.1):
        bin_key = np.trunc(bean*10)/10
        hits_rate[bin_key] = hits[bean] / conf_freq[bean] if bean in hits else 0

    conf_hit = []
    for row, predicted_class in zip(probas, probas.argmax(axis=1)):
        # Bin of the probability assigned to the predicted class.
        bean = np.trunc(row[predicted_class] * 10) / 10
        if bean >= 1:
            # A probability of exactly 1.0 belongs to the last bin.
            bean = 0.9
        conf_hit.append(hits_rate[bean])
    return np.array(conf_hit)

def hits_rate_by_class(probas, labels):
    """Give each document the hit rate of the class predicted for it.

    The hit rate of a class is the fraction of documents labelled with that
    class whose predicted class (row-wise argmax of ``probas``) is correct.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: True labels, one per document (array or list-like).

    Returns:
        Array with one hit rate per document. A predicted class that never
        occurs in ``labels`` gets a rate of 0.0 instead of raising ``KeyError``.
    """
    labels = np.asarray(labels)
    preds = probas.argmax(axis=1)
    # Boolean vector marking hits and misses.
    hits = preds == labels

    class_hits_rate = {}
    for label in np.unique(labels):
        # Documents whose true class is `label`.
        class_docs = labels == label
        class_hits_rate[label] = np.sum(hits[class_docs]) / np.sum(class_docs)

    # A class absent from `labels` has no documents to hit, hence rate 0.
    return np.array([class_hits_rate.get(p, 0.0) for p in preds])

def class_weights(probas, labels):
    """Give each document the relative frequency of the class predicted for it.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: True labels used to compute class frequencies (array or list-like).

    Returns:
        Array with one value per document: the share of ``labels`` equal to the
        document's predicted class. A predicted class that never occurs in
        ``labels`` gets 0.0 instead of raising ``KeyError``.
    """
    labels = np.asarray(labels)
    n_docs = labels.shape[0]
    cw = {label: np.sum(labels == label) / n_docs for label in np.unique(labels)}
    preds = probas.argmax(axis=1)
    # A class absent from `labels` has frequency 0.
    return np.array([cw.get(p, 0.0) for p in preds])

def get_clf(clf = "xgboost", n_jobs=5):
    """Build an estimator together with its Optuna search space.

    Args:
        clf: ``"xgboost"`` selects an ``XGBClassifier``; any other value falls
            back to a ``GradientBoostingClassifier`` with an empty search space.
        n_jobs: Number of threads given to the XGBoost estimator (ignored by
            the fallback).

    Returns:
        Tuple ``(estimator, search_space)`` where ``search_space`` maps
        hyperparameter names to Optuna distributions.
    """
    if clf == "xgboost":
        estimator = XGBClassifier(random_state=42, verbosity=0, n_jobs=n_jobs)
        # `eta` is an alias of `learning_rate` in XGBoost, so only one of them
        # is tuned; sampling both makes the effective step size ambiguous and
        # wastes trials on a duplicated dimension.
        search_space = {
            "n_estimators": IntDistribution(low=100, high=1000, step=50),
            "learning_rate": FloatDistribution(low=.01, high=.5),
            "max_depth": IntDistribution(low=1, high=14),
            "subsample": FloatDistribution(low=.5, high=1.),
            "gamma": FloatDistribution(low=1e-8, high=1.),
            "colsample_bytree": FloatDistribution(low=.5, high=1.)
        }
        return estimator, search_space

    # Fallback: seeded like the XGBoost branch so repeated runs are reproducible.
    return GradientBoostingClassifier(random_state=42), {}

def execute_optimization(
        classifier_name: str,
        file_model: str,
        X_train: np.ndarray,
        y_train: np.ndarray,
        *,
        opt_cv: int = 4,
        opt_n_iter: int = 30,
        opt_scoring: str = "f1_macro",
        opt_n_jobs: int = 5,
        clf_n_jobs: int = 5,
        seed: int = 42,
        load_model: bool = False
) -> BaseEstimator:
    """Tune a scaler + classifier pipeline with Optuna, or reload a saved search.

    Args:
        classifier_name: Name forwarded to ``get_clf``.
        file_model: Path where the fitted search object is persisted / read from.
        X_train: Training features.
        y_train: Training targets.
        opt_cv: Number of stratified folds used during the search.
        opt_n_iter: Number of Optuna trials.
        opt_scoring: Scoring name handed to ``OptunaSearchCV``.
        opt_n_jobs: Parallel jobs used by the search.
        clf_n_jobs: Parallel jobs used by the classifier itself.
        seed: Seed for the fold shuffling and for the search.
        load_model: When True and ``file_model`` exists, the stored object is
            returned without fitting.

    Returns:
        The fitted (or reloaded) search object.
    """
    # Check the cache first so nothing is built when a stored model is reused.
    if load_model and os.path.exists(file_model):
        print("\tModel already executed! Loading model...", end="")
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        return load(file_model)

    classifier, hyperparameters = get_clf(classifier_name, n_jobs=clf_n_jobs)
    pipeline = Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("classifier", classifier)
    ])
    # Route every hyperparameter to the "classifier" step of the pipeline.
    hyperparameters = {f"classifier__{k}": v for k, v in hyperparameters.items()}

    optuna_search = OptunaSearchCV(
        pipeline,
        hyperparameters,
        cv=StratifiedKFold(opt_cv, shuffle=True, random_state=seed),
        error_score="raise",
        n_trials=opt_n_iter,
        random_state=seed,
        scoring=opt_scoring,
        n_jobs=opt_n_jobs
    )

    print("\tExecuting model...", end="")
    optuna_search.fit(X_train, y_train)

    # Make sure the destination exists so a finished search is never lost
    # to a missing directory.
    model_dir = os.path.dirname(file_model)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    dump(optuna_search, file_model)

    return optuna_search


In [12]:
# Work on the first configured dataset.
dataset = DATASETS[0]
# Probabilities of every base classifier; later cells index the result as
# probas[clf][fold]["train"] / probas[clf][fold]["test"].
probas = load_stacking_probs(dataset, CLFS, "train_test")

In [None]:
# Build one meta-feature matrix per base classifier, pool them, and train a
# single error estimator that predicts whether a base classifier is right
# (target 1) or wrong (target 0) on a document.
#
# NOTE(review): `fold`, `y_train` and `y_test` are not defined in any visible
# cell; they must already exist in the kernel for this cell to run.
global_X_train = []
global_X_test = []
global_upper_train = []
global_upper_test = []
global_y_train = []
global_y_test = []
# Created once: the pooled score row is appended after the loop.
scores = []

dist_train = csr_matrix(np.load(f"/home/welton/data/meta_features/features/dist/{fold}/{dataset}/train.npz")["X_train"]).toarray()
dist_test = csr_matrix(np.load(f"/home/welton/data/meta_features/features/dist/{fold}/{dataset}/test.npz")["X_test"]).toarray()
for target_clf in CLFS:
    print(target_clf.upper())
    probas_train = probas[target_clf][fold]["train"]
    probas_test = probas[target_clf][fold]["test"]
    # NOTE(review): the *_test features below are computed from y_test, i.e.
    # the test labels leak into the test features -- confirm this is intended.
    cw_train = class_weights(probas_train, y_train)
    cw_test = class_weights(probas_test, y_test)
    hrc_train = hits_rate_by_class(probas_train, y_train)
    hrc_test = hits_rate_by_class(probas_test, y_test)
    conf_train = confidence_rate(probas_train, y_train)
    conf_test = confidence_rate(probas_test, y_test)
    div_train, ags_train = agreement_mfs(probas, target_clf, fold, "train")
    div_test, ags_test = agreement_mfs(probas, target_clf, fold, "test")
    # Fit the scaler on the training split only and reuse it on the test split,
    # so both splits share the same scale and nothing is fitted on test data.
    ags_scaler = MinMaxScaler().fit(ags_train.reshape(-1, 1))
    scaled_ags_train = ags_scaler.transform(ags_train.reshape(-1, 1)).reshape(-1)
    scaled_ags_test = ags_scaler.transform(ags_test.reshape(-1, 1)).reshape(-1)

    X_train = np.vstack([
        cw_train,
        hrc_train,
        conf_train,
        div_train,
        ags_train,
        scaled_ags_train
    ]).T

    X_train = np.hstack([probas_train, dist_train, X_train])
    X_test = np.vstack([
        cw_test,
        hrc_test,
        conf_test,
        div_test,
        ags_test,
        scaled_ags_test
    ]).T

    X_test = np.hstack([probas_test, dist_test, X_test])

    global_X_train.append(X_train)
    global_X_test.append(X_test)

    # Target of the error estimator: 1 where the base classifier is correct.
    preds_train = probas_train.argmax(axis=1)
    upper_train = np.zeros(preds_train.shape[0])
    upper_train[preds_train == y_train] = 1
    global_upper_train.append(upper_train)

    preds_test = probas_test.argmax(axis=1)
    upper_test = np.zeros(preds_test.shape[0])
    upper_test[preds_test == y_test] = 1
    global_upper_test.append(upper_test)

    global_y_train.append(y_train)
    global_y_test.append(y_test)

# Pool the training data of all classifiers. The per-classifier test lists are
# kept as lists because the next cell iterates over them.
global_X_train = np.vstack(global_X_train)
global_upper_train = np.hstack(global_upper_train)
upper_test = np.hstack(global_upper_test)


error_estimator, _ = get_clf("xgboost", n_jobs=15)

_ = error_estimator.fit(global_X_train, global_upper_train)
y_pred = error_estimator.predict(np.vstack(global_X_test))

# Metrics over the pooled test documents of every classifier, so the row is
# labelled "global" rather than with the last classifier of the loop.
# pos_label=0 scores the "base classifier was wrong" class.
scores.append(["global",
precision_score(upper_test, y_pred, pos_label=0),
recall_score(upper_test, y_pred, pos_label=0),
f1_score(upper_test, y_pred, pos_label=0),
accuracy_score(upper_test, y_pred)])


In [None]:
# Evaluate the pooled error estimator separately on each base classifier's
# test split and print one semicolon-separated row per classifier
# (percentages, with decimal commas).
for X_test, upper_test, alg in zip(global_X_test, global_upper_test, CLFS):
    # The fitted model from the previous cell is `error_estimator`; `clf` is
    # not defined in this notebook. X_test is already a 2-D matrix.
    y_pred = error_estimator.predict(X_test)
    pc = precision_score(upper_test, y_pred, pos_label=0)
    rc = recall_score(upper_test, y_pred, pos_label=0)
    f1 = f1_score(upper_test, y_pred, pos_label=0)
    acc = accuracy_score(upper_test, y_pred)
    print(f"{alg};{pc*100};{rc*100};{f1*100};{acc*100}".replace('.',','))


In [5]:
# Distinct values (and their counts) of the 'y' array stored for the train split.
# NOTE(review): the recorded output shows a single value (1.0), so a model
# fitted on this target alone would never see the other class -- verify.
np.unique(np.load("/home/welton/data/oracle/global_xgboost/clfs/webkb/0/train.npz")['y'], return_counts=True)

(array([1.]), array([7376]))

In [6]:
# Distinct values (and their counts) of the 'y' array stored for the test split.
np.unique(np.load("/home/welton/data/oracle/global_xgboost/clfs/webkb/0/test.npz")['y'], return_counts=True)

(array([0, 1]), array([288, 535]))

In [14]:
# Scratch check: a float distribution with the same bounds as the `gamma`
# range declared in get_clf.
x = FloatDistribution(low=1e-8, high=1.)

In [1]:
from joblib import load, dump
import numpy as np

In [3]:
# Reload a persisted model from the backup directory.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
model = load("/home/welton/data/oracle/global_xgboost/error_estimator_bkp/webkb/0/model")

In [3]:
# Best hyperparameters found by the stored search.
# NOTE(review): the recorded output contains 'classifier__booster', which is
# not part of the search space declared in get_clf -- the backup was
# presumably tuned with a different search space.
model.best_params_

{'classifier__n_estimators': 300,
 'classifier__learning_rate': 0.11043661153099077,
 'classifier__max_depth': 11,
 'classifier__subsample': 0.8293121693743628,
 'classifier__booster': 'gbtree',
 'classifier__colsample_bytree': 0.650026359170959}

In [4]:
# Show the stored scoring file of the "kfr" classifier (fold 0).
!cat /home/welton/data/oracle/global_xgboost/clfs/webkb/kfr/0/scoring.json

{"f1_macro": 78.19148936170212, "precision": 79.45945945945945, "recall": 76.96335078534031, "accuracy": 90.03645200486027}

In [8]:
# Show the stored scoring file of the "rep_bert" classifier (fold 0).
!cat /home/welton/data/oracle/global_xgboost/clfs/webkb/rep_bert/0/scoring.json

{"f1_macro": 44.57831325301205, "precision": 50.68493150684932, "recall": 39.784946236559136, "accuracy": 88.82138517618469}

In [34]:
# Distinct values (and their counts) of the 'y' array stored for "kfr", test split.
# NOTE(review): the recorded output lists many small float values rather than
# 0/1 labels -- confirm what this file's 'y' actually holds.
np.unique(np.load("/home/welton/data/oracle/global_xgboost/clfs/webkb/kfr/0/test.npz")['y'], return_counts=True)

(array([5.18560410e-06, 6.79492950e-06, 7.80820847e-06, 8.82148743e-06,
        9.59634781e-06, 1.16229057e-05, 1.23381615e-05, 1.30534172e-05,
        1.31130219e-05, 1.31726265e-05, 1.33514404e-05, 1.45435333e-05,
        1.47819519e-05, 1.55568123e-05, 1.59740448e-05, 1.64508820e-05,
        1.72257423e-05, 2.08616257e-05, 2.18749046e-05, 2.19345093e-05,
        2.29477882e-05, 2.33054161e-05, 2.34842300e-05, 2.44975090e-05,
        2.47955322e-05, 2.51531601e-05, 2.54511833e-05, 3.10540199e-05,
        3.14116478e-05, 3.29017639e-05, 3.29613686e-05, 3.60608101e-05,
        3.65376472e-05, 3.68952751e-05, 3.92198563e-05, 4.00543213e-05,
        4.11272049e-05, 4.19616699e-05, 4.21404839e-05, 4.58359718e-05,
        4.89354134e-05, 5.07235527e-05, 5.69224358e-05, 6.06179237e-05,
        6.07967377e-05, 6.31809235e-05, 6.52074814e-05, 6.59227371e-05,
        6.63995743e-05, 6.72936440e-05, 6.86645508e-05, 7.06911087e-05,
        7.09891319e-05, 7.30752945e-05, 7.56382942e-05, 7.772445