In [44]:
import os
from collections import Counter

import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from optuna.integration import OptunaSearchCV
from optuna.distributions import (IntDistribution,
                                    FloatDistribution)
from joblib import load, dump

from local_utils import load_stacking_probs, build_clf_beans

# Optuna configuration: reduce log noise and silence experimental-feature warnings.
import warnings
from optuna.exceptions import ExperimentalWarning
from optuna.logging import set_verbosity, WARNING

# Set Optuna's logging verbosity to the WARNING level.
set_verbosity(WARNING)
# Ignore every warning whose category is Optuna's ExperimentalWarning.
warnings.filterwarnings("ignore", category=ExperimentalWarning)

In [53]:
# Datasets available to this notebook; the loading cell picks DATASETS[0].
DATASETS = ["webkb"]

# Full catalogue of base classifiers. It used to be assigned to CLFS and then
# immediately overwritten by the subset below (a dead assignment), so it now
# lives under its own name and stays available for full runs.
ALL_CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk", "xfr", "xpr", "xtr",
    "kfr", "ktmk", "lfr", "ltmk", "spr", "str", "xlnet_softmax", "xtmk",
    "rep_bert",
]

# Base classifiers actually used by the cells below. Their order defines the
# column order of the generated meta-features.
CLFS = ["rep_bert", "sfr", "lpr", "ltr", "xtmk"]

# Root directory under which the meta-features are written.
ORACLE_DIR = "/home/welton/data/oracle"
ESTIMATOR_NAME = "xgboost"

In [54]:
# Only the first configured dataset is processed by this notebook.
dataset = DATASETS[0]
# Loaded through local_utils for the classifiers listed in CLFS. The cells
# below index the result as probas[clf][fold][split] and read `.shape` from
# each entry.
# NOTE(review): the exact meaning of the "train_test" argument is defined in
# local_utils.load_stacking_probs — confirm there.
probas = load_stacking_probs(dataset, CLFS, "train_test")

In [60]:
def build_means(probas, fold, train_test):
    """Average, per document, the probability vectors of the classifiers flagged as correct.

    Parameters
    ----------
    probas : mapping
        Indexed as ``probas[clf][fold][train_test]``; each entry is used as a
        2-D array with one row per document.
    fold : int
        Fold identifier; it is also a component of the oracle file paths.
    train_test : str
        Split name. ``"train"`` reads the per-classifier flags from the
        ``upper_test`` directory; any other value reads them from
        ``local_gbm_75``.

    Returns
    -------
    mean_vecs : numpy.ndarray
        One row per document: the mean of the rows of every classifier whose
        flag equals 1 for that document, or a zero vector when none does.
    upper_bound : dict
        Maps each classifier in ``CLFS`` to the ``'y'`` array read from its
        oracle ``.npz`` file.

    Notes
    -----
    Reads the notebook-level names ``CLFS``, ``dataset`` and ``ORACLE_DIR``.
    All classifiers are expected to share the same probability-matrix shape;
    the first entry of ``CLFS`` is used as the shape reference.
    """
    # NOTE(review): presumably "upper_test" holds true hit flags and
    # "local_gbm_75" holds estimated ones — confirm against the oracle pipeline.
    source_dir = "upper_test" if train_test == "train" else "local_gbm_75"

    upper_bound = {}
    for clf in CLFS:
        path = f"{ORACLE_DIR}/{source_dir}/{dataset}/{clf}/{fold}/{train_test}.npz"
        # Close the archive right after reading; the array is fully loaded on access.
        with np.load(path) as oracle:
            upper_bound[clf] = oracle['y']

    # Any configured classifier can serve as the shape reference, so the
    # function no longer requires 'rep_bert' to be part of CLFS.
    reference = probas[CLFS[0]][fold][train_test]

    mean_vecs = []
    # For each document.
    for idx in range(reference.shape[0]):
        # Keep only the vectors of classifiers that predicted this document correctly.
        hit_vecs = [
            probas[clf][fold][train_test][idx]
            for clf in CLFS
            if upper_bound[clf][idx] == 1
        ]
        if hit_vecs:
            # At least one classifier was right: average their vectors.
            mean_vecs.append(np.mean(hit_vecs, axis=0))
        else:
            # Nobody was right: fall back to an all-zero vector.
            mean_vecs.append(np.zeros(reference.shape[1]))
    return np.vstack(mean_vecs), upper_bound

def build_rep(probas, upper_bound, mean_vecs, fold, train_test):
    """Build one meta-feature row per document by concatenating per-classifier vectors.

    Parameters
    ----------
    probas : mapping
        Indexed as ``probas[clf][fold][train_test]``; each entry is used as an
        array with one row per document.
    upper_bound : mapping
        ``upper_bound[clf][idx] == 1`` marks classifier ``clf`` as correct on
        document ``idx``.
    mean_vecs : array-like
        ``mean_vecs[idx]`` is the replacement vector used for every classifier
        not marked as correct on document ``idx``.
    fold : int
        Fold identifier.
    train_test : str
        Split name.

    Returns
    -------
    numpy.ndarray
        One row per document: the horizontal concatenation, in ``CLFS`` order,
        of each classifier's own vector (when marked correct) or
        ``mean_vecs[idx]`` (otherwise).

    Notes
    -----
    Reads the notebook-level name ``CLFS``. The first entry of ``CLFS`` gives
    the number of documents, so 'rep_bert' is no longer required to be present.
    """
    n_docs = probas[CLFS[0]][fold][train_test].shape[0]

    meta_features = []
    # For each document.
    for idx in range(n_docs):
        # Use the classifier's own prediction when it was right; otherwise use
        # the mean of the classifiers that were right.
        doc_parts = [
            probas[clf][fold][train_test][idx]
            if upper_bound[clf][idx] == 1
            else mean_vecs[idx]
            for clf in CLFS
        ]
        meta_features.append(np.hstack(doc_parts))
    return np.vstack(meta_features)

In [61]:
# Build and persist the meta-features of folds 0..9, one train and one test
# file per fold, under {ORACLE_DIR}/upper_mean/{dataset}/all_clfs/{fold}.
for fold in np.arange(10):
    
    # Train split: mean vectors and flags first, then the concatenated rows.
    mean_vecs, upper_bound = build_means(probas, fold, "train")
    X_train = build_rep(probas, upper_bound, mean_vecs, fold, "train")
    output_dir = f"{ORACLE_DIR}/upper_mean/{dataset}/all_clfs/{fold}"
    os.makedirs(output_dir, exist_ok=True)
    # np.savez appends ".npz"; the array is stored under the key "X_train".
    np.savez(f"{output_dir}/train", X_train=X_train)

    # Test split: same steps; mean_vecs and upper_bound are rebound here.
    mean_vecs, upper_bound = build_means(probas, fold, "test")
    X_test = build_rep(probas, upper_bound, mean_vecs, fold, "test")
    # Same directory as the train split (recomputed; exist_ok makes this a no-op).
    output_dir = f"{ORACLE_DIR}/upper_mean/{dataset}/all_clfs/{fold}"
    os.makedirs(output_dir, exist_ok=True)
    # Stored under the key "X_test".
    np.savez(f"{output_dir}/test", X_test=X_test)

In [49]:
# NOTE(review): in the visible cells `meta_features` is only bound as a local
# inside build_rep, so this display presumably relies on a variable left over
# from an unseen or deleted cell — confirm it still runs on a fresh kernel.
meta_features

array([[8.33333333e-01, 1.66666667e-01, 0.00000000e+00, ...,
        4.21319244e-04, 5.94284793e-04, 3.99486191e-04],
       [8.46153846e-01, 0.00000000e+00, 0.00000000e+00, ...,
        2.18985413e-04, 4.91496059e-04, 2.07096644e-04],
       [7.85714286e-01, 7.14285714e-02, 0.00000000e+00, ...,
        3.79652047e-04, 5.02866087e-03, 3.04593123e-04],
       ...,
       [7.69230769e-02, 7.69230769e-02, 0.00000000e+00, ...,
        3.65561433e-03, 7.89485499e-03, 9.73121524e-01],
       [0.00000000e+00, 4.16666667e-01, 0.00000000e+00, ...,
        1.44314524e-02, 5.46286954e-03, 7.20474720e-01],
       [0.00000000e+00, 1.53846154e-01, 0.00000000e+00, ...,
        2.58800550e-03, 2.26827920e-03, 9.89969313e-01]])

In [50]:
# Sanity check: shape of the saved training meta-features for fold 0.
# The dataset name "webkb" and the fold are hard-coded here rather than taken
# from `dataset`.
np.load(f"{ORACLE_DIR}/upper_mean/webkb/all_clfs/0/train.npz")["X_train"].shape

(7376, 126)