In [1]:
import os
from collections import Counter

import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from optuna.integration import OptunaSearchCV
from optuna.distributions import (IntDistribution,
                                    FloatDistribution)
from joblib import load, dump

from local_utils import load_stacking_probs, build_clf_beans

# Configs Optuna
import warnings
from optuna.exceptions import ExperimentalWarning
from optuna.logging import set_verbosity, WARNING

# Only let Optuna log records at WARNING level or above through.
set_verbosity(WARNING)
# Ignore warnings of category ExperimentalWarning (imported from optuna.exceptions above).
warnings.filterwarnings("ignore", category=ExperimentalWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Datasets available to the notebook; the cells below only use the first entry.
DATASETS = ["20ng"]
# Identifiers of the stacking base classifiers whose probabilities are loaded.
CLFS = ["kpr", "ktr", "lpr", "ltr", "sfr", "stmk", "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr", "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert"]
#CLFS = ["kpr", "lfr"]
# Root directory under which the per dataset/classifier/fold output folders are created.
ORACLE_DIR = "/home/welton/data/oracle"
# NOTE(review): not referenced by any visible cell — presumably the name passed to get_clf; confirm.
ESTIMATOR_NAME = "xgboost"

In [11]:
def agreement_mfs(probas, clf_target, fold, train_test):
    """Compare the target classifier's predictions with the vote of the others.

    Args:
        probas: Nested mapping indexed as ``probas[clf][fold][train_test]``;
            each entry is a per-document score matrix (``argmax(axis=1)``
            is taken as the predicted class).
        clf_target: Key of the classifier being analysed.
        fold: Fold key.
        train_test: Split key (e.g. ``"train"`` or ``"test"``).

    Returns:
        Tuple ``(div, agree_sizes)`` of 1-D arrays with one entry per document.
        ``div[i]`` is 0 when the most voted class among the *other*
        classifiers equals the target's prediction and 1 otherwise;
        ``agree_sizes[i]`` is how many of the other classifiers voted for
        that most voted class. Ties are resolved by ``Counter.most_common``
        (the class counted first, i.e. coming from the earliest classifier
        in ``probas`` iteration order).
    """
    target_votes = probas[clf_target][fold][train_test].argmax(axis=1)

    # One column per non-target classifier, one row per document.
    other_votes = np.vstack([
        probas[name][fold][train_test].argmax(axis=1)
        for name in probas
        if name != clf_target
    ]).T

    divergence = []
    agreement_sizes = []
    for doc_idx, target_vote in enumerate(target_votes):
        majority_class, majority_size = Counter(other_votes[doc_idx]).most_common()[0]
        divergence.append(0 if majority_class == target_vote else 1)
        agreement_sizes.append(majority_size)

    return np.array(divergence), np.array(agreement_sizes)

def confidence_rate(probas, labels):
    """Return, per document, the hit rate of the confidence bin of its top class.

    The probability of the predicted (argmax) class is truncated to one
    decimal place to pick a bin in {0.0, 0.1, ..., 0.9}; a probability of
    exactly 1 falls into the 0.9 bin.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: True labels, forwarded to ``build_clf_beans``.

    Returns:
        1-D array with the hit rate associated with each document's bin.
    """
    # NOTE(review): build_clf_beans lives in local_utils; presumably it returns
    # per-bin document counts and per-bin hit counts keyed by the bin's lower
    # bound — confirm there.
    conf_freq, hits = build_clf_beans(probas, labels)

    # Hit rate per bin; bins with no entry in `hits` get 0.
    hits_rate = {}
    for bean in np.arange(0, 1, 0.1):
        bin_key = np.trunc(bean * 10) / 10
        hits_rate[bin_key] = hits[bean] / conf_freq[bean] if bean in hits else 0

    top_classes = probas.argmax(axis=1)
    conf_hit = []
    for doc_probs, top_class in zip(probas, top_classes):
        # Bin of the predicted class's probability.
        doc_bin = np.trunc(doc_probs[top_class] * 10) / 10
        if doc_bin >= 1:
            doc_bin = 0.9
        conf_hit.append(hits_rate[doc_bin])
    return np.array(conf_hit)

def hits_rate_by_class(probas, labels):
    """Return, per document, the hit rate of the class it was predicted as.

    The hit rate of a class is the fraction of documents whose true label is
    that class and whose prediction (argmax of ``probas``) matches it.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: 1-D array of true labels, aligned with the rows of ``probas``.

    Returns:
        1-D float array; entry ``i`` is the hit rate of the class predicted
        for document ``i``. A class that is predicted but never occurs in
        ``labels`` gets 0.0 (no prediction of it can be a hit here) instead
        of raising ``KeyError``.
    """
    preds = probas.argmax(axis=1)
    # Boolean vector: True where the prediction matches the label.
    hits = preds == labels

    class_hits_rate = {}
    for label in np.unique(labels):
        # Documents whose true label is `label`.
        class_docs = labels == label
        class_hits_rate[label] = np.sum(hits[class_docs]) / np.sum(class_docs)

    return np.array([class_hits_rate.get(p, 0.0) for p in preds])

def class_weights(probas, labels):
    """Return, per document, the relative frequency of its predicted class.

    Args:
        probas: 2-D array of class probabilities, one row per document.
        labels: 1-D array of true labels used to compute class frequencies.

    Returns:
        1-D float array; entry ``i`` is the share of ``labels`` equal to the
        class predicted (argmax) for document ``i``. A predicted class that
        never occurs in ``labels`` has frequency 0.0 instead of raising
        ``KeyError``.
    """
    n_docs = labels.shape[0]
    cw = {label: np.sum(labels == label) / n_docs for label in np.unique(labels)}
    preds = probas.argmax(axis=1)
    return np.array([cw.get(p, 0.0) for p in preds])

def get_clf(clf = "xgboost", n_jobs=5):
    """Build an estimator together with its Optuna search space.

    Args:
        clf: ``"xgboost"`` selects an ``XGBClassifier``; any other value
            falls back to a ``GradientBoostingClassifier`` with an empty
            search space.
        n_jobs: Number of threads handed to the XGBoost estimator.

    Returns:
        Tuple ``(estimator, distributions)`` where ``distributions`` maps
        hyperparameter names to Optuna distributions.
    """
    if clf == "xgboost":
        # NOTE(review): recent XGBoost releases deprecate tree_method="gpu_hist"
        # in favour of tree_method="hist" with device="cuda"; kept as-is because
        # the installed version is not visible from here.
        xgb_clf = XGBClassifier(random_state=42, verbosity=0, n_jobs=n_jobs, tree_method="gpu_hist")
        # `eta` is XGBoost's alias for `learning_rate`; sampling both made every
        # trial carry two conflicting values for one parameter, so only
        # `learning_rate` is tuned.
        xgb_space = {
            "n_estimators": IntDistribution(low=100, high=1000, step=50),
            "learning_rate": FloatDistribution(low=.01, high=.5),
            "max_depth": IntDistribution(low=1, high=14),
            "subsample": FloatDistribution(low=.5, high=1.),
            "gamma": FloatDistribution(low=1e-8, high=1.),
            "colsample_bytree": FloatDistribution(low=.5, high=1.)
        }
        return xgb_clf, xgb_space

    # Fallback estimator; seeded like the XGBoost branch for reproducibility.
    return GradientBoostingClassifier(random_state=42), {}

def execute_optimization(
        classifier_name: str,
        file_model: str,
        X_train: np.ndarray,
        y_train: np.ndarray,
        *,
        opt_cv: int = 4,
        opt_n_iter: int = 30,
        opt_scoring: str = "f1_macro",
        opt_n_jobs: int = 5,
        clf_n_jobs: int = 5,
        seed: int = 42,
        load_model: bool = False
) -> BaseEstimator:
    """Tune a scaler + classifier pipeline with Optuna, or reload a saved search.

    Args:
        classifier_name: Name forwarded to ``get_clf``.
        file_model: Path the fitted search is dumped to (and loaded from).
        X_train: Training features.
        y_train: Training targets.
        opt_cv: Number of stratified, shuffled folds used by the search.
        opt_n_iter: Number of Optuna trials.
        opt_scoring: Scoring name handed to the search.
        opt_n_jobs: Parallel jobs of the search.
        clf_n_jobs: ``n_jobs`` forwarded to ``get_clf``.
        seed: Seed for the fold split and for the search.
        load_model: When True and ``file_model`` exists, the stored object
            is returned instead of fitting.

    Returns:
        The fitted ``OptunaSearchCV`` (or whatever object was stored in
        ``file_model`` when it is loaded).
    """
    base_clf, search_space = get_clf(classifier_name, n_jobs=clf_n_jobs)
    steps = [
        # with_mean=False: scale by the standard deviation without centering.
        ("scaler", StandardScaler(with_mean=False)),
        ("classifier", base_clf),
    ]

    # Address the hyperparameters to the "classifier" step of the pipeline.
    prefixed_space = {}
    for param_name, distribution in search_space.items():
        prefixed_space[f"classifier__{param_name}"] = distribution

    folds = StratifiedKFold(opt_cv, shuffle=True, random_state=seed)
    optuna_search = OptunaSearchCV(
        Pipeline(steps),
        prefixed_space,
        cv=folds,
        error_score="raise",
        n_trials=opt_n_iter,
        random_state=seed,
        scoring=opt_scoring,
        n_jobs=opt_n_jobs
    )

    if load_model and os.path.exists(file_model):
        print("\tModel already executed! Loading model...", end="")
        # joblib.load unpickles the file: only point this at trusted paths.
        return load(file_model)

    print("\tExecuting model...", end="")
    optuna_search.fit(X_train, y_train)
    dump(optuna_search, file_model)
    return optuna_search


In [12]:
# Work on the first configured dataset only.
dataset = DATASETS[0]
# Probabilities of every base classifier in CLFS; later cells index the result
# as probas[clf][fold]["train" | "test"].
probas = load_stacking_probs(dataset, CLFS, "train_test")

In [23]:
# Rows appended by the (currently disabled) evaluation block at the end of the loop.
scores = []
# For each fold.
for fold in np.arange(10):
    # Loading labels.
    y_train = np.load(f"/home/welton/data/datasets/labels/split_10/{dataset}/{fold}/train.npy")
    y_test = np.load(f"/home/welton/data/datasets/labels/split_10/{dataset}/{fold}/test.npy")
    # Load fold Meta-Features (Washington).
    dist_train = csr_matrix(np.load(f"/home/welton/data/meta_features/features/dist/{fold}/{dataset}/train.npz")["X_train"]).toarray()
    dist_test = csr_matrix(np.load(f"/home/welton/data/meta_features/features/dist/{fold}/{dataset}/test.npz")["X_test"]).toarray()
    # For each Stacking base model.
    for target_clf in ["lfr"]:

        # Building Meta-Features.
        probas_train = probas[target_clf][fold]["train"]
        probas_test = probas[target_clf][fold]["test"]
        # NOTE(review): the *_test features below are computed from y_test
        # (class frequencies, per-class hit rates, confidence-bin hit rates),
        # i.e. test labels feed the test features. Confirm this is intended
        # for the oracle setting; otherwise derive the statistics from the
        # training split and only apply them to the test predictions.
        cw_train = class_weights(probas_train, y_train)
        cw_test = class_weights(probas_test, y_test)
        hrc_train = hits_rate_by_class(probas_train, y_train)
        hrc_test = hits_rate_by_class(probas_test, y_test)
        conf_train = confidence_rate(probas_train, y_train)
        conf_test = confidence_rate(probas_test, y_test)
        div_train, ags_train = agreement_mfs(probas, target_clf, fold, "train")
        div_test, ags_test = agreement_mfs(probas, target_clf, fold, "test")
        # Fit the scaler on the training split only and reuse it for the test
        # split, so the same agreement size maps to the same scaled value in
        # both (a scaler re-fitted on test uses the test min/max instead).
        ags_scaler = MinMaxScaler().fit(ags_train.reshape(-1, 1))
        scaled_ags_train = ags_scaler.transform(ags_train.reshape(-1, 1)).reshape(-1)
        scaled_ags_test = ags_scaler.transform(ags_test.reshape(-1, 1)).reshape(-1)

        # Joining Meta-Features.
        X_train = np.vstack([
            cw_train,
            hrc_train,
            conf_train,
            div_train,
            ags_train,
            scaled_ags_train
        ]).T

        # Final layout: [base probabilities | distance meta-features | features built above].
        X_train = np.hstack([probas_train, dist_train, X_train])
        X_test = np.vstack([
            cw_test,
            hrc_test,
            conf_test,
            div_test,
            ags_test,
            scaled_ags_test
        ]).T
        X_test = np.hstack([probas_test, dist_test, X_test])


        # Making labels (hit or missed): 1 where the target classifier's
        # prediction equals the true label, 0 otherwise.
        preds_train = probas_train.argmax(axis=1)
        upper_train = np.zeros(preds_train.shape[0])
        upper_train[preds_train == y_train] = 1

        preds_test = probas_test.argmax(axis=1)
        upper_test = np.zeros(preds_test.shape[0])
        upper_test[preds_test == y_test] = 1

        output_dir = f"{ORACLE_DIR}/{dataset}/{target_clf}/{fold}"
        os.makedirs(output_dir, exist_ok=True)
        file_model = f"{output_dir}/model"

        # Disabled evaluation code, kept as an inert string literal: nothing
        # below runs, so `scores` stays empty after this cell.
        """
        error_estimator = XGBClassifier(
                n_estimators=300,
                learning_rate=0.11,
                max_depth=11,
                booster="gbtree",
                colsample_bytree=0.650026359170959,
                random_state=42,
                verbosity=0,
                n_jobs=1,
                tree_method='gpu_hist')

        error_estimator.fit(X_train[:1000], upper_train[:1000])

        # Prediction
        y_pred = error_estimator.predict(X_test)
        
        scores.append([target_clf,
        precision_score(upper_test, y_pred, pos_label=0),
        recall_score(upper_test, y_pred, pos_label=0),
        f1_score(upper_test, y_pred, pos_label=0),
        accuracy_score(upper_test, y_pred)])
        """

In [24]:
# Sanity check: shapes of the feature matrices and hit/miss targets left in the
# kernel by the fold loop (values of its last iteration).
X_train.shape, X_test.shape, upper_train.shape, upper_test.shape

((16970, 466), (1876, 466), (16970,), (1876,))

In [18]:
# Sanity check: train/test probability shapes of the "stmk" classifier on fold 2.
probas["stmk"][2]['train'].shape, probas["stmk"][2]['test'].shape

((16956, 20), (1890, 20))

In [98]:
def get_f1(X_train, X_test, y_train, y_test):
    """Train a fixed-configuration XGBoost model and report its macro-F1.

    Args:
        X_train: Training features.
        X_test: Evaluation features.
        y_train: Training targets.
        y_test: Evaluation targets.

    Returns:
        Macro-averaged F1 of the predictions on ``X_test``; the value is
        also printed.
    """
    xgb_params = dict(
        n_estimators=300,
        learning_rate=0.11,
        max_depth=11,
        booster="gbtree",
        colsample_bytree=0.650026359170959,
        random_state=42,
        verbosity=0,
        n_jobs=1,
        tree_method='gpu_hist',
    )
    model = XGBClassifier(**xgb_params)

    # Fit on the training split, score on the evaluation split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    f1 = f1_score(y_test, predictions, average="macro")
    print(f"F1: {f1}")
    return f1

def feature_selection(X_train, X_test, y_train, y_test):
    """Search for the number of top-ranked features that maximises macro-F1.

    Features are ranked by the importances of a random forest fitted on the
    training data. The search then evaluates ``get_f1`` on the ``pick`` most
    important features in two phases:

    1. Upwards from 5 features in jumps of 10; each time the score does not
       improve the jump is halved, and the phase stops once the jump is 1.
    2. Downwards from the best size found in phase 1, probing sizes just
       below it with a shrinking gap.

    Args:
        X_train: Training feature matrix (2-D array).
        X_test: Evaluation feature matrix with the same columns.
        y_train: Training targets.
        y_test: Evaluation targets.

    Returns:
        Tuple ``(best_pos, forest)``: the best number of top-ranked features
        found and the fitted ``RandomForestClassifier``.

    Notes:
        The score that drives the search is computed on ``(X_test, y_test)``.
        NOTE(review): if that is the final test split the reported F1 is
        optimistic — consider passing a validation split instead.
    """
    # Training RF and building feature importance ranking (most important first).
    forest = RandomForestClassifier(n_estimators=300, max_depth=11, random_state=42, n_jobs=25)
    forest.fit(X_train, y_train)
    ranking = (1 - forest.feature_importances_).argsort()
    n_features = ranking.shape[0]

    # F1 already measured for a given number of features, so that no size is
    # ever retrained (each evaluation fits a full XGBoost model).
    f1_by_size = {}

    def _evaluate(n_top):
        """Macro-F1 obtained with the `n_top` best-ranked features (memoised)."""
        if n_top not in f1_by_size:
            feats_ids = ranking[:n_top]
            f1_by_size[n_top] = get_f1(X_train[:, feats_ids], X_test[:, feats_ids], y_train, y_test)
        return f1_by_size[n_top]

    best_f1 = -1
    best_pos = -1

    # Phase 1: move upwards while there are points left to probe.
    pick = 5
    gap = 10
    while True:
        # Never ask for more features than exist; larger values would just
        # select the same full set again.
        pick = min(pick, n_features)
        f1 = _evaluate(pick)
        # Did the current macro-F1 beat the best one so far?
        improve = best_f1 < f1
        if improve:
            best_f1 = f1
            best_pos = pick
        else:
            # No improvement: shorten the jump.
            gap = max(gap // 2, 1)
        pick = best_pos + gap

        if gap == 1 and not improve:
            break

    # Phase 2: probe sizes below the best position found in phase 1.
    first_best_pos = best_pos
    gap = 10
    pick = first_best_pos - gap + 1
    while pick < first_best_pos:
        # A size below 1 is not a valid feature count (a negative stop would
        # slice from the end of the ranking, 0 would select nothing), so it is
        # treated as "no improvement".
        improved = False
        if pick >= 1:
            f1 = _evaluate(pick)
            if best_f1 < f1:
                best_f1 = f1
                best_pos = pick
                improved = True
        if not improved:
            # No improvement: shorten the distance to the phase-1 optimum.
            gap = gap // 2

        pick = first_best_pos - gap
    print(f"F1: {best_f1} POS: {best_pos}")

    return best_pos, forest

In [99]:
# Runs the search on the X_*/y_* arrays left in the kernel by the fold loop
# (values of the last fold it processed).
# NOTE(review): the targets passed here are the class labels (y_train/y_test),
# not the hit/miss vectors (upper_train/upper_test) — confirm this is intended.
best_pos, forest = feature_selection(X_train, X_test, y_train, y_test)

F1: 0.8230554411911337
F1: 0.831950603343813
F1: 0.8353124483545731
F1: 0.83206159154604
F1: 0.8303922209867708
F1: 0.8322875521993938
F1: 0.8300752996144113
F1: 0.8336074921228812
F1: 0.8358549506847928
F1: 0.8358549506847928
F1: 0.8358121437377386
F1: 0.8358549506847928 POS: 23


In [11]:
# Print one ';'-separated row per entry of `scores`: the first field as-is and
# fields 1-4 multiplied by 100, with '.' replaced by ',' as decimal mark.
# NOTE(review): `scores` is only filled by the disabled block in the fold loop,
# so after a fresh run of the visible cells this prints nothing.
for s in scores:
    print(f"{s[0]};{s[1]*100};{s[2]*100};{s[3]*100};{s[4]*100}".replace('.', ','))

kpr;26,59974905897114;97,6958525345622;41,81459566074951;28,311057108140947
lfr;22,595419847328245;78,3068783068783;35,07109004739336;33,41433778857837
kpr;26,879699248120303;65,0;38,03191489361702;43,30900243309003
lfr;22,85276073619632;75,25252525252525;35,05882352941177;32,846715328467155
kpr;26,7741935483871;71,55172413793103;38,96713615023475;36,662606577344704
lfr;23,06477093206951;70,53140096618358;34,761904761904766;33,252131546894034
kpr;24,27652733118971;69,26605504587155;35,95238095238095;34,47015834348356
lfr;23,625557206537888;85,94594594594595;37,06293706293706;34,22655298416565
kpr;29,97448979591837;94,75806451612904;45,54263565891474;31,5468940316687
lfr;22,755417956656345;76,5625;35,08353221957041;33,73934226552984
kpr;21,875;49,763033175355446;30,39073806078148;41,34146341463414
lfr;24,085365853658537;82,29166666666666;37,264150943396224;35,12195121951219
kpr;27,95969773299748;99,10714285714286;43,614931237721024;29,914529914529915
lfr;23,23076923076923;79,47368421052

In [100]:
# Scratch example: small 5x5 matrix used in the next cell to check column selection.
m = np.array([
    [1,2,3,4,5],
    [1,2,3,4,5],
    [1,2,3,4,5],
    [1,2,3,4,5],
    [1,2,3,4,5]
])
m

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [101]:
# Selecting columns with a list of indices keeps all rows and only columns 1 and 3.
m[:, [1,3]]

array([[2, 4],
       [2, 4],
       [2, 4],
       [2, 4],
       [2, 4]])