In [36]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score

from sys import path

path.append("../analysis/utils/")

from utils import get_datasets
from local_utils import build_clf_beans

In [3]:
# Datasets processed by this notebook. Only "webkb" is enabled; the other
# names are kept here, commented out, exactly as in the original cell.
DATASETS = [
    "webkb",
    # "20ng", "acm", "reut",
]

# Names of the base classifiers whose outputs are read; each name is used as a
# directory component when building input and output paths.
CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk",
    "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr",
    "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert",
]

In [4]:
# Path template for the per-dataset CSV files.
# NOTE(review): "__dset__" looks like a placeholder that get_datasets replaces
# with each dataset name -- confirm against utils.get_datasets.
pd_path = "/home/welton/data/pd_datasets/__dset__.csv"
# Load every dataset listed in DATASETS, passing ';' as the separator argument.
pd_datasets = get_datasets(DATASETS, path=pd_path, sep=';')

In [5]:
def load_stacking_probs(dataset: str, CLFS: list, train_test: str = "train",
                        base_dir: str = "/home/welton/data/clfs_output/split_10"):
    """Load the saved per-fold outputs of each base classifier.

    Files are read from
    ``{base_dir}/{dataset}/10_folds/{clf}/{fold}/{train,test}.npz`` for the
    folds 0..9.

    Parameters
    ----------
    dataset : str
        Dataset name, used as a directory component.
    CLFS : list
        Classifier names; each one is a directory under ``10_folds``.
    train_test : str, default "train"
        ``"train"`` loads only ``X_train`` from ``train.npz``, ``"test"`` loads
        only ``X_test`` from ``test.npz``; any other value loads both.
    base_dir : str
        Root directory of the classifier outputs. The default is the location
        that was previously hard-coded.

    Returns
    -------
    dict
        ``probs[clf][fold][split]`` -> array stored under ``X_{split}``, where
        ``split`` is ``"train"`` and/or ``"test"``.

    Raises
    ------
    FileNotFoundError
        If an expected ``.npz`` file does not exist.
    KeyError
        If a file does not contain the expected ``X_train`` / ``X_test`` array.
    """
    # Any value other than "train" or "test" means "load both splits".
    if train_test == "train":
        splits = ("train",)
    elif train_test == "test":
        splits = ("test",)
    else:
        splits = ("train", "test")

    probs = {}
    for clf in CLFS:
        probs[clf] = {}
        for fold in np.arange(10):
            fold_dir = f"{base_dir}/{dataset}/10_folds/{clf}/{fold}"
            probs[clf][fold] = {}
            for split in splits:
                # The context manager closes the .npz archive once the array
                # has been read into memory. The key is built from the split
                # name, which fixes the former "X_tain" typo in train mode.
                with np.load(f"{fold_dir}/{split}.npz") as archive:
                    probs[clf][fold][split] = archive[f"X_{split}"]
    return probs

def load_labels(dataset: str, fold: int,
                base_dir: str = "/home/welton/data/clfs_output/split_10"):
    """Load the train and test labels of one fold.

    The labels are read from the ``lfr`` classifier's output files,
    ``{base_dir}/{dataset}/10_folds/lfr/{fold}/{train,test}.npz``.
    NOTE(review): this presumably relies on every classifier having been run
    on the same splits, so that the labels stored with ``lfr`` apply to all of
    them -- confirm.

    Parameters
    ----------
    dataset : str
        Dataset name, used as a directory component.
    fold : int
        Fold index, used as a directory component.
    base_dir : str
        Root directory of the classifier outputs. The default is the location
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(y_train, y_test)``: the arrays stored under ``y_train`` in
        ``train.npz`` and ``y_test`` in ``test.npz``.

    Raises
    ------
    FileNotFoundError
        If one of the two ``.npz`` files does not exist.
    KeyError
        If a file does not contain the expected label array.
    """
    fold_dir = f"{base_dir}/{dataset}/10_folds/lfr/{fold}"
    # Use context managers so the .npz archives are closed after reading
    # instead of staying open until garbage collection.
    with np.load(f"{fold_dir}/train.npz") as archive:
        y_train = archive["y_train"]
    with np.load(f"{fold_dir}/test.npz") as archive:
        y_test = archive["y_test"]
    return y_train, y_test

In [35]:
def apply_hits_by_conf(probas, hits_by_conf):
    """Scale every probability by the rate registered for its confidence bin.

    Each value ``p`` is assigned to the bin ``trunc(p * 10) / 10`` (so bins are
    multiples of 0.1); a bin value of 1 or more is folded into the 0.9 bin.
    When that bin is a key of ``hits_by_conf`` the output entry is
    ``p * hits_by_conf[bin]``; otherwise the entry stays 0.

    Parameters
    ----------
    probas : array with at least ``shape[0]`` and ``shape[1]``
        Matrix indexed as ``probas[row][column]``.
    hits_by_conf : dict
        Mapping from bin value to the multiplier applied to that bin.

    Returns
    -------
    numpy.ndarray
        New array of shape ``(probas.shape[0], probas.shape[1])``.
    """
    n_rows, n_cols = probas.shape[0], probas.shape[1]
    weighted = np.zeros((n_rows, n_cols))

    for i, sample in enumerate(probas):
        for j in range(n_cols):
            p = sample[j]
            # Lower edge of the 0.1-wide bin that contains p.
            bucket = np.trunc(p * 10) / 10
            # A bin of 1 (or more) is treated as the top bin, 0.9.
            if bucket >= 1:
                bucket = 0.9
            # Bins without a registered rate keep their zero entry.
            if bucket not in hits_by_conf:
                continue
            weighted[i][j] = p * hits_by_conf[bucket]

    return weighted

In [48]:

# Export loop: for every dataset, fold and classifier, rescale the classifier's
# train and test outputs with apply_hits_by_conf and save them as .npz files
# under /home/welton/data/meta_features/proba_by_conf/.
for dataset in DATASETS:
    print(dataset.upper())
    # NOTE(review): `scores` is not referenced again in this cell.
    scores = []
    # Any mode other than "train"/"test" makes the loader return both splits.
    probs = load_stacking_probs(dataset, CLFS, "train_test")
    # For each fold.
    for fold in np.arange(10):
        y_train, y_test = load_labels(dataset, fold)
        # Number of distinct labels across train and test.
        # NOTE(review): `C` and `majority` are not used after this point in
        # this cell -- possibly leftovers from a voting experiment; confirm.
        C = len(np.unique(np.hstack([y_train, y_test])))
        majority = np.zeros((y_test.shape[0], C))
        # For each classifier.
        for clf in CLFS:
            # Per-bin statistics computed from the TRAIN outputs and labels.
            # NOTE(review): presumably conf_freq counts predictions per
            # confidence bin and hit_counts counts the correct ones -- verify
            # against local_utils.build_clf_beans.
            conf_freq, hit_counts = build_clf_beans(probs[clf][fold]["train"], y_train)
            # Ratio hit_counts / conf_freq per bin; bins missing from
            # hit_counts get 0.
            hits_by_conf = { key: hit_counts[key] / conf_freq[key] if key in hit_counts else 0 for key in conf_freq }
            # Rescale both splits with the ratios derived from the train split.
            new_train_probas = apply_hits_by_conf(probs[clf][fold]["train"], hits_by_conf)
            new_test_probas = apply_hits_by_conf(probs[clf][fold]["test"], hits_by_conf)
            
            output_dir = f"/home/welton/data/meta_features/proba_by_conf/split_10/{dataset}/10_folds/{clf}/{fold}/"
            os.makedirs(output_dir, exist_ok=True)

            # np.savez appends the ".npz" extension, giving train.npz / test.npz.
            np.savez(f"{output_dir}/train", X_train=new_train_probas, y_train=y_train)
            np.savez(f"{output_dir}/test", X_test=new_test_probas, y_test=y_test)


WEBKB


In [47]:
# Sanity check: display the X_train array saved by the export loop for
# dataset "webkb", classifier "kfr", fold 0.
np.load("/home/welton/data/meta_features/proba_by_conf/split_10/webkb/10_folds/kfr/0/train.npz")["X_train"].tolist()

[[0.33883388338833886, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.3852040816326531, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.4237244897959183, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.11910669975186106, 0.0, 0.0, 0.11910669975186106, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.5018939393939394, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.8196957566052843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.2363740676993689, 0.1105