In [1]:
import numpy as np
import pandas as pd
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt

In [2]:
DATA_DIR = "/home/welton/data"
PROBS_DIR = f"{DATA_DIR}/normal_probas/split_10"
LABELS_DIR = f"{DATA_DIR}/datasets/labels/split_10"
UPPER_DIR = f"{DATA_DIR}/oracle/upper_bound/normal_probas"
STACKING_DIR = f"{DATA_DIR}/stacking/stacking_output"
CLFS = ["bert", "xlnet", "ktmk", "ktr", "lstmk", "lstr", "ltr"]
N_FOLDS = 10

In [3]:
clfs_stack = '/'.join(sorted([f"{clf}_normal_probas" for clf in CLFS ]))
clfs_stack

'bert_normal_probas/ktmk_normal_probas/ktr_normal_probas/lstmk_normal_probas/lstr_normal_probas/ltr_normal_probas/xlnet_normal_probas'

In [4]:
def load_probs(probs_source: str,
               dataset: str,
               clfs: list,
               fold: int):

    probas = {}
    for clf in clfs:
        probas[clf] = {}
        probas_dir = f"{probs_source}/{dataset}/10_folds/{clf}/{fold}/"
        train_load = np.load(f"{probas_dir}/train.npz")
        test_load = np.load(f"{probas_dir}/test.npz")
        probas[clf]["X_train"] = train_load["X_train"]
        probas[clf]["X_test"] = test_load["X_test"]
    return probas

def load_y(labels_dir: int, dataset: str, fold: int):

    ldir = f"{labels_dir}/{dataset}/{fold}"
    return np.load(f"{ldir}/train.npy"), np.load(f"{ldir}/test.npy")

def load_upper(data_source: str, dataset: str, clfs: str, fold: int):

    upper = {}
    for clf in clfs:
        upper[clf] = {}
        upper_dir = f"{data_source}/{dataset}/10_folds/{clf}/{fold}/"
        upper[clf]["y_train"] = np.load(f"{upper_dir}/train.npz")['y']
        upper[clf]["y_test"] = np.load(f"{upper_dir}/test.npz")['y']
    return upper

def fix_probas(clfs: list, y: np.ndarray, probas: dict, alfa: float):
    
    new_probas = []
    for clf in clfs:
        clf_probas = []
        y_pred = probas[clf]["X_test"].argmax(axis=1)
        for doc_idx, doc_proba in enumerate(probas[clf]["X_test"]):
            class_proba = y_pred[doc_idx]
            bucket = np.trunc(doc_proba[class_proba] * 10) / 10
            bucket = doc_proba[class_proba]
            if y_pred[doc_idx] != y[doc_idx] and bucket == alfa:
                clf_probas.append(np.zeros(doc_proba.shape[0]))
            else:
                clf_probas.append(doc_proba)
        new_probas.append(np.vstack(clf_probas))
    return np.hstack(new_probas)



In [5]:
meta_layer = "10_folds/logistic_regression/normal_probas"
dataset = "acm"

In [6]:
probas = load_probs(PROBS_DIR, "acm", CLFS, 0)

In [7]:
uppers = load_upper(UPPER_DIR, "acm", CLFS, 0)

In [10]:
stats = {}

max_fixes = 0
for fold in np.arange(10):
    
    stats[fold] = {}

    y_train, y_test = load_y(LABELS_DIR, dataset, fold)
    probas = load_probs(PROBS_DIR, dataset, CLFS, fold)
    model = load(f"{STACKING_DIR}/{dataset}/{meta_layer}/{clfs_stack}/fold_{fold}/model.joblib")
    
    for alfa in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
        
        X_test = fix_probas(CLFS, y_test, probas,  alfa)
        y_pred = model.predict(X_test)
        stats[fold][alfa] = f1_score(y_test, y_pred, average="macro")

In [11]:
mean_alfas = { interval: 0 for interval in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] }
for interval in mean_alfas:
    for fold in np.arange(10):
        mean_alfas[interval] += stats[fold][interval] / 10

In [12]:
mean_alfas

{0.0: 0.7395255498713335,
 0.1: 0.7395255498713335,
 0.2: 0.7395561740122022,
 0.3: 0.7396262544244857,
 0.4: 0.7395890697907915,
 0.5: 0.7395971831847278,
 0.6: 0.7397870396389669,
 0.7: 0.7396918945771279,
 0.8: 0.7397553117099712,
 0.9: 0.7399002885368456,
 1: 0.7396813819926662}

In [None]:
73,95
73,95
73,95
73,96
73,95
73,95
73,97
73,96
73,97
73,99
73,96