In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, precision_score, recall_score

from sys import path

path.append("../analysis/utils/")

from utils import get_datasets

In [None]:
# Datasets processed by the fold loop further down.
DATASETS = ["webkb", "20ng", "acm", "reut"]

# Base classifiers whose outputs are processed. For a quick debugging run,
# temporarily shrink this list to a single entry (e.g. just "rep_bert").
CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk",
    "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr",
    "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert",
]

# A confidence bin whose hit rate on the training fold is below this value is
# treated as a "predicted miss" bin.
THRESHOLD = 0.3

In [None]:
# Load the datasets through the project helper. The "__dset__" token in the
# path looks like a per-dataset placeholder -- presumably substituted inside
# get_datasets; confirm against ../analysis/utils/utils.py.
pd_datasets = get_datasets(
    DATASETS,
    path="../../data/pd_datasets/__dset__.csv",
    sep=";",
)

In [None]:
def build_clf_beans(clf_probas, label):
    """Histogram the top-class confidence of each prediction into 0.1-wide bins.

    For every row of ``clf_probas`` the arg-max column is taken as the
    predicted class and its value is truncated to one decimal place to form
    the bin key ("bean" in this notebook). Values that would land in a bin of
    1 or above are folded into the 0.9 bin.

    Parameters
    ----------
    clf_probas : array with ``.argmax(axis=1)``
        One row per sample, one column per class.
    label : positionally indexable sequence
        ``label[i]`` is compared against the predicted column index of row i.

    Returns
    -------
    (dict, dict)
        ``confidence_freq`` maps bin -> number of predictions in that bin;
        ``hits`` maps bin -> number of those predictions that matched the
        label. A bin with no correct prediction is absent from ``hits``.
    """
    winners = clf_probas.argmax(axis=1)
    confidence_freq, hits = {}, {}

    for row, winner in enumerate(winners):
        # Scale to tenths and truncate: 0.37 -> 3.7 -> 3.0 -> 0.3.
        scaled = clf_probas[row][winner] * 10
        bean = np.trunc(scaled) / 10
        # NOTE(review): for non-float64 inputs the truncated key may be a
        # NumPy scalar of another dtype while this clamp is a Python float;
        # whether both land on the same dict key depends on NumPy's scalar
        # promotion rules -- confirm the dtype of the stored probabilities.
        if bean >= 1:
            bean = 0.9

        # Count the prediction in its bin.
        confidence_freq[bean] = confidence_freq.get(bean, 0) + 1

        # Count it as a hit only when the predicted class matches the label.
        if winner == label[row]:
            hits[bean] = hits.get(bean, 0) + 1

    return confidence_freq, hits

def get_miss_predictor(confidence_freq, hits, threshold=0.3):
    """Flag the confidence bins whose hit rate is below ``threshold``.

    Parameters
    ----------
    confidence_freq : dict
        Maps each confidence bin to the number of predictions that fell in it.
    hits : dict
        Maps a confidence bin to the number of correct predictions in it.
        A bin that is missing here is treated as having zero hits.
    threshold : float, default 0.3
        Bins with ``hits / total`` strictly below this value are flagged.

    Returns
    -------
    dict
        ``{bin: True}`` for every flagged bin; bins that are not flagged are
        simply absent, so callers test membership with ``bin in result``.
    """
    predictor = {}
    # Walk every observed bin, not only the ones that have hits: a bin where
    # the classifier was never right has hit rate 0 and must be flagged too.
    for bean, total in confidence_freq.items():
        if total <= 0:
            # No observations in this bin, so there is no hit rate to judge.
            continue
        hits_rate = hits.get(bean, 0) / total
        if hits_rate < threshold:
            predictor[bean] = True
    return predictor

def predict(X, estimator):
    """Turn each row's top-class confidence into a hit (1) / miss (0) estimate.

    The arg-max value of every row of ``X`` is truncated to one decimal place
    (values reaching 1 are folded into 0.9) and looked up in ``estimator``.

    Parameters
    ----------
    X : array with ``.argmax(axis=1)``
        One row per sample, one column per class.
    estimator : container supporting ``in``
        The set of flagged confidence bins, e.g. the dict returned by
        ``get_miss_predictor``.

    Returns
    -------
    numpy.ndarray
        One entry per row: 0 when the row's bin is in ``estimator``
        (estimated miss), 1 otherwise (estimated hit).
    """
    winners = X.argmax(axis=1)
    estimates = []

    for row, winner in enumerate(winners):
        # Same binning as used when the estimator was built.
        bean = np.trunc(X[row][winner] * 10) / 10
        if bean >= 1:
            bean = 0.9
        # Membership alone decides: flagged bin -> 0, anything else -> 1.
        estimates.append(int(bean not in estimator))

    return np.array(estimates)

In [None]:
# Root directory of the experiment data and number of cross-validation folds.
# Kept as named constants so the loop below can be re-pointed in one place.
DATA_ROOT = "/home/welton/data"
N_FOLDS = 10

# Rows of [dset, clf, fold, prec, rec]; only filled by the evaluation step,
# which is currently disabled at the bottom of the loop body.
scores = []
for dset in DATASETS:
    print(f"{dset.upper()}")
    for clf in CLFS:
        print(f"\t{clf.upper()}")
        for fold in range(N_FOLDS):
            probs_dir = f"{DATA_ROOT}/clfs_output/split_10/{dset}/10_folds/{clf}/{fold}"
            labels_dir = f"{DATA_ROOT}/datasets/labels/split_10/{dset}/{fold}"

            # Loading probabilities. The .npz archives are opened through a
            # context manager so the file handle is released every iteration
            # instead of lingering until garbage collection.
            with np.load(f"{probs_dir}/train.npz") as train_npz:
                X_train = train_npz["X_train"]
            train_labels = np.load(f"{labels_dir}/train.npy")

            with np.load(f"{probs_dir}/test.npz") as test_npz:
                X_test = test_npz["X_test"]

            # Building error estimator from the training fold only.
            confidence_freq, hits = build_clf_beans(X_train, train_labels)
            estimator = get_miss_predictor(confidence_freq, hits, THRESHOLD)

            # Applying estimator on train and test. `train_est` is consumed
            # only by the disabled "hits_rate" save step below.
            train_est = predict(X_train, estimator)
            test_est = predict(X_test, estimator)

            ## Saving the new probabilities (features for the meta-layer).
            #output_dir = f"/home/welton/data/oracle/hits_rate/{THRESHOLD}/{dset}/{clf}/{fold}"
            #os.makedirs(output_dir, exist_ok=True)
            #np.savez(f"{output_dir}/test", y=test_est)
            #np.savez(f"{output_dir}/train", y=train_est)

            # Saving to hits_rate_test: the test split gets the estimates,
            # while the train split is written with the labels loaded from
            # the upper_bound oracle directory rather than `train_est`.
            output_dir = f"{DATA_ROOT}/oracle/hits_rate_test/{THRESHOLD}/{dset}/{clf}/{fold}"
            os.makedirs(output_dir, exist_ok=True)
            with np.load(f"{DATA_ROOT}/oracle/upper_bound/{dset}/{clf}/{fold}/train.npz") as upper_npz:
                train_ground_truth = upper_npz["y"]
            np.savez(f"{output_dir}/test", y=test_est)
            np.savez(f"{output_dir}/train", y=train_ground_truth)

            ## Comparing this strategy with the upper_bound oracle labels (disabled).
            #y_true = np.load(f"/home/welton/data/oracle/upper_bound/{dset}/{clf}/{fold}/test.npz")["y"]
            #prec = np.round(precision_score(y_true, test_est, zero_division=1, pos_label=0) * 100, decimals=2)
            #rec = np.round(recall_score(y_true, test_est, pos_label=0) * 100, decimals=2)
            #print(f"\t\tFOLD: {fold} - Precision: {prec} Recall: {rec}")
            #scores.append([dset, clf, fold, prec, rec])
            


In [None]:
# Column order must follow the row layout used when `scores` is filled,
# i.e. [dset, clf, fold, prec, rec]; listing "Fold" last would put fold
# numbers under "Precision" and shift the two metrics one column over.
df = pd.DataFrame(scores, columns=["DATASET", "CLF", "Fold", "Precision", "Recall"])

# Make sure the relative output directory exists before writing the sheet.
os.makedirs("data", exist_ok=True)
df.to_excel(f"data/{THRESHOLD}.xlsx")


In [None]:
# Sanity check: list the array names stored in one upper_bound archive.
upper_bound_npz = np.load("../../data/oracle/upper_bound/webkb/lfr/0/test.npz")
list(upper_bound_npz.keys())

In [None]:
# Distinct values and their counts in the stored `y` arrays for the 0.1
# directory (20ng / xlnet_softmax / fold 0), shown as (test, train).
# Presumably these are the files written by the "hits_rate" save step -- confirm.
test_counts_t01 = np.unique(
    np.load("../../data/oracle/hits_rate/0.1/20ng/xlnet_softmax/0/test.npz")["y"],
    return_counts=True,
)
train_counts_t01 = np.unique(
    np.load("../../data/oracle/hits_rate/0.1/20ng/xlnet_softmax/0/train.npz")["y"],
    return_counts=True,
)
(test_counts_t01, train_counts_t01)

In [None]:
# Distinct values and their counts in the stored `y` arrays for the 0.2
# directory (20ng / xlnet_softmax / fold 0), shown as (test, train).
# Presumably these are the files written by the "hits_rate" save step -- confirm.
test_counts_t02 = np.unique(
    np.load("../../data/oracle/hits_rate/0.2/20ng/xlnet_softmax/0/test.npz")["y"],
    return_counts=True,
)
train_counts_t02 = np.unique(
    np.load("../../data/oracle/hits_rate/0.2/20ng/xlnet_softmax/0/train.npz")["y"],
    return_counts=True,
)
(test_counts_t02, train_counts_t02)

In [None]:
# Distinct values and their counts in the stored `y` arrays for the 0.3
# directory (20ng / xlnet_softmax / fold 0), shown as (test, train).
# Presumably these are the files written by the "hits_rate" save step -- confirm.
test_counts_t03 = np.unique(
    np.load("../../data/oracle/hits_rate/0.3/20ng/xlnet_softmax/0/test.npz")["y"],
    return_counts=True,
)
train_counts_t03 = np.unique(
    np.load("../../data/oracle/hits_rate/0.3/20ng/xlnet_softmax/0/train.npz")["y"],
    return_counts=True,
)
(test_counts_t03, train_counts_t03)