In [22]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, precision_score, recall_score

from sys import path

path.append("../analysis/utils/")

from utils import get_datasets

In [23]:
# Datasets whose classifier outputs are processed by this notebook.
DATASETS = ["webkb", "20ng", "acm", "reut"]

# Classifiers (sub-directory names of the stored probability outputs) to process.
# For a quick single-classifier run, use instead: CLFS = ["rep_bert"]
CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk",
    "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr",
    "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert",
]

# Minimum top-class probability for a prediction to be flagged 1 by `predict`.
THRESHOLD = 0.1

In [24]:
# Load every dataset listed in DATASETS through the project helper.
# NOTE(review): "__dset__" looks like a placeholder substituted per dataset by get_datasets -- confirm.
pd_datasets = get_datasets(
    DATASETS,
    path="../../data/pd_datasets/__dset__.csv",
    sep=';',
)

In [25]:
def predict(X, THRESHOLD):
    """Flag each row by whether its highest probability reaches a threshold.

    Parameters
    ----------
    X : array-like, 2-D
        One row per document, one column per class.
    THRESHOLD : float
        Minimum top-class probability for a row to be flagged 1.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one entry per row of ``X``: 0 when the row's
        largest value is strictly below ``THRESHOLD``, 1 otherwise.
    """
    top_probability = np.asarray(X).max(axis=1)
    # Written as "not (p < t)" instead of "p >= t" so that a NaN top
    # probability is still flagged 1: any comparison with NaN is False.
    return (~(top_probability < THRESHOLD)).astype(int)

In [26]:
# Root directory of the classifier outputs and oracle files; every path below is built from it.
DATA_ROOT = "/home/welton/data"
N_FOLDS = 10

# Filled only by the (currently disabled) evaluation code at the end of the loop body.
scores = []
for dset in DATASETS:
    print(f"{dset.upper()}")
    for clf in CLFS:
        print(f"\t{clf.upper()}")
        for fold in range(N_FOLDS):
            probs_dir = f"{DATA_ROOT}/clfs_output/split_10/{dset}/10_folds/{clf}/{fold}"
            upper_bound_dir = f"{DATA_ROOT}/oracle/upper_bound/{dset}/{clf}/{fold}"
            output_dir = f"{DATA_ROOT}/oracle/confidence/{THRESHOLD}/{dset}/{clf}/{fold}"

            # Test rows: 1 when the top-class probability reaches THRESHOLD, else 0.
            # The archive is opened in a `with` block so its file handle is closed promptly.
            with np.load(f"{probs_dir}/test.npz") as test_probs:
                test_est = predict(test_probs["X_test"], THRESHOLD)

            # Training rows are all labelled 1; the upper-bound file is read
            # only to learn how many training rows there are.
            with np.load(f"{upper_bound_dir}/train.npz") as upper_bound_train:
                n_train = upper_bound_train["y"].shape[0]
            normal = np.ones(n_train)

            # Saving the confidence-based labels.
            os.makedirs(output_dir, exist_ok=True)
            np.savez(f"{output_dir}/test", y=test_est)
            np.savez(f"{output_dir}/train", y=normal)

            ## Optional evaluation (disabled): compare the thresholded labels with the
            ## upper-bound oracle labels of the test split. Uncomment to fill `scores`.
            #y_true = np.load(f"/home/welton/data/oracle/upper_bound/{dset}/{clf}/{fold}/test.npz")["y"]
            #prec = np.round(precision_score(y_true, test_est, zero_division=1, pos_label=0) * 100, decimals=2)
            #rec = np.round(recall_score(y_true, test_est, pos_label=0) * 100, decimals=2)
            #print(f"\t\tFOLD: {fold} - Precision: {prec} Recall: {rec}")
            #scores.append([dset, clf, fold, prec, rec])
            


WEBKB
	KPR
	KTR
	LPR
	LTR
	SFR
	STMK
	XFR
	XPR
	XTR
	KFR
	KTMK
	LFR
	LTMK
	SPR
	STR
	XLNET_SOFTMAX
	XTMK
	REP_BERT
20NG
	KPR
	KTR
	LPR
	LTR
	SFR
	STMK
	XFR
	XPR
	XTR
	KFR
	KTMK
	LFR
	LTMK
	SPR
	STR
	XLNET_SOFTMAX
	XTMK
	REP_BERT
ACM
	KPR
	KTR
	LPR
	LTR
	SFR
	STMK
	XFR
	XPR
	XTR
	KFR
	KTMK
	LFR
	LTMK
	SPR
	STR
	XLNET_SOFTMAX
	XTMK
	REP_BERT
REUT
	KPR
	KTR
	LPR
	LTR
	SFR
	STMK
	XFR
	XPR
	XTR
	KFR
	KTMK
	LFR
	LTMK
	SPR
	STR
	XLNET_SOFTMAX
	XTMK
	REP_BERT


In [27]:
# Column labels follow the row layout appended to `scores`:
# [dset, clf, fold, prec, rec]. Listing "Fold" last would label the fold
# index as Precision, precision as Recall and recall as Fold.
df = pd.DataFrame(scores, columns=["DATASET", "CLF", "Fold", "Precision", "Recall"])
# Make sure the relative output directory exists before writing the spreadsheet.
os.makedirs("data", exist_ok=True)
df.to_excel(f"data/{THRESHOLD}.xlsx")


In [28]:
# List the array names stored in one upper-bound oracle archive.
np.load("../../data/oracle/upper_bound/webkb/lfr/0/test.npz").files

['y']

In [29]:
# Label distribution (unique values, counts) of the stored 'y' arrays at 0.1: test split first, then train.
tuple(
    np.unique(np.load(npz_path)['y'], return_counts=True)
    for npz_path in (
        "../../data/oracle/hits_rate/0.1/20ng/xlnet_softmax/0/test.npz",
        "../../data/oracle/hits_rate/0.1/20ng/xlnet_softmax/0/train.npz",
    )
)

((array([0, 1]), array([   3, 1889])), (array([0, 1]), array([   34, 16920])))

In [30]:
# Label distribution (unique values, counts) of the stored 'y' arrays at 0.2: test split first, then train.
tuple(
    np.unique(np.load(npz_path)['y'], return_counts=True)
    for npz_path in (
        "../../data/oracle/hits_rate/0.2/20ng/xlnet_softmax/0/test.npz",
        "../../data/oracle/hits_rate/0.2/20ng/xlnet_softmax/0/train.npz",
    )
)

((array([0, 1]), array([  63, 1829])), (array([0, 1]), array([  701, 16253])))

In [31]:
# Label distribution (unique values, counts) of the stored 'y' arrays at 0.3: test split first, then train.
tuple(
    np.unique(np.load(npz_path)['y'], return_counts=True)
    for npz_path in (
        "../../data/oracle/hits_rate/0.3/20ng/xlnet_softmax/0/test.npz",
        "../../data/oracle/hits_rate/0.3/20ng/xlnet_softmax/0/train.npz",
    )
)

((array([0, 1]), array([ 196, 1696])), (array([0, 1]), array([ 1904, 15050])))