In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score

from sys import path

path.append("../analysis/utils/")

from utils import get_datasets

In [3]:
DATASETS = ["webkb"]

CLFS = ["rep_bert"]
CLFS = ["kpr", "ktr", "lpr", "ltr", "sfr", "stmk", "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr", "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert"]

THRESHOLD = [0.1, 0.3, 0.5]

In [4]:
pd_datasets = get_datasets(DATASETS, path="../../data/pd_datasets/__dset__.csv", sep=';')

In [5]:
estimators = {
    "webkb": {
        "kpr" : { 0.2: True, 0.3: True, 0.4: True },
        "ktr": {},
        "lpr": { 0.2: True, 0.3: True, 0.4: True, 0.5: True, 0.6: True, 0.7: True},
        "ltr": { 0.1: True, 0.2: True, 0.3: True, 0.4: True, 0.5: True },
        "sfr": { 0.1: True, 0.2: True, 0.3: True },
        "stmk": { 0.1: True },
        "xfr": {},
        "xpr": { 0.1: True, 0.2: True, 0.4: True, 0.6: True},
        "xtr": { 0.1: True, 0.2: True, 0.3: True, 0.4: True, 0.5: True },
        "kfr": { 0.1: True, 0.2: True, 0.3: True, 0.4: True, 0.5: True, 0.5: True, 0.6: True, 0.7: True, 0.8: True },
        "ktmk": { 0.3: True, 0.5: True},
        "lfr": { 0.1: True, 0.2: True, 0.3: True, 0.4: True, 0.5: True, 0.5: True, 0.6: True},
        "ltmk": { 0.3: True, 0.4: True, 0.5: True },
        "spr": { 0.1: True, 0.2: True, 0.3: True },
        "str": { 0.1: True, 0.2: True },
        "xlnet_softmax": { 0.1: True, 0.2: True, 0.3: True, 0.4: True, 0.5: True, 0.5: True },
        "xtmk": { 0.2: True, 0.3: True, 0.5: True, 0.6: True,},
        "rep_bert": { 0.1: True, 0.2: True, 0.3: True }
    }
}

In [6]:
def predict(X, estimator, df):
    
    conc_pred = df.conc_pred.values
    conc_size = df.conc_size.values
    estimates = []
    predictions = X.argmax(axis=1)
    # For each prediction.
    for idx, predicted_class in enumerate(predictions):
        probability = X[idx][predicted_class] * 10
        bean = np.trunc(probability) / 10
        bean = 0.9 if bean >= 1 else bean
        # If this confidence has a miss rate greater than THRESHOLD (wether it is in the dictionary or not)
        if bean in estimator or ( bean < 0.7 and predicted_class != conc_pred[idx] and conc_size[idx] > 1):
            estimates.append(0)
        else:
            estimates.append(1)
    return np.array(estimates)

In [7]:
pd_datasets["webkb"].columns

Index(['spr', 'kpr', 'xtr', 'xfr', 'stmk', 'ltmk', 'lpr', 'str', 'ltr', 'lfr',
       'kfr', 'xlnet', 'ktmk', 'rep_bert', 'ktr', 'bert', 'sfr', 'xtmk',
       'xlnet_softmax', 'xpr', 'label', 'fold_id', 'docs', 'conc_size',
       'hit_counts', 'conc_pred'],
      dtype='object')

In [8]:

scores = []
for dset in DATASETS:
    #print(f"{dset.upper()}")
    for clf in CLFS:
        #print(f"\t{clf.upper()}")
        for fold in np.arange(10):
            probs_dir = f"/home/welton/data/clfs_output/split_10/{dset}/10_folds/{clf}/{fold}"
            # Loading probabilities.
            X_train = np.load(f"{probs_dir}/train.npz")["X_train"]
            labels_dir = f"/home/welton/data/datasets/labels/split_10/{dset}/{fold}"
            train_labels = np.load(f"{labels_dir}/train.npy")
            
            X_test = np.load(f"{probs_dir}/test.npz")["X_test"]
            df = pd_datasets[dset]

            # Applying estimator on train and test.
            estimator = estimators[dset]
            test_est = predict(X_test, estimator, df[df.fold_id == fold])
            
            # Saving probabilities to hits_agreement test.
            output_dir = f"/home/welton/data/oracle/hits_agreement/{dset}/{clf}/{fold}"
            os.makedirs(output_dir, exist_ok=True)
            upper_y_train = np.load(f"/home/welton/data/oracle/upper_bound/{dset}/{clf}/{fold}/train.npz")["y"]
            normal = np.zeros(upper_y_train.shape[0]) + 1
            np.savez(f"{output_dir}/test", y=test_est)
            np.savez(f"{output_dir}/train", y=normal)

            # Comparing this strategy with 
            upper_y_test = np.load(f"/home/welton/data/oracle/upper_bound/{dset}/{clf}/{fold}/test.npz")["y"]
            prec = np.round(precision_score(upper_y_test, test_est, zero_division=1, pos_label=0) * 100, decimals=2)
            rec = np.round(recall_score(upper_y_test, test_est, pos_label=0) * 100, decimals=2)
            macro = np.round(f1_score(upper_y_test, test_est, pos_label=0) * 100, decimals=2)
            #print(f"\t\tFOLD: {fold} - Precision: {prec} Recall: {rec}")
            scores.append([dset, clf, prec, rec, macro, fold])
            df = pd.DataFrame(scores, columns=["DATASET", "CLF", "Precision", "Recall", "Macro", "Fold"])
            df.to_excel(f"data/hits_agree.xlsx", index=False)

In [9]:
list(np.load("../../data/oracle/hits_agreement/webkb/lfr/0/test.npz").keys())

['y']

In [10]:
(np.unique(np.load("../../data/oracle/hits_agreement/webkb/kpr/0/test.npz")['y'], return_counts=True),
np.unique(np.load("../../data/oracle/hits_agreement/webkb/kpr/0/train.npz")['y'], return_counts=True))

((array([0, 1]), array([125, 698])), (array([1.]), array([7376])))