In [1]:
import os
import numpy as np
import pandas as pd

from sys import path
path.append("../analysis/utils/")

from utils import get_datasets

In [5]:
# Names of the datasets processed by this notebook.
DATASETS = ["webkb", "20ng", "acm"]

# Names of the classifier outputs for which an oracle vector is generated.
# Each name is used as a directory component when reading predictions and
# when writing the oracle files.
CLFS = [
    "kpr", "ktr", "lpr", "ltr", "sfr", "stmk",
    "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr",
    "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert",
]

# Root directory that holds every input/output artifact.
DATA_SOURCE = "/home/welton/data"

# Directory under which the generated oracle (upper-bound) vectors are saved.
ORACLE_DIR = f"{DATA_SOURCE}/oracle/upper_minus_bert_xlnet"

In [6]:
# Load the per-dataset tables through the project helper `get_datasets`.
# The result is later indexed by dataset name (pd_datasets[dset]).
# NOTE(review): "__dset__" looks like a placeholder that the helper replaces
# with each dataset name -- confirm against utils.get_datasets.
pd_datasets = get_datasets(
    DATASETS,
    path=f"{DATA_SOURCE}/pd_datasets/__dset__.csv",
    sep=";",
)

In [7]:
# Build the oracle (upper-bound) hit vectors for every dataset / classifier /
# fold and save them as
#   ORACLE_DIR/<dataset>/<classifier>/<fold>/train.npz and test.npz
# each holding a single array "y": 1.0 where the classifier's argmax prediction
# matches the gold label and 0.0 otherwise.

# For each dataset.
for dset in DATASETS:
    df = pd_datasets[dset]
    # Drop the non-prediction columns. The first attempt also drops
    # "rep_bert_bkp"; when pandas reports a missing column (KeyError) the
    # shorter list is used instead. Only KeyError is caught so that unrelated
    # failures (or a KeyboardInterrupt) are not silently swallowed.
    # NOTE(review): `preds` is not used anywhere else in this cell.
    try:
        preds = df.drop(columns=["label", "fold_id", "rep_bert_bkp", "bert"])
    except KeyError:
        preds = df.drop(columns=["label", "fold_id", "bert"])
    # For each classifier.
    for clf in CLFS:
        # For each fold.
        for fold in range(10):

            # Input locations are derived from DATA_SOURCE so they follow the
            # configuration cell instead of repeating the absolute path.
            clf_dir = f"{DATA_SOURCE}/clfs_output/split_10/{dset}/10_folds/{clf}/{fold}"
            labels_dir = f"{DATA_SOURCE}/datasets/labels/split_10/{dset}/{fold}"

            # Predicted class = argmax over axis 1 of the stored score matrix.
            # The context manager closes each .npz archive as soon as the
            # array has been read, so file handles do not pile up in the loop.
            with np.load(f"{clf_dir}/train.npz") as train_load:
                train_preds = train_load["X_train"].argmax(axis=1)
            train_labels = np.load(f"{labels_dir}/train.npy")

            with np.load(f"{clf_dir}/test.npz") as test_load:
                test_preds = test_load["X_test"].argmax(axis=1)
            test_labels = np.load(f"{labels_dir}/test.npy")

            # Fail loudly on misaligned inputs: an element-wise comparison of
            # arrays with different shapes would not yield a per-document mask.
            if train_preds.shape != train_labels.shape:
                raise ValueError(
                    f"train predictions/labels shape mismatch for {dset}/{clf}/{fold}: "
                    f"{train_preds.shape} vs {train_labels.shape}"
                )
            if test_preds.shape != test_labels.shape:
                raise ValueError(
                    f"test predictions/labels shape mismatch for {dset}/{clf}/{fold}: "
                    f"{test_preds.shape} vs {test_labels.shape}"
                )

            # Creating dir.
            output_dir = f"{ORACLE_DIR}/{dset}/{clf}/{fold}"
            os.makedirs(output_dir, exist_ok=True)

            # Making train/test upper bound.
            if clf in ('xlnet_softmax', 'rep_bert'):
                # These two classifiers get an all-ones vector regardless of
                # their predictions.
                # NOTE(review): presumably related to the "upper_minus_bert_xlnet"
                # oracle variant named in ORACLE_DIR -- confirm the intent.
                train_upper = np.ones(train_preds.shape[0])
                test_upper = np.ones(test_preds.shape[0])
            else:
                # Hit (1.0) on correctly classified documents, miss (0.0) otherwise.
                train_upper = (train_preds == train_labels).astype(float)
                test_upper = (test_preds == test_labels).astype(float)

            # Save the upper bound; np.savez appends the ".npz" extension.
            np.savez(os.path.join(output_dir, "train"), y=train_upper)
            np.savez(os.path.join(output_dir, "test"), y=test_upper)

In [10]:
# Sanity check: distinct values stored in the oracle vectors written for
# webkb / xlnet_softmax / fold 0, shown as [test, train].
# The path is built from ORACLE_DIR so the check reads exactly what the
# generation loop wrote, even if the configuration cell changes.
oracle_fold_dir = f"{ORACLE_DIR}/webkb/xlnet_softmax/0"
[np.unique(np.load(f"{oracle_fold_dir}/{split}.npz")["y"]) for split in ("test", "train")]

[array([1.]), array([1.])]

In [11]:
# Sanity check: distinct values stored in the oracle vectors written for
# webkb / lfr / fold 0, shown as [test, train].
# The path is built from ORACLE_DIR so the check reads exactly what the
# generation loop wrote, even if the configuration cell changes.
oracle_fold_dir = f"{ORACLE_DIR}/webkb/lfr/0"
[np.unique(np.load(f"{oracle_fold_dir}/{split}.npz")["y"]) for split in ("test", "train")]

[array([0., 1.]), array([0., 1.])]

In [None]:
# Compare, element by element, the train oracle vectors stored under the
# "upper_test" oracle directory for rep_bert and ltr (webkb, fold 0).
# The displayed result lists the distinct outcomes of the comparison.
# The data root comes from DATA_SOURCE instead of a repeated absolute path.
upper_test_dir = f"{DATA_SOURCE}/oracle/upper_test/webkb"
ct = np.load(f"{upper_test_dir}/rep_bert/0/train.npz")["y"]
ct2 = np.load(f"{upper_test_dir}/ltr/0/train.npz")["y"]
np.unique(ct2 == ct)

array([ True])

In [None]:
# Check that the standalone train labels file for webkb / fold 1 agrees,
# element by element, with the "y_train" array stored alongside the ltr
# classifier output for the same fold.
# The data root comes from DATA_SOURCE instead of a repeated absolute path.
y = np.load(f"{DATA_SOURCE}/datasets/labels/split_10/webkb/1/train.npy")
yc = np.load(f"{DATA_SOURCE}/clfs_output/split_10/webkb/10_folds/ltr/1/train.npz")["y_train"]
np.unique(y == yc)

array([ True])

In [None]:
# Scratch demo: np.multiply works element by element, so the zeros in the
# first operand zero out the corresponding entries of the second.
hit_mask = [[1, 0, 1], [0, 0, 0], [1, 1, 1]]
weights = [[2, 2, 2], [2, 2, 2], [2, 2, 2]]
np.multiply(hit_mask, weights)

array([[2, 0, 2],
       [0, 0, 0],
       [2, 2, 2]])

In [None]:
# Distinct values (and how often each occurs) in the test oracle vector stored
# under the "upper_train" oracle directory for webkb / kfr / fold 0.
# The data root comes from DATA_SOURCE instead of a repeated absolute path.
np.unique(np.load(f"{DATA_SOURCE}/oracle/upper_train/webkb/kfr/0/test.npz")["y"], return_counts=True)

(array([1.]), array([823]))

In [None]:
# Distinct values (and how often each occurs) in the train oracle vector stored
# under the "upper_train" oracle directory for webkb / kfr / fold 0.
# The data root comes from DATA_SOURCE instead of a repeated absolute path.
np.unique(np.load(f"{DATA_SOURCE}/oracle/upper_train/webkb/kfr/0/train.npz")["y"], return_counts=True)

(array([0., 1.]), array([1682, 5694]))