In [4]:
import os
import numpy as np
import pandas as pd

from sys import path
path.append("../../analysis/utils/")

from utils import get_datasets

In [5]:
# Root folder under which the probability files are read and the oracle
# outputs are written.
DATA_SOURCE = "/home/welton/data"
# Destination root for the generated upper-bound (hit/miss) vectors.
ORACLE_DIR = f"{DATA_SOURCE}/oracle/upper_bound"
# Folder holding the per-fold label arrays.
LABELS_DIR = "/home/welton/data/datasets/labels"
# Datasets to process.
DATASETS = ["20ng", "acm"]

# [classifier name, probability type] pairs; both values are used as path
# components. Every classifier listed here uses the same probability type.
CLFS = [
    [clf_name, "normal_probas"]
    for clf_name in ("bert", "xlnet", "ktmk", "ktr", "lstmk", "lstr", "ltr")
]

# Number of cross-validation folds (also embedded in the directory names).
N_FOLDS = 10

In [6]:
def _load_npz_array(path, key):
    """Read a single array from an .npz archive and close the file afterwards.

    Args:
        path: Location of the .npz file.
        key: Name of the array stored inside the archive.

    Returns:
        The array stored under `key`.
    """
    # np.load on an .npz keeps the file open until the archive is closed;
    # the context manager avoids leaking one handle per fold/classifier.
    with np.load(path) as archive:
        return archive[key]


def _hit_vector(probas, labels):
    """Build the upper-bound (oracle) target for one split.

    Args:
        probas: Array of per-class scores with documents along axis 0 and
            classes along axis 1.
        labels: True class of each document, encoded so that it can be
            compared with the arg-max column index of `probas`.

    Returns:
        float64 array with one entry per document: 1.0 where the arg-max
        class equals the label (a hit), 0.0 otherwise.

    Raises:
        ValueError: If `labels` and the predictions differ in shape. Without
            this check a mismatch can end up saved as an all-zero vector.
    """
    preds = probas.argmax(axis=1)
    labels = np.asarray(labels)
    if preds.shape != labels.shape:
        raise ValueError(
            f"predictions have shape {preds.shape} but labels have shape {labels.shape}"
        )
    return (preds == labels).astype(np.float64)


# For each dataset / classifier / fold, store which documents the classifier
# got right on the train and test splits.
for dataset in DATASETS:
    for clf_name, proba_type in CLFS:
        # Iterate over N_FOLDS (not a literal 10) so the loop always agrees
        # with the fold count embedded in the paths below.
        for fold in range(N_FOLDS):
            probas_dir = f"{DATA_SOURCE}/{proba_type}/split_{N_FOLDS}/{dataset}/{N_FOLDS}_folds/{clf_name}/{fold}"
            labels_dir = f"{LABELS_DIR}/split_{N_FOLDS}/{dataset}/{fold}"
            output_dir = f"{ORACLE_DIR}/{proba_type}/{dataset}/{N_FOLDS}_folds/{clf_name}/{fold}"

            # Load and score both splits before touching the output folder,
            # so a missing input never leaves a half-written fold behind.
            upper_bounds = {}
            for split in ("train", "test"):
                probas = _load_npz_array(f"{probas_dir}/{split}.npz", f"X_{split}")
                labels = np.load(f"{labels_dir}/{split}.npy")
                upper_bounds[split] = _hit_vector(probas, labels)

            os.makedirs(output_dir, exist_ok=True)
            for split, hits in upper_bounds.items():
                # np.savez appends the ".npz" extension itself.
                np.savez(f"{output_dir}/{split}", y=hits)

            # NOTE: a disabled (string-quoted) snippet used to sit here. It
            # wrote "upper_train" / "upper_test" variants in which one split
            # kept its real hit vector and the other was filled with ones.
            # It referenced an undefined name (`clf`) and never ran, so it
            # has been dropped.
            

In [9]:
# Load the stored `y` vectors from the fold-0 train.npz files of two
# classifiers (rep_bert and ltr) under oracle/upper_test/webkb, then list the
# distinct values of their element-wise comparison. A lone True in the output
# means every entry of the two vectors matches.
ct = np.load("/home/welton/data/oracle/upper_test/webkb/rep_bert/0/train.npz")["y"]
ct2 = np.load("/home/welton/data/oracle/upper_test/webkb/ltr/0/train.npz")["y"]
np.unique(ct2 == ct)

array([ True])

In [10]:
# Compare the fold-1 webkb train labels file with the `y_train` array stored
# in the ltr classifier output for the same fold. A lone True in the output
# means the two arrays agree element-wise.
y = np.load("/home/welton/data/datasets/labels/split_10/webkb/1/train.npy")
yc = np.load("/home/welton/data/clfs_output/split_10/webkb/10_folds/ltr/1/train.npz")['y_train']
np.unique(y == yc)

array([ True])

In [11]:
# Element-wise product: multiplying by a 0/1 matrix keeps or zeroes each entry.
np.multiply([[1, 0, 1], [0, 0, 0], [1, 1, 1]], np.full((3, 3), 2))

array([[2, 0, 2],
       [0, 0, 0],
       [2, 2, 2]])

In [12]:
# Count the distinct values of `y` stored in the fold-0 test.npz for the kfr
# classifier under oracle/upper_train/webkb.
np.unique(np.load("/home/welton/data/oracle/upper_train/webkb/kfr/0/test.npz")['y'], return_counts=True)

(array([1.]), array([823]))

In [13]:
# Count the distinct values of `y` stored in the fold-0 train.npz for the kfr
# classifier under oracle/upper_train/webkb.
np.unique(np.load("/home/welton/data/oracle/upper_train/webkb/kfr/0/train.npz")['y'], return_counts=True)

(array([0., 1.]), array([1682, 5694]))