In [1]:
import os
import numpy as np
import pandas as pd
from itertools import product

# Generate Pandas datasets with CLFs probs.

In [2]:
DATA_SOURCE = "/home/welton/data"
DATASETS = ["webkb", "20ng", "reut", "acm"]
SPLITS = ["split_10"]
N_FOLDS = 10

In [3]:
# For each dataset and split setting.
for dataset in DATASETS:
    for sp in SPLITS:
        d_preds = {}
        print(f"[{dataset.upper()} - {sp.upper()}]")
        CLFS_DIR = f"{DATA_SOURCE}/clfs_output/{sp}/{dataset}/{N_FOLDS}_folds/"
        # For each classifier.
        for clf in os.listdir(CLFS_DIR):
            y_pred = []
            # For each fold.
            for fold in np.arange(N_FOLDS):
                test_probas = np.load(f"{CLFS_DIR}/{clf}/{fold}/test.npz")["X_test"]
                y_pred.append(test_probas.argmax(axis=1))
            d_preds[clf] = np.hstack(y_pred)

        # Adding labels and fold's id.
        fold_id = []
        y_true = []
        for fold in np.arange(N_FOLDS):
            labels_path = f"{DATA_SOURCE}/datasets/labels/{sp}/{dataset}/{fold}/test.npy"
            y = np.load(labels_path)
            y_true.append(y)
            fold_id += y.shape[0] * [fold]
        fold_id = np.hstack(fold_id)
        y_true = np.hstack(y_true)
        
        # Building and Saving the dataframe.
        df = pd.DataFrame(d_preds)
        df["label"] = y_true
        df["fold_id"] = fold_id
        
        output = f"{DATA_SOURCE}/pd_datasets/{dataset}.csv"
        df.to_csv(output, index=False, sep=";")

[WEBKB - SPLIT_10]
[20NG - SPLIT_10]
[REUT - SPLIT_10]
[ACM - SPLIT_10]


In [4]:
df = pd.read_csv(f"{DATA_SOURCE}/pd_datasets/reut.csv", sep=";")

In [5]:
df.columns

Index(['spr', 'kpr', 'xtr', 'xfr', 'stmk', 'ltmk', 'lpr', 'str', 'ltr', 'lfr',
       'kfr', 'xlnet', 'ktmk', 'rep_bert', 'ktr', 'bert', 'sfr', 'xtmk',
       'xlnet_softmax', 'xpr', 'label', 'fold_id'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,spr,kpr,xtr,xfr,stmk,ltmk,lpr,str,ltr,lfr,...,ktmk,rep_bert,ktr,bert,sfr,xtmk,xlnet_softmax,xpr,label,fold_id
0,16,16,16,16,44,44,16,16,16,16,...,44,16,16,16,16,44,16,16,2,0
1,16,16,16,16,44,44,16,16,16,16,...,44,16,2,16,16,44,16,16,2,0
2,24,16,24,24,24,24,16,24,24,24,...,16,3,16,24,24,24,24,16,3,0
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,0
4,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,0


In [7]:
from sklearn.metrics import f1_score

f1_score(df.bert.values, df.label.values, average="macro")

0.293016407606806