In [8]:
import os
import numpy as np
import pandas as pd
from collections import Counter

# Generate pandas datasets with each classifier's predictions (argmax of its saved test outputs).

In [9]:
# Root directory that holds the classifier outputs, labels and documents.
# The DATA_SOURCE environment variable overrides the original default so the
# notebook is not tied to one user's absolute home-directory path.
DATA_SOURCE = os.environ.get("DATA_SOURCE", "/home/welton/data")
# Dataset names; each one is used as a directory component and as the output
# CSV file name.
DATASETS = ["webkb", "20ng", "reut", "acm"]
# Split settings to process (used as directory components).
SPLITS = ["split_10"]
# Number of folds; fold directories are indexed 0..N_FOLDS-1.
N_FOLDS = 10

In [10]:
def set_agreement(
    df: pd.DataFrame,
    exclude=("bert", "xlnet", "fold_id", "label", "docs"),
) -> None:
    """Add per-row classifier agreement statistics to ``df`` in place.

    Every column that is neither listed in ``exclude`` nor one of the three
    columns written by this function is treated as one classifier's
    predictions ("voting" columns).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``label`` column plus the voting columns. It is
        modified in place; nothing is returned.
    exclude : iterable of str, optional
        Column names left out of the vote. Names that are absent from ``df``
        are ignored instead of raising ``KeyError``.

    Columns written
    ---------------
    conc_size : how many voting columns share the row's most common prediction.
    hit_counts : how many voting columns equal the row's ``label``.
    conc_pred : the row's most common prediction; ties go to the value that
        appears first in column order.

    Raises
    ------
    IndexError
        If ``df`` has rows but no voting columns remain.
    """
    # The derived columns are always excluded so that calling this function
    # twice on the same frame (e.g. re-running a cell) gives the same result.
    derived = ("conc_size", "hit_counts", "conc_pred")
    df_preds = df.drop(columns=[*exclude, *derived], errors="ignore")

    labels = df["label"].values
    matrix_p = df_preds.values
    # Broadcast the labels against every voting column and count matches per row.
    hits = np.sum(matrix_p == labels[:, None], axis=1)

    pred_list = []
    count_list = []
    for row in matrix_p:
        # most_common keeps first-encountered order among equal counts.
        c_pred, counts = Counter(row).most_common(1)[0]
        pred_list.append(c_pred)
        count_list.append(counts)

    df["conc_size"] = count_list
    df["hit_counts"] = hits
    df["conc_pred"] = pred_list

In [11]:
# For each dataset and split setting, gather every classifier's test
# predictions over all folds into one dataframe, add the labels, fold ids,
# documents and agreement statistics, and save the result as a CSV file.
for dataset in DATASETS:
    for sp in SPLITS:
        d_preds = {}
        print(f"[{dataset.upper()} - {sp.upper()}]")
        CLFS_DIR = f"{DATA_SOURCE}/clfs_output/{sp}/{dataset}/{N_FOLDS}_folds/"
        # os.listdir returns entries in arbitrary order and may contain stray
        # files. Sorting and keeping only directories makes the column order
        # (and therefore the tie-breaking in set_agreement) reproducible.
        clf_names = sorted(
            name
            for name in os.listdir(CLFS_DIR)
            if os.path.isdir(os.path.join(CLFS_DIR, name))
        )
        # For each classifier.
        for clf in clf_names:
            y_pred = []
            # For each fold.
            for fold in range(N_FOLDS):
                probas_path = os.path.join(CLFS_DIR, clf, str(fold), "test.npz")
                # Use the archive as a context manager so its file handle is
                # closed right after the array has been read.
                with np.load(probas_path) as npz:
                    test_probas = npz["X_test"]
                # Predicted class = index of the largest value in each row.
                y_pred.append(test_probas.argmax(axis=1))
            d_preds[clf] = np.hstack(y_pred)

        # Adding labels and fold's id.
        fold_id = []
        y_true = []
        docs = []
        for fold in range(N_FOLDS):
            # Loading labels.
            labels_path = f"{DATA_SOURCE}/datasets/labels/{sp}/{dataset}/{fold}/test.npy"
            y = np.load(labels_path)
            y_true.append(y)
            # Loading documents.
            docs_path = f"{DATA_SOURCE}/datasets/documents/{sp}/{dataset}/{fold}/test.csv"
            docs.append(pd.read_csv(docs_path, sep=';').docs.values)
            # One fold id per test example of this fold.
            fold_id.append(np.full(y.shape[0], fold))

        fold_id = np.hstack(fold_id)
        y_true = np.hstack(y_true)
        docs = np.hstack(docs)

        # Building and Saving the dataframe.
        df = pd.DataFrame(d_preds)
        df["label"] = y_true
        df["fold_id"] = fold_id
        df["docs"] = docs

        set_agreement(df)

        # NOTE(review): the output name does not include the split, so with
        # more than one entry in SPLITS later splits overwrite earlier ones.
        output = f"{DATA_SOURCE}/pd_datasets/{dataset}.csv"
        # Make sure the destination directory exists before writing.
        os.makedirs(os.path.dirname(output), exist_ok=True)
        df.to_csv(output, index=False, sep=";")

[WEBKB - SPLIT_10]


In [12]:
# Reload the "reut" dataframe from the CSV location the generation loop writes to
# (same ";" separator as used when saving).
df = pd.read_csv(f"{DATA_SOURCE}/pd_datasets/reut.csv", sep=";")

In [13]:
# List the column names of the loaded dataframe.
df.columns

Index(['spr', 'kpr', 'xtr', 'xfr', 'stmk', 'ltmk', 'lpr', 'str', 'ltr', 'lfr',
       'kfr', 'xlnet', 'ktmk', 'rep_bert', 'ktr', 'bert', 'sfr', 'xtmk',
       'xlnet_softmax', 'xpr', 'label', 'fold_id', 'docs', 'conc_size',
       'hit_counts', 'conc_pred'],
      dtype='object')

In [14]:
# Show the (rows, columns) dimensions of the loaded dataframe.
df.shape

(13327, 26)

In [15]:
from sklearn.metrics import f1_score

# Macro-F1 of the "rep_bert" column against the ground-truth labels.
# scikit-learn's signature is f1_score(y_true, y_pred, ...), so the labels
# go first and the predictions second.
f1_score(df.label.values, df.rep_bert.values, average="macro")

0.3249819352604704

In [16]:
# Display the "docs" column of the loaded dataframe.
df.docs

0        \r no intervention dollar fixed at 1 8063 mark...
1        \r indonesian rupiah slips against mark and ye...
2        \r fed s johnson sees easing of inflationary f...
3        \r swiss consumer prices rise one pct in march...
4        \r u s february consumer prices rose 0 4 pct a...
                               ...                        
13322    \r french 1986 current account surplus revised...
13323    \r german feb current account surplus 6 6 bill...
13324    \r s korea current a c surplus seen near 10 bl...
13325    \r swedish current account deficit rises in ja...
13326    \r baker sees 15 to 20 billion dlr drop in tra...
Name: docs, Length: 13327, dtype: object