In [3]:
import json
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

from sys import path

# Make the "../utils/" directory importable so the local `utils` module
# imported just below can be resolved. The entry is a relative path.
path.append("../utils/")

# Project-local helpers (not visible in this notebook): a dataset loader and
# the IDS_MODELS mapping that is inverted further down to build MODEL_DESC.
from utils import get_datasets, IDS_MODELS


In [4]:
# Dataset identifiers: passed to get_datasets() and used as the keys of
# pd_datasets and as the score columns of the results table.
DATASETS = ["webkb", "20ng", "acm", "reut"]

# Short ids of the classifier outputs to score. Each id is read as a column of
# the per-dataset frame (df[clf]) and looked up in MODEL_DESC.
CLFS_OS = [
    "kfr", "kpr", "ktmk", "ktr",
    "lfr", "lpr", "ltmk", "ltr",
    "sfr", "spr", "stmk", "str",
    "xfr", "xpr", "xtmk", "xtr",
]

# Directory of stored classifier outputs (not referenced by the visible cells).
CLS_OUTPUT_DIR = "/home/welton/data/clfs_output/split_10"

In [5]:
# Reverse lookup of IDS_MODELS: every value becomes a key that points back to
# its original key (short classifier id -> description, judging by the table
# built from it below). Assumes IDS_MODELS is a plain dict -- TODO confirm.
MODEL_DESC = {model_id: desc for desc, model_id in IDS_MODELS.items()}

In [6]:
# Load one prediction table per dataset, keyed by dataset name.
# NOTE(review): "__dset__" looks like a placeholder that get_datasets swaps
# for each dataset name -- confirm against the helper in ../utils/.
pd_datasets = get_datasets(
    DATASETS,
    "/home/welton/data/pd_datasets/__dset__.csv",
    sep=";",
)

In [7]:
# Macro-F1 of every classifier column against the gold `label` column,
# expressed as a percentage rounded to two decimals.
# Result: {dataset name: [score per classifier, in CLFS_OS order]}.
d_scores = {}
for dset in DATASETS:
    df = pd_datasets[dset]
    gold = df.label.values
    d_scores[dset] = [
        np.round(f1_score(gold, df[clf].values, average="macro") * 100, decimals=2)
        for clf in CLFS_OS
    ]

In [8]:
# One row per classifier, one score column per dataset, followed by the
# classifier id and its description looked up in MODEL_DESC.
macro_table = pd.DataFrame(d_scores).assign(
    CLF=CLFS_OS,
    CLF_DESC=[MODEL_DESC[clf_id] for clf_id in CLFS_OS],
)

In [9]:
# Rank the classifiers by their 20ng macro-F1, best first.
macro_table.sort_values("20ng", ascending=False)

Unnamed: 0,webkb,20ng,acm,reut,CLF,CLF_DESC
6,66.27,90.58,69.27,40.66,ltmk,lr/tf_idf_1/meta_features_1/knn_cos
10,65.39,90.32,68.78,39.33,stmk,linear_svm/tf_idf_1/meta_features_1/knn_cos
2,64.81,90.27,65.34,30.82,ktmk,knn/tf_idf_1/meta_features_1/knn_cos
14,69.49,89.9,69.39,40.7,xtmk,xgboost/tf_idf_1/meta_features_1/knn_cos
11,71.59,89.21,67.08,32.36,str,linear_svm/tf_idf_1/fs
5,58.87,88.7,63.44,28.54,lpr,lr/pte_1/raw_folds
7,74.79,88.66,68.55,31.43,ltr,lr/tf_idf_1/fs
9,58.35,88.35,62.76,29.79,spr,linear_svm/pte_1/raw_folds
13,66.34,85.9,61.93,21.68,xpr,xgboost/pte_1/raw_folds
3,58.83,84.06,59.37,29.41,ktr,knn/tf_idf_1/fs


In [15]:
# Keep only the rows whose classifier id contains the letter "k".
# NOTE(review): the recorded output below shows a single row (index 0), yet
# the current CLFS_OS holds several ids containing "k"; the output looks
# stale -- re-run this cell.
macro_table.loc[macro_table["CLF"].str.contains("k")]

Unnamed: 0,webkb,20ng,acm,reut,CLF,CLF_DESC
0,64.81,90.27,65.34,30.82,ktmk,knn/tf_idf_1/meta_features_1/knn_cos


In [17]:
from sklearn.metrics import f1_score

In [22]:
# Per-fold macro-F1 of the stored xlnet / bert test outputs on 20ng.
SOURCE = "/home/welton/data"
# The "SOURE" spelling is kept on purpose: other cells may read this name.
SOURE_PROBAS = f"{SOURCE}/normal_probas/split_10"
SOURCE_LABELS = f"{SOURCE}/datasets/labels/split_10/20ng"
# NOTE(review): the recorded output lists identical scores for xlnet and bert
# down to the last digit; check that both directories hold distinct files.
for clf in ["xlnet", "bert"]:
    clf_dir = f"{SOURE_PROBAS}/20ng/10_folds/{clf}"
    for fold in range(10):
        proba_path = f"{clf_dir}/{fold}/test.npz"
        # np.load on a .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it as soon as the array has been read.
        with np.load(proba_path) as loader:
            # Assumes "X_test" is (n_docs, n_classes) class scores -- TODO confirm.
            y_pred = loader["X_test"].argmax(axis=1)
        y_true = np.load(f"{SOURCE_LABELS}/{fold}/test.npy")
        print(clf, f1_score(y_true, y_pred, average="macro"))
    print()

xlnet 0.8775538536951961
xlnet 0.856963057973722
xlnet 0.8832071217737039
xlnet 0.8674840166254698
xlnet 0.8716155828919359
xlnet 0.8683581464532988
xlnet 0.8700303916805071
xlnet 0.872108048520924
xlnet 0.8693566980883063
xlnet 0.8684728856277486

bert 0.8775538536951961
bert 0.856963057973722
bert 0.8832071217737039
bert 0.8674840166254698
bert 0.8716155828919359
bert 0.8683581464532988
bert 0.8700303916805071
bert 0.872108048520924
bert 0.8693566980883063
bert 0.8684728856277486



In [23]:
# Per-fold macro-F1 of the stored xlnet / bert test outputs on acm.
SOURCE = "/home/welton/data"
# The "SOURE" spelling is kept on purpose: other cells may read this name.
SOURE_PROBAS = f"{SOURCE}/normal_probas/split_10"
SOURCE_LABELS = f"{SOURCE}/datasets/labels/split_10/acm"
# NOTE(review): the recorded output lists identical scores for xlnet and bert
# down to the last digit; check that both directories hold distinct files.
for clf in ["xlnet", "bert"]:
    clf_dir = f"{SOURE_PROBAS}/acm/10_folds/{clf}"
    for fold in range(10):
        proba_path = f"{clf_dir}/{fold}/test.npz"
        # np.load on a .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it as soon as the array has been read.
        with np.load(proba_path) as loader:
            # Assumes "X_test" is (n_docs, n_classes) class scores -- TODO confirm.
            y_pred = loader["X_test"].argmax(axis=1)
        y_true = np.load(f"{SOURCE_LABELS}/{fold}/test.npy")
        print(clf, f1_score(y_true, y_pred, average="macro"))
    print()

xlnet 0.7215781862065933
xlnet 0.7237741970278982
xlnet 0.6929597309310889
xlnet 0.6986625310178657
xlnet 0.6563211983856726
xlnet 0.7039907779761911
xlnet 0.7154685415237751
xlnet 0.6790802023700998
xlnet 0.7139412143249896
xlnet 0.7295789797477954

bert 0.7215781862065933
bert 0.7237741970278982
bert 0.6929597309310889
bert 0.6986625310178657
bert 0.6563211983856726
bert 0.7039907779761911
bert 0.7154685415237751
bert 0.6790802023700998
bert 0.7139412143249896
bert 0.7295789797477954

