In [1]:
import json
import numpy as np
import pandas as pd

from collections import Counter
from sklearn.metrics import f1_score

from sys import path

path.append('../utils/')

from utils import get_datasets

from utils import load_preds

from scipy.stats import t as table_t

In [2]:
# Datasets processed by this notebook. Only "webkb" is enabled; the wider
# selection that is currently disabled was: "20ng", "acm", "webkb", "reut".
DATASETS = [
    "webkb",
]

# Algorithm identifiers (not referenced by the cells visible in this notebook).
ALGORITHMS = [
    "centroide",
    "gbm",
    "knn",
    "lr",
    "rf",
    "svm",
    "bert",
]

In [3]:
# Load the data for every name in DATASETS through the project helper
# `get_datasets`; later cells index the result by dataset name
# (e.g. pd_datasets["webkb"]).
pd_datasets = get_datasets(DATASETS)

In [None]:
# Pick the WebKB entry; the following cell reads its `folds_id`,
# `meta_layer` and `classes` columns.
df = pd_datasets["webkb"]

In [None]:
# Print the macro-averaged F1 between the `meta_layer` and `classes` columns
# for each of the 10 folds, one value per line.
for fold in np.arange(10):
    in_fold = df.folds_id == fold
    test = df[in_fold]
    fold_macro_f1 = f1_score(
        test.meta_layer.values,
        test.classes.values,
        average='macro',
    )
    print(fold_macro_f1)

In [None]:
# Path handed to the dataset loader for the BERT prediction CSVs.
# NOTE(review): `__dset__` looks like a placeholder that the loader replaces
# with each dataset name — confirm against utils. The absolute home-directory
# path ties this notebook to one machine's layout.
bert_df_path = "/home/welton/data/datasets/pandas/bert/__dset__.csv"

In [None]:
# Load the frames holding the BERT predictions for every name in DATASETS,
# reading from `bert_df_path` with ';' as the field separator.
# Fixed: this call used the misspelled name `get_datatasets`, which is never
# imported or defined in this notebook (only `get_datasets` is) and therefore
# raised NameError.
bert_datasets = get_datasets(DATASETS, bert_df_path, sep=';')

In [None]:
# Rebind `df` to the WebKB entry of the BERT frames; the following cell reads
# its `folds_id`, `bert` and `classes` columns.
df = bert_datasets["webkb"]

In [None]:
# Print the macro-averaged F1 between the `bert` and `classes` columns for
# each of the 10 folds, one value per line.
for fold in np.arange(10):
    in_fold = df.folds_id == fold
    test = df[in_fold]
    fold_macro_f1 = f1_score(
        test.bert.values,
        test.classes.values,
        average='macro',
    )
    print(fold_macro_f1)

In [None]:
# Paired comparison of the `meta_layer` predictions against the `bert`
# predictions. For every dataset this cell:
#   * computes the macro-F1 of both systems on each of the 10 folds,
#   * stores the two mean scores (as percentages, one decimal) in `results`,
#   * prints a two-sided 95% Student-t confidence interval for the mean
#     per-fold difference (meta_layer minus bert).
results = {}

# Two-sided interval at the 5% significance level -> 2.5% in each tail.
# Fixed: this was `0.5 / 2`, which yields a 50% interval rather than the 95%
# interval used by the other interval cell and by the critical-value check.
alpha = 0.05 / 2

for dset in DATASETS:
    results[dset] = []
    orig_st = pd_datasets[dset]
    bert_st = bert_datasets[dset]
    diffs = []
    orig_values = []
    bert_values = []
    for fold in np.arange(10):
        test_orig = orig_st[orig_st.folds_id == fold]
        test_bert = bert_st[bert_st.folds_id == fold]

        # NOTE(review): f1_score's signature is (y_true, y_pred) and the
        # prediction column is passed first here. Macro-F1 is unchanged by
        # swapping the two, so the values are unaffected — confirm intent.
        v1 = f1_score(test_orig.meta_layer.values, test_orig.classes.values, average='macro')
        v2 = f1_score(test_bert.bert.values, test_bert.classes.values, average='macro')

        diffs.append(v1 - v2)
        orig_values.append(v1)
        bert_values.append(v2)

    # Mean macro-F1 of each system, as a percentage rounded to one decimal.
    x1 = np.round(np.mean(orig_values) * 100, decimals=1)
    x2 = np.round(np.mean(bert_values) * 100, decimals=1)
    results[dset].append([x1, x2])

    # Standard error of the mean paired difference.
    # NOTE(review): np.std defaults to ddof=0 (population std); a t-interval
    # normally uses the sample std (ddof=1). Left as-is so this cell stays
    # comparable with the other interval cells — confirm which is intended.
    mean = np.mean(diffs)
    std = np.std(diffs) / np.sqrt(len(diffs))

    # Degrees of freedom follow the number of paired folds instead of a
    # hard-coded 9, so the critical value stays right if the fold count changes.
    t = table_t.ppf(1 - alpha, df=len(diffs) - 1)
    ground = mean - t * std
    ceiling = mean + t * std

    print(f"{dset.upper()} - ({ground} - {ceiling})")

In [None]:
# Display the rounded mean macro-F1 pairs gathered in `results`, with one
# column per dataset key.
pd.DataFrame(results)

In [None]:
# Critical value of Student's t for a two-sided 95% interval (2.5% per tail)
# with 9 degrees of freedom.
table_t.ppf(1 - 0.025, df=9)

In [None]:
from utils import load_preds

In [None]:
# Sanity check that both frames list the "reut" labels in the same order: the
# element-wise comparison collapses to a single True only when every label
# matches.
# NOTE(review): DATASETS currently contains only "webkb"; if the loaders key
# their result by DATASETS, the "reut" lookups on this line will fail — confirm.
np.unique(pd_datasets["reut"].classes.values == bert_datasets["reut"].classes.values)

In [None]:
# Read the `y_pred.npy` array saved for each of the 10 "reut" folds and join
# them, in fold order, into one array with np.hstack.
preds = np.hstack([
    np.load(f"/home/welton/data/stacking/stacking_output/reut/10_folds/logistic_regression/proba/fold_{fold}/y_pred.npy")
    for fold in np.arange(10)
])

In [None]:
# Macro-F1 of the stacked fold predictions against the "reut" labels.
# NOTE(review): this pairs labels in frame row order with predictions
# concatenated in fold order; it is only meaningful if the frame rows are
# sorted by fold — confirm.
f1_score(pd_datasets["reut"].classes.values, preds, average="macro")

In [None]:
# For every dataset, average the "Macro-f1" value stored in the ten per-fold
# BERT prediction JSON files and print it as a percentage.
import os
import json

for dset in DATASETS:
    macro_list = []
    for fold in np.arange(10):
        # Use the first location when it exists, otherwise the second one.
        primary = f"/home/claudiovaliense/projetos/kaggle/{dset}_bert{fold}_pred.json"
        fallback = f"/home/welton/data/kaggle/{dset}_bert{fold}_pred.json"
        jp = primary if os.path.exists(primary) else fallback

        with open(jp, "r") as fd:
            fold_report = json.load(fd)
        macro_list.append(fold_report["Macro-f1"])

    print(f"{dset.upper()} - {np.mean(macro_list)*100}")

In [None]:
# Gather the "f1_macro" entry of each fold's scoring.json for the
# logistic-regression / proba run on "reut".
macro_list = []
for fold in np.arange(10):
    jp = f"/home/welton/data/stacking/stacking_output/reut/10_folds/logistic_regression/proba/fold_{fold}/scoring.json"
    with open(jp, 'r') as fd:
        scoring = json.load(fd)
    macro_list.append(scoring["f1_macro"])

In [None]:
# Display the mean of the per-fold values currently held in `macro_list`.
np.mean(macro_list)

In [None]:
# Paired comparison of the `meta_layer` predictions against the `bert`
# predictions. For every dataset this cell:
#   * computes the macro-F1 of both systems on each of the 10 folds,
#   * stores the two mean scores (as percentages, one decimal) in `results`,
#   * prints a two-sided 95% Student-t confidence interval for the mean
#     per-fold difference (meta_layer minus bert).
results = {}

# Two-sided interval at the 5% significance level -> 2.5% in each tail.
# Fixed: this was `0.5 / 2`, which yields a 50% interval rather than the 95%
# interval used by the scoring.json interval cell further down.
alpha = 0.05 / 2

for dset in DATASETS:
    results[dset] = []
    orig_st = pd_datasets[dset]
    bert_st = bert_datasets[dset]
    diffs = []
    orig_values = []
    bert_values = []
    for fold in np.arange(10):
        test_orig = orig_st[orig_st.folds_id == fold]
        test_bert = bert_st[bert_st.folds_id == fold]

        # NOTE(review): f1_score's signature is (y_true, y_pred) and the
        # prediction column is passed first here. Macro-F1 is unchanged by
        # swapping the two, so the values are unaffected — confirm intent.
        v1 = f1_score(test_orig.meta_layer.values, test_orig.classes.values, average='macro')
        v2 = f1_score(test_bert.bert.values, test_bert.classes.values, average='macro')

        diffs.append(v1 - v2)
        orig_values.append(v1)
        bert_values.append(v2)

    # Mean macro-F1 of each system, as a percentage rounded to one decimal.
    x1 = np.round(np.mean(orig_values) * 100, decimals=1)
    x2 = np.round(np.mean(bert_values) * 100, decimals=1)
    results[dset].append([x1, x2])

    # Standard error of the mean paired difference.
    # NOTE(review): np.std defaults to ddof=0 (population std); a t-interval
    # normally uses the sample std (ddof=1). Left as-is so this cell stays
    # comparable with the other interval cells — confirm which is intended.
    mean = np.mean(diffs)
    std = np.std(diffs) / np.sqrt(len(diffs))

    # Degrees of freedom follow the number of paired folds instead of a
    # hard-coded 9, so the critical value stays right if the fold count changes.
    t = table_t.ppf(1 - alpha, df=len(diffs) - 1)
    ground = mean - t * std
    ceiling = mean + t * std

    print(f"{dset.upper()} - ({ground} - {ceiling})")

In [5]:
# For each dataset, read the per-fold "f1_macro" of the logistic-regression
# runs stored under `encoder/` and `proba/`, keep the [encoder, proba] pairs in
# `fold_values`, and print a two-sided 95% Student-t interval (9 degrees of
# freedom) for the mean per-fold difference (encoder minus proba), rounded to
# four decimals.
fold_values = {}
for dset in DATASETS:
    alpha = 0.05 / 2  # 2.5% in each tail
    diffs = []
    fold_values[dset] = []

    for fold in np.arange(10):
        ppath = f"/home/welton/data/stacking/stacking_output/{dset}/10_folds/logistic_regression/proba/fold_{fold}/scoring.json"
        with open(ppath, 'r') as fd:
            proba_scoring = json.load(fd)
        pf1 = proba_scoring["f1_macro"]

        mpath = f"/home/welton/data/stacking/stacking_output/{dset}/10_folds/logistic_regression/encoder/fold_{fold}/scoring.json"
        with open(mpath, 'r') as fd:
            encoder_scoring = json.load(fd)
        mf1 = encoder_scoring["f1_macro"]

        fold_values[dset].append([mf1, pf1])
        diffs.append(mf1 - pf1)

    # Mean difference and its standard error (np.std with its default ddof).
    mean = np.mean(diffs)
    std = np.std(diffs) / np.sqrt(len(diffs))

    t = table_t.ppf(1 - alpha, df=9)
    margin = t * std
    ground = np.round(mean - margin, decimals=4)
    ceiling = np.round(mean + margin, decimals=4)

    print(f"{dset.upper()} - ({ground} - {ceiling})")

WEBKB - (-0.0073 - 0.0148)


In [6]:
# Print, per dataset, the mean of the two per-fold score columns collected in
# `fold_values`, as percentages rounded to two decimals.
for dset in DATASETS:
    df = pd.DataFrame(fold_values[dset], columns=["mreps", "probs"])

    mreps_mean = np.mean(df["mreps"].values)
    probs_mean = np.mean(df["probs"].values)

    mrep = np.round(mreps_mean * 100, decimals=2)
    prob = np.round(probs_mean * 100, decimals=2)

    print(f"{dset.upper()} - Mix Reps: {mrep} Prob: {prob}")

WEBKB - Mix Reps: 84.02 Prob: 83.65
