In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each cell
# executes, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [1]:
import os
import json
import itertools
import numpy as np
import pandas as pd

from statistics import stdev
from scipy.stats import t as table_t
from scipy.stats import f as table_f

# Never truncate cell text in DataFrame output (the InputType paths shown below are long).
pd.set_option('display.max_colwidth', None)


In [2]:
# Root directory of the stacking results. get_scores_dataset and paired_test read
# "<BASE_DIR>/<dataset>/10_folds/<meta_layer>/<input_type>/fold_<k>/scoring.json" under it.
# NOTE(review): absolute, machine-specific path — the notebook only runs where it exists.
BASE_DIR = "/home/welton/stacking_text_classification/data/stacking_output"
# Datasets, meta-layer classifiers and input configurations to scan; get_scores_dataset
# iterates over their full cartesian product and skips folds with no scoring file.
DATASETS = ["webkb", "acm", "20ng"]
META_LAYERS = ["svm_rbf"]
INPUT_TYPES = [
"num_feats/126/with_proba/True/combination/fwls/centroids",
"num_feats/198/with_proba/True/combination/fwls/centroids",
"num_feats/360/with_proba/True/combination/fwls/centroids",
]

In [3]:
def get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES, base_dir=None, n_folds=10):
    """Collect per-fold F1 scores and summarise them per configuration.

    For every (dataset, meta-layer, input type) combination this reads
    ``<base_dir>/<dataset>/<n_folds>_folds/<meta_layer>/<input_type>/fold_<k>/scoring.json``
    for ``k`` in ``range(n_folds)``; folds whose file does not exist are skipped.

    Args:
        DATASETS: iterable of dataset names.
        META_LAYERS: iterable of meta-layer names.
        INPUT_TYPES: iterable of input-type sub-paths.
        base_dir: root directory of the results; defaults to the module-level
            ``BASE_DIR`` when ``None``.
        n_folds: number of folds to look for (also used in the ``<n>_folds``
            directory name).

    Returns:
        ``(df, dsets_scores)`` where ``df`` has one row per combination with the
        columns Dataset, MetaLayer, InputType, Macro, Std Macro, Micro, Std Micro
        (mean and population std of the fold scores, in percent, rounded to two
        decimals), and ``dsets_scores[dataset][input_type][fold]`` holds the
        macro-F1 of each fold in percent. Combinations with no scoring file get
        NaN statistics (so ``df.dropna()`` removes them) and an empty fold dict.

    Note:
        ``dsets_scores`` is keyed by dataset and input type only, so with more
        than one meta-layer the later meta-layer overwrites the earlier one.
    """
    root = BASE_DIR if base_dir is None else base_dir

    dsets_scores = {}
    results = []

    for dset, meta_layer, inp_type in itertools.product(DATASETS, META_LAYERS, INPUT_TYPES):
        fold_scores = dsets_scores.setdefault(dset, {}).setdefault(inp_type, {})
        macro_list = []
        micro_list = []

        for fold in range(n_folds):
            json_score = f"{root}/{dset}/{n_folds}_folds/{meta_layer}/{inp_type}/fold_{fold}/scoring.json"
            if not os.path.exists(json_score):
                continue
            with open(json_score, 'r') as fd:
                scoring = json.load(fd)

            fold_scores[fold] = np.around(scoring['f1_macro'] * 100, decimals=2)
            macro_list.append(scoring['f1_macro'])
            micro_list.append(scoring['f1_micro'])

        if macro_list:
            mean_macro = np.around(np.mean(macro_list) * 100, decimals=2)
            std_macro = np.around(np.std(macro_list) * 100, decimals=2)
            mean_micro = np.around(np.mean(micro_list) * 100, decimals=2)
            std_micro = np.around(np.std(micro_list) * 100, decimals=2)
        else:
            # No fold found: report NaN explicitly instead of letting numpy emit
            # "mean of empty slice" RuntimeWarnings to reach the same result.
            mean_macro = std_macro = mean_micro = std_micro = np.nan

        results.append([dset, meta_layer, inp_type, mean_macro, std_macro, mean_micro, std_micro])

    df = pd.DataFrame(results, columns=["Dataset", "MetaLayer", "InputType", "Macro", "Std Macro", "Micro", "Std Micro"])
    return df, dsets_scores

In [4]:
# Summarise every (dataset, meta-layer, input type) combination and show only the
# score columns. Combinations with no scoring.json files have NaN statistics and are
# dropped before styling.
# NOTE(review): Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# on newer pandas the equivalent is .hide(axis="index").
#print(df.to_string(index=False))
df, dsets_scores = get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES)
df[['Macro', 'Std Macro', 'Micro', 'Std Micro']].dropna().style.format("{:.2f}").hide_index()


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Macro,Std Macro,Micro,Std Micro
82.96,2.08,88.28,1.01
71.9,2.01,80.3,0.52
85.94,0.98,86.25,0.94


In [5]:
# Full summary rows (including the identifying columns) for combinations that had results.
df.dropna().head(100, )

Unnamed: 0,Dataset,MetaLayer,InputType,Macro,Std Macro,Micro,Std Micro
0,webkb,svm_rbf,num_feats/126/with_proba/True/combination/fwls/centroids,82.96,2.08,88.28,1.01
4,acm,svm_rbf,num_feats/198/with_proba/True/combination/fwls/centroids,71.9,2.01,80.3,0.52
8,20ng,svm_rbf,num_feats/360/with_proba/True/combination/fwls/centroids,85.94,0.98,86.25,0.94


In [6]:
# Per-fold macro-F1 (in percent) for webkb: one column per input type, one row per fold.
folds_score = pd.DataFrame(dsets_scores["webkb"])
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)

proba,num_feats/25/with_proba/True/combination/concat/centroid_l2,num_feats/50/with_proba/True/combination/concat/centroid_l2,num_feats/100/with_proba/True/combination/concat/centroid_l2,num_feats/200/with_proba/True/combination/concat/centroid_l2,num_feats/300/with_proba/True/combination/concat/centroid_l2,num_feats/400/with_proba/True/combination/concat/centroid_l2,num_feats/500/with_proba/True/combination/concat/centroid_l2
85.08,84.63,85.53,85.13,85.37,85.28,,
84.17,80.77,80.96,82.66,82.95,83.46,,
81.43,83.26,82.45,83.7,82.83,82.83,,
77.81,82.3,81.01,81.15,81.99,81.89,,
79.57,79.04,79.72,80.1,80.37,80.37,,
84.88,81.72,82.01,81.62,82.21,82.7,,
84.83,84.72,85.63,86.01,86.11,86.21,,
84.63,86.45,86.02,87.68,86.41,86.23,,
87.68,84.83,84.84,84.73,84.92,85.72,,
86.41,84.03,84.15,85.45,84.14,84.14,,


# Paired test

In [7]:
def paired_test(
    meta_layer="logistic_regression",
    system1="encoder/rep/fast_text/hidden_layers/3",
    system2="dist",
    dset="webkb",
    metric="f1_macro",
    alpha=0.01,
    df=9,
    baseline_meta_layer="logistic_regression",
    base_dir=None,
    n_folds=10,
):
    """Paired-t confidence interval for the per-fold score difference system1 - system2.

    Scores are read from
    ``<base_dir>/<dset>/<n_folds>_folds/<layer>/<system>/fold_<k>/scoring.json``.
    ``system2`` is read under ``meta_layer``; ``system1`` (the baseline) is read
    under ``baseline_meta_layer``, which defaults to ``"logistic_regression"``
    regardless of ``meta_layer`` (this was previously hard-coded).

    Args:
        meta_layer: meta-layer directory of ``system2``.
        system1: sub-path of the baseline system.
        system2: sub-path of the system compared against the baseline.
        dset: dataset name.
        metric: key read from each ``scoring.json``.
        alpha: significance level; the interval has confidence ``1 - alpha``.
        df: degrees of freedom of the t quantile; should equal ``n_folds - 1``.
        baseline_meta_layer: meta-layer directory of ``system1``.
        base_dir: results root; defaults to the module-level ``BASE_DIR``.
        n_folds: number of folds to pair.

    Returns:
        ``(lower, upper)`` rounded to six decimals, or ``None`` if any fold of
        ``system2`` has no scoring file. An interval entirely above zero means
        ``system1`` scored higher; one containing zero means no significant
        difference at the chosen level.

    Raises:
        FileNotFoundError: if a ``system1`` scoring file is missing.
        statistics.StatisticsError: if fewer than two folds are paired.
    """
    root = BASE_DIR if base_dir is None else base_dir

    residual = []
    for fold in range(n_folds):
        s2_path = f"{root}/{dset}/{n_folds}_folds/{meta_layer}/{system2}/fold_{fold}/scoring.json"
        if not os.path.exists(s2_path):
            return None

        with open(s2_path, 'r') as fd:
            score_s2 = json.load(fd)[metric]

        s1_path = f"{root}/{dset}/{n_folds}_folds/{baseline_meta_layer}/{system1}/fold_{fold}/scoring.json"
        with open(s1_path, 'r') as fd:
            score_s1 = json.load(fd)[metric]

        residual.append(score_s1 - score_s2)

    t_value = table_t.ppf(1 - alpha/2, df)
    std = stdev(residual)
    # Standard error of the mean difference uses the number of paired observations
    # actually collected, not a value derived from `df`.
    coef = t_value * (std / np.sqrt(len(residual)))
    m = np.mean(residual)

    return (np.around(m - coef, decimals=6), np.around(m + coef, decimals=6))

In [8]:
# Compare the "proba" baseline against each proposed input configuration on every dataset.
datasets = ["webkb", "acm", "20ng"]
benchmark = ["proba"]
proposals = [
"num_feats/126/with_proba/True/combination/fwls/centroids",
"num_feats/198/with_proba/True/combination/fwls/centroids",
"num_feats/360/with_proba/True/combination/fwls/centroids", 
]

iterations = itertools.product(datasets, benchmark, proposals)
# Meta-layer of the proposals. NOTE(review): paired_test reads the baseline (system1)
# from the logistic_regression meta-layer by default, not from this one.
meta_layer = "svm_rbf"

intervs = []
for dset, bench, prop in iterations:
    # Interval is for (baseline score - proposal score); paired_test returns None
    # when the proposal has a missing fold file, and those combinations are skipped.
    inter = paired_test(dset=dset, meta_layer=meta_layer, system1=bench, system2=prop)
    if inter is not None:
        print(f"{dset}".upper())
        print(f"\t{bench} | {prop} - {inter}")
        intervs.append([dset, bench, f"{meta_layer}/{prop}", inter])

# One row per evaluated (dataset, baseline, proposal) pair.
t_interv = pd.DataFrame(intervs, columns=["Dataset", "Baseline", "Proposal", "Interval"])

WEBKB
	proba | num_feats/126/with_proba/True/combination/fwls/centroids - (-0.020994, 0.034724)
ACM
	proba | num_feats/198/with_proba/True/combination/fwls/centroids - (-0.00016, 0.034987)
20NG
	proba | num_feats/360/with_proba/True/combination/fwls/centroids - (0.04462, 0.068416)


In [9]:
# Keep only the rows whose proposal path contains "svm" (the Proposal column is prefixed with the meta-layer name).
t_interv[t_interv.Proposal.str.contains("svm")]

Unnamed: 0,Dataset,Baseline,Proposal,Interval
0,webkb,proba,svm_rbf/num_feats/126/with_proba/True/combination/fwls/centroids,"(-0.020994, 0.034724)"
1,acm,proba,svm_rbf/num_feats/198/with_proba/True/combination/fwls/centroids,"(-0.00016, 0.034987)"
2,20ng,proba,svm_rbf/num_feats/360/with_proba/True/combination/fwls/centroids,"(0.04462, 0.068416)"


In [10]:
# Print the bare interval tuples, one per line.
for i in t_interv[t_interv.Proposal.str.contains("svm")].Interval.values:
    print(i)

(-0.020994, 0.034724)
(-0.00016, 0.034987)
(0.04462, 0.068416)


# Two-Factor Full Factorial Design with Replications ($2^k r$)

In [None]:
# The four cells of the 2x2 design. Their order becomes the column order of the
# per-fold score table and therefore has to match the rows of two_kr's sign table:
# (topn 0.3, tfidf), (topn 1, tfidf), (topn 0.3, chi2), (topn 1, chi2),
# i.e. factor A = topn value and factor B = tfidf vs chi2.
INPUT_TYPES = [
                "with_proba/False/tfidf/topn/0.3/simple/False",
                "with_proba/False/tfidf/topn/1/simple/False",
                "with_proba/False/chi2/topn/0.3/simple/False",
                "with_proba/False/chi2/topn/1/simple/False"]

# Rebinds the global DATASETS used by the earlier cells.
DATASETS = ["big_acm", "short_acm"]

In [None]:
# Reload scores for the 2x2 design; overwrites df and dsets_scores from the earlier analysis.
df, dsets_scores = get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES)

In [None]:
# Per-fold macro-F1 (percent) of the four configurations on short_acm.
folds_score = pd.DataFrame(dsets_scores["short_acm"])
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)

In [None]:
# Per-fold macro-F1 (percent) of the four configurations on big_acm.
folds_score = pd.DataFrame(dsets_scores["big_acm"])
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)

In [None]:
def ic(qi, sqi, alpha, df):
    """Two-sided (1 - alpha) Student-t confidence interval around an estimate.

    Args:
        qi: point estimate.
        sqi: standard deviation (standard error) of the estimate.
        alpha: significance level.
        df: degrees of freedom of the t quantile.

    Returns:
        ``[lower, upper]``.
    """
    margin = table_t.ppf(1 - alpha/2, df) * sqi
    return [qi - margin, qi + margin]


def two_kr(scores, alpha=0.05):
    """Analyse a 2^2 factorial design with r replications.

    Args:
        scores: DataFrame with one row per replication and exactly four columns,
            ordered as (A-, B-), (A+, B-), (A-, B+), (A+, B+).
        alpha: significance level of the effect confidence intervals.

    Returns:
        A list of ``[name, value]`` pairs (values rounded to two decimals) with
        the sums of squares, the error degrees of freedom, the error variance
        and standard deviation, the standard deviation of the effects and the
        confidence intervals of qa, qb and qab. The unrounded figures are also
        printed. At least two replications are needed for a usable error estimate.

    Raises:
        ValueError: if ``scores`` does not have exactly four columns.
    """
    values = scores.values
    if values.ndim != 2 or values.shape[1] != 4:
        raise ValueError("two_kr expects exactly 4 columns, one per cell of the 2x2 design")

    num_reps = values.shape[0]
    # Total number of observations (2^2 cells x r replications). Previously the
    # replication count was hard-coded to 10 in the sums of squares.
    num_obs = 4 * num_reps

    # Sign table with columns I, A, B, AB; row i describes column i of `scores`.
    level_table = np.array([[1, -1, -1, 1], [1, 1, -1, -1], [1, -1, 1, -1], [1, 1, 1, 1]])
    y_mean = np.mean(values, axis=0)
    model_params = np.dot(level_table.T, y_mean) / 4

    # Model prediction per cell; errors are the deviations of each replication from it.
    y_pred = np.dot(level_table, model_params)
    errors = values - y_pred

    SSE = np.sum(np.power(errors, 2))
    SS0 = num_obs * np.power(model_params[0], 2)
    SSY = np.sum(np.power(values, 2))
    SST = SSY - SS0
    SSA = num_obs * np.power(model_params[1], 2)
    SSB = num_obs * np.power(model_params[2], 2)
    SSAB = num_obs * np.power(model_params[3], 2)
    df = 4 * (num_reps - 1)
    Se2 = SSE / (df)
    Se = np.sqrt(Se2)
    # Standard deviation of each effect is Se / sqrt(2^2 * r); dividing by sqrt(df),
    # as the code did before, overstates it and widens every interval.
    Sqi = Se / np.sqrt(num_obs)

    ica = ic(model_params[1], Sqi, alpha, df)
    icb = ic(model_params[2], Sqi, alpha, df)
    icab = ic(model_params[3], Sqi, alpha, df)

    output = f"""
    Modelo - y = {model_params[0]} + {model_params[1]}*Xa + {model_params[2]}*Xb + {model_params[3]}*XaXb 
    Erro (SSE): {SSE}
    Variação total (SST): {SST}
    SSY: {SSY}
    SS0: {SS0}
    SSA: {SSA} - Explicação: {np.around((SSA / SST)*100, decimals=2)}%
    SSB: {SSB} - Explicação: {np.around((SSB / SST)*100, decimals=2)}%
    SSAB: {SSAB} - Explicação: {np.around((SSAB / SST)*100, decimals=2)}%
    Não explicado (Erro experimental): {np.around((1 - SSA/SST - SSB/SST - SSAB/SST) * 100, decimals=2)}
    Variância do erro (Se²/MSE): {Se2}
    Desvio do erro (Se): {Se}
    Desvio dos parâmetros (Sqi): {Sqi}
    Confidence interval qa: {ica}
    Confidence interval qb: {icb}
    Confidence interval qab: {icab}
    """
    print(output)

    def _round2(value):
        return np.around(value, decimals=2)

    def _round_interval(interval):
        return [_round2(interval[0]), _round2(interval[1])]

    report = [["SSE", _round2(SSE)],
    ["SS0", _round2(SS0)],
    ["SSY", _round2(SSY)],
    ["SST", _round2(SST)],
    ["SSA", _round2(SSA)],
    ["SSB", _round2(SSB)],
    ["SSAB", _round2(SSAB)],
    ["df", _round2(df)],
    ["Se2", _round2(Se2)],
    ["Se", _round2(Se)],
    ["Sqi", _round2(Sqi)],
    ["qa", _round_interval(ica)],
    ["qb", _round_interval(icb)],
    ["qab", _round_interval(icab)]]

    return report



In [None]:
# 2x2 factorial analysis on short_acm (95% intervals); prints the details and displays the rounded report.
two_kr(pd.DataFrame(dsets_scores["short_acm"]), alpha=0.05)

In [None]:
# Same 2x2 factorial analysis on big_acm.
two_kr(pd.DataFrame(dsets_scores["big_acm"]), alpha=0.05)

# One Factor

In [None]:
def one_factor(y_true, alpha=0.1):
    """One-factor analysis of variance, printed as a report.

    Args:
        y_true: DataFrame with one column per factor level and one row per
            replication.
        alpha: significance level used for the tabulated F value
            (``1 - alpha`` quantile) and for the effect confidence intervals.

    Returns:
        None. Prints the grand mean, the level effects, the sums of squares, the
        mean squares, the computed and tabulated F values, and then one
        confidence interval per level effect.
    """
    values = y_true.values
    NUM_REPS = values.shape[0]
    NUM_LEVELS = values.shape[1]

    general_mean = np.mean(values)
    column_means = np.mean(values, axis=0)
    levels_mean = column_means - general_mean

    # Residual of every observation with respect to its own level mean
    # (grand mean + level effect). The previous reshape/transpose construction
    # paired observations with the wrong level effect, which corrupted SSE.
    err = values - column_means

    SSE = np.sum(np.power(err, 2))
    SS0 = NUM_REPS * NUM_LEVELS * np.power(general_mean, 2)
    SSA = NUM_REPS * np.sum(np.power(levels_mean, 2))
    SSY = np.sum(np.power(values, 2))
    SST = SSY - SS0

    # Degrees of freedom of the experimental error: a * (r - 1).
    error_df = NUM_LEVELS * (NUM_REPS - 1)
    MSA = SSA / (NUM_LEVELS - 1)
    MSE = SSE / error_df
    Se2 = MSE
    Saj = np.sqrt((Se2 * (NUM_LEVELS - 1)) / (NUM_LEVELS * NUM_REPS))
    Fcalc = MSA / MSE
    Ftab = table_f.ppf(1 - alpha, NUM_LEVELS - 1, error_df)

    out_put = f"""
    general_mean: {general_mean}
    levels_mean: {levels_mean}
    SSE: {SSE}
    SS0: {SS0}
    SSA: {SSA}
    SSY: {SSY}
    SST: {SST}
    MSA: {MSA}
    MSE/Se2: {MSE}
    Se: {np.sqrt(Se2)}
    Saj: {Saj}
    Fcalc: {Fcalc}
    Ftab: {Ftab}
    """
    print(out_put)

    # The effect intervals use the t quantile at the error degrees of freedom,
    # because Saj is derived from the error variance (previously a - 1 was used).
    for effect in levels_mean:
        print(ic(effect, Saj, alpha, error_df))

In [None]:
# One-factor study on big_acm: three configuration families (chi2, tfidf, tfidf-chi2),
# each evaluated at five topn levels (0.3, 0.5, 0.7, 0.9, 1).
# Rebinds the global DATASETS / INPUT_TYPES and overwrites df / dsets_scores.
DATASETS = ["big_acm"]
INPUT_TYPES = [
                "with_proba/False/chi2/topn/0.3/simple/False",
                "with_proba/False/chi2/topn/0.5/simple/False",
                "with_proba/False/chi2/topn/0.7/simple/False",
                "with_proba/False/chi2/topn/0.9/simple/False",
                "with_proba/False/chi2/topn/1/simple/False",
                "with_proba/False/tfidf/topn/0.3/simple/False",
                "with_proba/False/tfidf/topn/0.5/simple/False",
                "with_proba/False/tfidf/topn/0.7/simple/False",
                "with_proba/False/tfidf/topn/0.9/simple/False",
                "with_proba/False/tfidf/topn/1/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.3/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.5/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.7/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.9/simple/False",
                "with_proba/False/tfidf-chi2/topn/1/simple/False"]
                
df, dsets_scores = get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES)

In [None]:
# Per-fold macro-F1 (percent) on big_acm, one column per configuration.
folds_score = pd.DataFrame(dsets_scores["big_acm"])
# NOTE: the styled object below is discarded because it is not the cell's last
# expression; only the plain DataFrame on the final line is displayed.
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)
folds_score

In [None]:
# Keep the chi2-only columns; the "False/chi2" prefix does not match the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/chi2') > -1 ]]

In [None]:
# One-factor analysis across the chi2 topn levels (big_acm).
one_factor(y_true)

In [None]:
# Keep the tfidf-only columns; the trailing "/" excludes the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/tfidf/') > -1 ]]

In [None]:
# One-factor analysis across the tfidf topn levels (big_acm).
one_factor(y_true)

In [None]:
# Keep the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/tfidf-chi2') > -1 ]]

In [None]:
# One-factor analysis across the tfidf-chi2 topn levels (big_acm).
one_factor(y_true)

In [None]:
# Same one-factor study repeated for short_acm (identical configuration list).
# Rebinds the global DATASETS / INPUT_TYPES and overwrites df / dsets_scores.
DATASETS = ["short_acm"]
INPUT_TYPES = [
                "with_proba/False/chi2/topn/0.3/simple/False",
                "with_proba/False/chi2/topn/0.5/simple/False",
                "with_proba/False/chi2/topn/0.7/simple/False",
                "with_proba/False/chi2/topn/0.9/simple/False",
                "with_proba/False/chi2/topn/1/simple/False",
                "with_proba/False/tfidf/topn/0.3/simple/False",
                "with_proba/False/tfidf/topn/0.5/simple/False",
                "with_proba/False/tfidf/topn/0.7/simple/False",
                "with_proba/False/tfidf/topn/0.9/simple/False",
                "with_proba/False/tfidf/topn/1/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.3/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.5/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.7/simple/False",
                "with_proba/False/tfidf-chi2/topn/0.9/simple/False",
                "with_proba/False/tfidf-chi2/topn/1/simple/False"]
df, dsets_scores = get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES)

In [None]:
# Per-fold macro-F1 (percent) on short_acm, one column per configuration.
folds_score = pd.DataFrame(dsets_scores["short_acm"])
# NOTE: the styled object below is discarded (not the last expression of the cell).
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)
folds_score

In [None]:
# Keep the chi2-only columns; the "False/chi2" prefix does not match the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/chi2') > -1 ]]

In [None]:
# One-factor analysis across the chi2 topn levels (short_acm).
one_factor(y_true)

In [None]:
# Keep the tfidf-only columns; the trailing "/" excludes the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/tfidf/') > -1 ]]

In [None]:
# One-factor analysis across the tfidf topn levels (short_acm).
one_factor(y_true)

In [None]:
# Keep the tfidf-chi2 columns.
y_true = folds_score[[ col for col in folds_score.columns if col.find('with_proba/False/tfidf-chi2') > -1 ]]

In [None]:
# One-factor analysis across the tfidf-chi2 topn levels (short_acm).
one_factor(y_true)

In [None]:
# short_acm again, restricted to the chi2 and tfidf families (no tfidf-chi2).
# Rebinds the global DATASETS / INPUT_TYPES and overwrites df / dsets_scores.
DATASETS = ["short_acm"]
INPUT_TYPES = [
                "with_proba/False/chi2/topn/0.3/simple/False",
                "with_proba/False/chi2/topn/0.5/simple/False",
                "with_proba/False/chi2/topn/0.7/simple/False",
                "with_proba/False/chi2/topn/0.9/simple/False",
                "with_proba/False/chi2/topn/1/simple/False",
                "with_proba/False/tfidf/topn/0.3/simple/False",
                "with_proba/False/tfidf/topn/0.5/simple/False",
                "with_proba/False/tfidf/topn/0.7/simple/False",
                "with_proba/False/tfidf/topn/0.9/simple/False",
                "with_proba/False/tfidf/topn/1/simple/False"]
                
df, dsets_scores = get_scores_dataset(DATASETS, META_LAYERS, INPUT_TYPES)
# Display the summary table (mean / std per configuration).
df

In [None]:
# Per-fold macro-F1 (percent) for the ten chi2 / tfidf configurations on short_acm.
folds_score = pd.DataFrame(dsets_scores["short_acm"])
# NOTE: the styled object below is discarded (not the last expression of the cell).
folds_score.style.format("{:.2f}").hide_index()#.to_csv("temp.csv", sep=";", index=False)
folds_score

In [5]:
# Make the project's "scr" directory importable from this kernel.
# NOTE(review): absolute, machine-specific path; the execution counter restarts here,
# so this part of the notebook appears to come from a separate session.
import sys
import numpy as np
sys.path.append("/home/welton/stacking_text_classification/scr")

In [6]:
from typing import List, Tuple

def load_x_y(
        file: str,
        test_train: str
) -> Tuple[np.ndarray, np.ndarray]:
    """Load the feature matrix and labels of one split from a NumPy archive.

    Args:
        file: path of the archive; it must expose the keys ``X_<test_train>``
            and ``y_<test_train>``.
        test_train: split suffix used to build those keys (e.g. ``"train"``).

    Returns:
        ``(X, y)``. When the stored ``X`` holds exactly one element it is
        unwrapped with ``.item()``, which recovers an arbitrary Python object
        that was saved inside a 0-d object array.
    """
    # SECURITY: allow_pickle=True executes arbitrary code embedded in the file;
    # only load archives that come from a trusted source.
    loaded = np.load(file, allow_pickle=True)
    try:
        X = loaded[f"X_{test_train}"]
        y = loaded[f"y_{test_train}"]
    finally:
        # An .npz archive keeps its file handle open until it is closed; other
        # objects np.load may return (e.g. a plain dict) have no close().
        close = getattr(loaded, "close", None)
        if callable(close):
            close()

    if X.size == 1:
        X = X.item()

    return X, y


In [7]:
from sklearn.datasets import load_svmlight_file

In [8]:
# Read fold 0 of the webkb BERT embeddings stored in svmlight format.
# NOTE(review): the relative path depends on the notebook's working directory.
load = load_svmlight_file("../../../meta-features/data/embeddings/bert/base/webkb/None/train0.gz")

In [9]:
# load_svmlight_file returns (features, labels): X is a sparse matrix, y the label array.
X = load[0]
y = load[1]

In [10]:
from optimization import execute_optimization

In [11]:
# Add the repository root and its "scr" package directory to the import path.
# NOTE(review): relative entries resolve against the kernel's working directory, and this
# cell comes after the `from optimization import ...` cell that presumably relies on it.
import sys
sys.path.append('../../')
sys.path.append("../../scr/")

In [12]:
import numpy as np
from scr.feature_selection.feature_importance import FeatureSelector

In [13]:
# Project feature selector with default settings (implementation not shown in this notebook).
fs = FeatureSelector()

In [14]:
# Rank features using the rows from index 500 onward.
# NOTE(review): the meaning of the "test" and 100 arguments is defined by FeatureSelector
# (not visible here); the printed message suggests a cached ranking was loaded — confirm.
sort = fs.feature_importance("test", X[500:], y[500:], 100)

LOADING PRE SELECTED FEATURES.


In [15]:
# Load a saved feature ranking from disk (presumably the one written by the call above — verify).
m = np.load("../../data/feature_selection/test/feature_ranking.npy")

In [16]:
# Number of entries in the saved ranking.
m.shape

(768,)

In [18]:
# Random forest with default hyper-parameters, used to obtain feature importances.
# NOTE(review): no random_state is set, so the importances differ between runs.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()


In [19]:
# n_jobs is an estimator parameter, not an argument of fit(): passing it to fit()
# raises TypeError. Set it on the estimator, then train on the rows from index 500 onward.
rf.set_params(n_jobs=5).fit(X[500:], y[500:])

RandomForestClassifier()

In [22]:
# Feature indices ordered from least to most important.
s = rf.feature_importances_.argsort()

In [24]:
# NOTE: rebinds `fs`, which previously held the FeatureSelector instance, to the importance array.
fs = rf.feature_importances_

In [29]:
# Smallest and largest feature importance.
fs[s[0]], fs[s[-1]]

(2.00040432403999e-06, 0.028993694885435177)

In [30]:
# Sanity check: the last index of the ascending argsort holds the maximum.
max(fs[s[0]], fs[s[-1]])

0.028993694885435177

In [31]:
# Toy array used to check argsort ordering.
x = np.arange(10)

In [32]:
# Display the toy array.
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [33]:
# Negating before argsort yields the indices in descending order of value.
(-x).argsort()

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [3]:
# Inspect the shape of the stored SVM probabilities for webkb, fold 0.
# NOTE(review): absolute, machine-specific path.
import numpy as np
np.load("/home/welton/data/clfs_output/split_10_with_val/webkb/10_folds/svm/0/probas.npy").shape

(823, 7)

In [5]:
from typing import Tuple, List


In [6]:
def read_train_test_bert(
        data_source: str,
        dataset: str,
        algorithms: List[str],
        n_folds: int,
        fold_id: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Stack the stored class probabilities of several classifiers column-wise.

    For each classifier in ``algorithms`` the file
    ``<data_source>/clfs_output/split_10_with_val/<dataset>/<n_folds>_folds/<clf>/<fold_id>/probas.npy``
    is loaded, and the matrices are concatenated horizontally in the given order.

    Returns:
        ``(X_train_meta, X_test_meta)``.

    NOTE(review): both outputs are read from the same ``probas.npy`` file, so the
    "train" and "test" matrices are identical. This looks unintended; confirm
    which file holds the training-side probabilities.
    """
    train_parts: List[np.ndarray] = []
    test_parts: List[np.ndarray] = []

    for clf in algorithms:
        probas_file = (
            f"{data_source}/clfs_output/split_10_with_val/"
            f"{dataset}/{n_folds}_folds/{clf}/{fold_id}/probas.npy"
        )
        train_parts.append(np.load(probas_file))
        test_parts.append(np.load(probas_file))

    return np.hstack(train_parts), np.hstack(test_parts)

In [10]:
# Stack the SVM and GBM probabilities for 20ng, fold 0.
# NOTE(review): read_train_test_bert loads both outputs from the same probas.npy,
# so X1 and X2 hold identical data.
X1, X2 = read_train_test_bert(
    "/home/welton/data",
    "20ng",
    ["svm", "gbm"],
    10,
    0
)

In [11]:
# Shape of the first returned matrix (rows x stacked probability columns).
X1.shape

(1892, 40)

In [12]:
# Shape of the second returned matrix.
X2.shape

(1892, 40)