In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

from src.models.models import get_classifier
from src.models.optimization import execute_optimization

from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

import warnings
from optuna.exceptions import ExperimentalWarning
from optuna.logging import set_verbosity, WARNING
# Reduce Optuna's logger to WARNING level and hide its ExperimentalWarning messages.
set_verbosity(WARNING)
warnings.filterwarnings("ignore", category=ExperimentalWarning)

# Unless warning options were passed on the command line (-W), silence every
# Python warning in this kernel and export the same policy through the environment.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the estimator registered under "logistic_regression"; the second
# value returned by get_classifier is not needed here and is discarded.
clf, _ = get_classifier("logistic_regression")

In [3]:
# Inspect the hyperparameters the estimator currently holds.
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 200,
 'multi_class': 'auto',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'sag',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [4]:
# Override two hyperparameters on the existing estimator object (mutates clf in place).
clf.set_params(C=0.7, dual=True)

In [5]:
# Re-inspect the hyperparameters to confirm that C and dual were updated.
clf.get_params()

{'C': 0.7,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 200,
 'multi_class': 'auto',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'sag',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [6]:
# Experiment configuration consumed by the per-fold comparison cell.
estimator_name, calib_method = "knn", "isotonic"  # classifier key / calibration method
clf_n_jobs, opt_n_jobs = 20, 3  # forwarded to execute_optimization under the same names

In [7]:
from joblib import load

In [7]:
# Load the persisted model of fold 0 from the 20ng stacking output (machine-specific path).
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
model = load("/home/welton/data/stacking/stacking_output/20ng/10_folds/logistic_regression/dist/fold_0/model.joblib")

In [8]:
# Inspect the parameters of the pipeline stored as model.best_estimator_.
model.best_estimator_.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler(with_mean=False)),
  ('classifier',
   LogisticRegression(C=11.279016534699569, n_jobs=1, random_state=42,
                      solver='sag'))],
 'verbose': False,
 'scaler': StandardScaler(with_mean=False),
 'classifier': LogisticRegression(C=11.279016534699569, n_jobs=1, random_state=42,
                    solver='sag'),
 'scaler__copy': True,
 'scaler__with_mean': False,
 'scaler__with_std': True,
 'classifier__C': 11.279016534699569,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'auto',
 'classifier__n_jobs': 1,
 'classifier__penalty': 'l2',
 'classifier__random_state': 42,
 'classifier__solver': 'sag',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [5]:
# Compare four ways of producing test predictions on each of the 10 webkb folds,
# all scored with macro-F1:
#   opt          - estimator tuned by execute_optimization on the whole training fold
#   christian_f1 - argmax over the pre-computed outputs stored under clfs_output/.../kfr
#   opt_calib    - estimator tuned on 90% of the fold, then calibrated on the held-out 10%
#   just_calib   - untuned get_classifier estimator fit on the 90%, calibrated on the 10%
# Each row appended to `scores` is [fold, opt, christian_f1, opt_calib, just_calib].


def _macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true, y_pred, average="macro")


def _prefit_calibrated_macro_f1(fitted_clf, method, X_eval, y_eval, X_test, y_test):
    """Calibrate an already-fitted classifier on the eval split and score it on the test split."""
    calibrator = CalibratedClassifierCV(fitted_clf, method=method, cv="prefit", n_jobs=10)
    calibrator.fit(X_eval, y_eval)
    return _macro_f1(y_test, calibrator.predict(X_test))


# The output directories do not depend on the fold, so create them once.
# NOTE(review): every fold passes the same directory to execute_optimization;
# confirm that results written there by one fold are not reused by the next.
output_dir = f"data/{estimator_name}"
output_calib_dir = f"data/{estimator_name}_calib/{calib_method}"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_calib_dir, exist_ok=True)

scores = []
for fold in range(10):

    # X_* is stored as an object array wrapping a sparse matrix, hence
    # allow_pickle=True and the .tolist() unwrapping before densifying.
    # NOTE(review): allow_pickle executes pickled content; only load trusted files.
    with np.load(f"/home/welton/data/representations/webkb/10_folds/fr/{fold}/train.npz", allow_pickle=True) as train_npz:
        X_train = train_npz["X_train"].tolist().toarray()
        y_train = train_npz["y_train"]
    with np.load(f"/home/welton/data/representations/webkb/10_folds/fr/{fold}/test.npz", allow_pickle=True) as test_npz:
        X_test = test_npz["X_test"].tolist().toarray()
        y_test = test_npz["y_test"]

    # (1) Hyperparameter search on the full training fold.
    optuna_search = execute_optimization(estimator_name, output_dir, X_train, y_train, clf_n_jobs=clf_n_jobs, opt_n_jobs=opt_n_jobs)
    preds = optuna_search.best_estimator_.predict(X_test)
    opt = _macro_f1(y_test, preds)

    # (2) Reference score from previously stored per-class outputs.
    # Named christian_f1 rather than `chr` so the builtin chr() is not shadowed.
    with np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/kfr/{fold}/test.npz") as reference_npz:
        probas = reference_npz["X_test"]
    christian_f1 = _macro_f1(y_test, probas.argmax(axis=1))

    # Hold out 10% of the training fold to fit the calibrators (not stratified).
    X_train_sub, X_eval, y_train_sub, y_eval = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # (3) Hyperparameter search on the 90% subset, followed by calibration.
    optuna_search_calib = execute_optimization(estimator_name, output_calib_dir, X_train_sub, y_train_sub, clf_n_jobs=clf_n_jobs, opt_n_jobs=opt_n_jobs)
    opt_calib = _prefit_calibrated_macro_f1(
        optuna_search_calib.best_estimator_, calib_method, X_eval, y_eval, X_test, y_test
    )

    # (4) Untuned estimator followed by calibration. A dedicated name keeps the
    # notebook-level `clf` from being rebound on every iteration.
    baseline_clf, _ = get_classifier(estimator_name, n_jobs=10)
    baseline_clf.fit(X_train_sub, y_train_sub)
    just_calib = _prefit_calibrated_macro_f1(
        baseline_clf, calib_method, X_eval, y_eval, X_test, y_test
    )

    print(f"FOLD: {fold} - Opt: {opt}, Christian: {christian_f1}, Opt and Calib: {opt_calib}, Just Calib: {just_calib}")
    scores.append([fold, opt, christian_f1, opt_calib, just_calib])

	Executing model...
	Executing model...
FOLD: 0 - Opt: 0.7445800425975998, Christian: 0.7351208512160738, Opt and Calib: 0.7386552971041764, Just Calib: 0.7556071200950986
	Executing model...
	Executing model...
FOLD: 1 - Opt: 0.7005408474723874, Christian: 0.6964692825788283, Opt and Calib: 0.6891532361958452, Just Calib: 0.6686931326103401
	Executing model...
	Executing model...
FOLD: 2 - Opt: 0.6449280467505575, Christian: 0.6524161976397361, Opt and Calib: 0.6445177850106546, Just Calib: 0.6564007616127377
	Executing model...
	Executing model...
FOLD: 3 - Opt: 0.6568504450761238, Christian: 0.6562867065228761, Opt and Calib: 0.6630046987468281, Just Calib: 0.6554586321109649
	Executing model...
	Executing model...
FOLD: 4 - Opt: 0.6465813632323459, Christian: 0.6591053830406729, Opt and Calib: 0.6605779481719339, Just Calib: 0.6580250942911771
	Executing model...
	Executing model...
FOLD: 5 - Opt: 0.6461622164945654, Christian: 0.6665335891871764, Opt and Calib: 0.6759547243080662,

In [6]:
# Column-wise mean over the folds. Each row of `scores` starts with the fold
# index, so the first value of the result is the mean fold index, not a score.
np.mean(scores, axis=0)

array([4.5       , 0.68901273, 0.69165754, 0.69416565, 0.69034487])

In [7]:
# Persist the per-fold score table as data/calib_depure/<estimator_name>.npy.
m = np.array(scores)
scores_dir = "data/calib_depure"
# np.save does not create missing parent directories, so make sure it exists first.
os.makedirs(scores_dir, exist_ok=True)
np.save(f"{scores_dir}/{estimator_name}", m)

In [8]:
# Show the saved table: column 0 is the fold index, columns 1-4 are the four
# macro-F1 scores in the order they were appended to `scores`.
m

array([[0.        , 0.74458004, 0.73512085, 0.7386553 , 0.75560712],
       [1.        , 0.70054085, 0.69646928, 0.68915324, 0.66869313],
       [2.        , 0.64492805, 0.6524162 , 0.64451779, 0.65640076],
       [3.        , 0.65685045, 0.65628671, 0.6630047 , 0.65545863],
       [4.        , 0.64658136, 0.65910538, 0.66057795, 0.65802509],
       [5.        , 0.64616222, 0.66653359, 0.67595472, 0.67778121],
       [6.        , 0.6801133 , 0.66487197, 0.69150428, 0.65374872],
       [7.        , 0.70667416, 0.7085282 , 0.69420828, 0.70249144],
       [8.        , 0.73980919, 0.75675064, 0.76383836, 0.75136847],
       [9.        , 0.72388771, 0.72049259, 0.72024188, 0.72387412]])

In [9]:
# Base-classifier identifiers; each one is used as a directory name when the
# stored outputs are loaded in the scoring loops.
PARTIAL_STACKING = [
    "kfr",
    "kpr",
    "ktmk",
    "ktr",
    "lfr",
    "lpr",
    "ltmk",
    "rep_bert",
    "xlnet_rep",
]

In [17]:
# Mean macro-F1 over the 10 webkb folds for every base classifier, using the
# outputs stored under clfs_output. Produces rows of [classifier name, mean score].
# Loop-local names (clf_name, fold_scores, ...) are deliberately distinct from the
# notebook-level `clf` and `scores`, so re-running earlier cells still sees their
# own values.
data = []
for clf_name in PARTIAL_STACKING:
    fold_scores = []
    for fold in range(10):
        y_true = np.load(f"/home/welton/data/datasets/labels/split_10/webkb/{fold}/test.npy")
        # Close the npz archive as soon as the predictions have been extracted.
        with np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/{clf_name}/{fold}/test.npz") as fold_npz:
            y_pred = fold_npz["X_test"].argmax(axis=1)
        fold_scores.append(f1_score(y_true, y_pred, average="macro"))
    data.append([clf_name, np.mean(fold_scores)])

        

In [24]:
# Mean macro-F1 over the 10 webkb folds for every base classifier, using the
# outputs stored under calibrated_probabilities. Produces one mean score per
# classifier, in PARTIAL_STACKING order.
# Loop-local names (clf_name, fold_scores, ...) are deliberately distinct from the
# notebook-level `clf` and `scores`, so re-running earlier cells still sees their
# own values.
data_calib = []
for clf_name in PARTIAL_STACKING:
    fold_scores = []
    for fold in range(10):
        y_true = np.load(f"/home/welton/data/datasets/labels/split_10/webkb/{fold}/test.npy")
        # Close the npz archive as soon as the predictions have been extracted.
        with np.load(f"/home/welton/data/calibrated_probabilities/split_10/webkb/10_folds/{clf_name}/{fold}/test.npz") as fold_npz:
            y_pred = fold_npz["X_test"].argmax(axis=1)
        fold_scores.append(f1_score(y_true, y_pred, average="macro"))
    data_calib.append(np.mean(fold_scores))

In [25]:
# Tabulate the [classifier, mean macro-F1] rows collected in `data` and display them.
df = pd.DataFrame(data, columns=["CLFs", "Macro/NotCalib/"])
df

Unnamed: 0,CLFs,Macro/NotCalib/
0,kfr,0.691658
1,kpr,0.592531
2,ktmk,0.647207
3,ktr,0.587643
4,lfr,0.682471
5,lpr,0.586632
6,ltmk,0.661907
7,rep_bert,0.828855
8,xlnet_rep,0.805402


In [26]:
# Add the calibrated means as a second score column (mutates df in place).
# Row alignment relies on data_calib and data both following PARTIAL_STACKING order.
df["Macro/Calib"] = data_calib

In [30]:
# Show the score columns as percentages; the classifier-name column is dropped first.
df.drop(columns=["CLFs"]).values * 100

array([[69.16575414, 69.02674312],
       [59.25308489, 61.53338198],
       [64.72074904, 65.32542604],
       [58.76427235, 61.77397008],
       [68.24707737, 65.9383586 ],
       [58.66324212, 58.52594842],
       [66.19068114, 59.4812232 ],
       [82.88549204, 83.17521365],
       [80.54021559, 80.54021559]])

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the mini_20ng texts file and split it into lines. Splitting on '\n'
# leaves an empty last element when the file ends with a newline.
texts_path = "/home/welton/data/datasets/data/mini_20ng/texts.txt"
with open(texts_path, 'r') as fd:
    raw_texts = fd.read()
sents = raw_texts.split('\n')

In [4]:
# Read the mini_20ng score file and split it into lines. Splitting on '\n'
# leaves an empty last element when the file ends with a newline.
score_path = "/home/welton/data/datasets/data/mini_20ng/score.txt"
with open(score_path, 'r') as fd:
    raw_scores = fd.read()
labes_list = raw_scores.split('\n')

In [5]:
# Drop the empty trailing entry left by splitting on '\n'. The guard makes the
# cell idempotent: re-running it no longer removes a real label. Written as an
# expression so the cell still displays the removed value when something is popped.
labes_list.pop() if labes_list and labes_list[-1] == '' else None

''

In [6]:
# Convert the label strings to integers and collect them in a NumPy array.
labels = np.array(list(map(int, labes_list)))

In [7]:
# Drop the empty trailing entry left by splitting on '\n'. The guard makes the
# cell idempotent: re-running it no longer removes a real document. Written as an
# expression so the cell still displays the removed value when something is popped.
sents.pop() if sents and sents[-1] == '' else None

''

In [8]:
# Number of documents left after removing the trailing empty entry.
len(sents)

2000

In [9]:
# TF-IDF vectorizer configured to strip accents (unicode) and remove English stop words.
tf = TfidfVectorizer(strip_accents='unicode', stop_words='english')

In [10]:
# Fit the vectorizer on all documents and convert the result to a dense array
# via .toarray(); every row of X corresponds to one entry of sents.
X = tf.fit_transform(sents).toarray()

In [11]:
# (number of documents, vocabulary size) of the dense TF-IDF matrix.
X.shape

(2000, 23845)

In [12]:
# Load the pre-computed 10-fold split table for mini_20ng.
# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
sp = pd.read_pickle("/home/welton/data/datasets/data/mini_20ng/splits/split_10.pkl")

In [13]:
# Display the split table: one row per fold with train_idxs, test_idxs and fold_id.
sp

Unnamed: 0,train_idxs,test_idxs,fold_id
0,"[165, 182, 186, 187, 188, 189, 190, 191, 194, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0
1,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[165, 182, 186, 187, 188, 189, 190, 191, 194, ...",1
2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[372, 377, 382, 383, 385, 388, 389, 391, 392, ...",2
3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[574, 575, 576, 579, 581, 585, 586, 590, 591, ...",3
4,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[765, 767, 769, 771, 773, 774, 775, 777, 778, ...",4
5,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[969, 972, 976, 977, 978, 981, 982, 983, 984, ...",5
6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1152, 1155, 1156, 1157, 1158, 1160, 1162, 116...",6
7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1357, 1359, 1364, 1367, 1371, 1373, 1374, 137...",7
8,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1577, 1578, 1582, 1587, 1590, 1591, 1592, 159...",8
9,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1741, 1746, 1751, 1753, 1757, 1758, 1763, 177...",9


In [14]:
def get_xy(X_all, y_all, sp, fold, train_test_val):
    """Select the features and labels of one partition of one fold.

    Args:
        X_all: Feature container covering every sample; indexed with the stored indices.
        y_all: Label container aligned with ``X_all``; indexed the same way.
        sp: Split table with one row per fold and ``<partition>_idxs`` columns.
        fold: Positional row of ``sp`` to read (passed to ``sp.iloc``).
        train_test_val: Partition prefix; the column ``f"{train_test_val}_idxs"`` is read.

    Returns:
        Tuple ``(x, y)``: ``X_all`` and ``y_all`` indexed by the indices stored for
        that fold and partition.
    """
    column = f"{train_test_val}_idxs"
    selected = sp.iloc[fold][column]
    return X_all[selected], y_all[selected]

In [15]:

# Write the per-fold TF-IDF representations and label vectors of mini_20ng to disk.
# For every fold this produces:
#   <output_dir>/<fold>/train.npz  (X_train, y_train)
#   <output_dir>/<fold>/test.npz   (X_test, y_test)
#   <labels_dir>/<fold>/train.npy and test.npy
output_dir = "/home/welton/data/representations/mini_20ng/10_folds/tr"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this directory is named split_10_with_val, while `sp` was read
# from split_10.pkl; confirm the labels are meant to be stored here.
labels_dir = "/home/welton/data/datasets/labels/split_10_with_val/mini_20ng"
# Fixed copy-paste slip: this call used to create output_dir a second time.
os.makedirs(labels_dir, exist_ok=True)

for fold in range(10):
    fold_repr_dir = f"{output_dir}/{fold}"
    fold_labels_dir = f"{labels_dir}/{fold}"
    os.makedirs(fold_repr_dir, exist_ok=True)
    os.makedirs(fold_labels_dir, exist_ok=True)

    X_train, y_train = get_xy(X, labels, sp, fold, "train")
    np.savez(f"{fold_repr_dir}/train.npz", X_train=X_train, y_train=y_train)

    X_test, y_test = get_xy(X, labels, sp, fold, "test")
    np.savez(f"{fold_repr_dir}/test.npz", X_test=X_test, y_test=y_test)

    np.save(f"{fold_labels_dir}/test.npy", y_test)
    np.save(f"{fold_labels_dir}/train.npy", y_train)

In [2]:
# Open the fold-0 training representations of 20ng. allow_pickle is required
# because X_train is stored as an object array wrapping a sparse matrix.
# NOTE(review): allow_pickle executes pickled content; only load trusted files.
loader = np.load("/home/welton/data/representations/20ng/10_folds/tr/0/train.npz", allow_pickle=True)

In [8]:
# Load the 20ng split table stored as split_10_with_val.pkl (rebinds `sp`).
# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
sp = pd.read_pickle("/home/welton/data/datasets/data/20ng/splits/split_10_with_val.pkl")

In [5]:
# Check that train_test_split accepts the unwrapped sparse matrix directly (80/20 split).
# No random_state is passed, so the partition is not reproducible across runs.
train_test_split(loader["X_train"].tolist(), test_size=0.2)

[<13563x24304 sparse matrix of type '<class 'numpy.float64'>'
 	with 1010258 stored elements in Compressed Sparse Row format>,
 <3391x24304 sparse matrix of type '<class 'numpy.float64'>'
 	with 249329 stored elements in Compressed Sparse Row format>]

In [7]:
# Compare the raw npz entry (an object array) with .tolist(), which unwraps
# the sparse matrix stored inside it.
loader["X_train"], loader["X_train"].tolist()

(array(<16954x24304 sparse matrix of type '<class 'numpy.float64'>'
 	with 1259587 stored elements in Compressed Sparse Row format>,
       dtype=object),
 <16954x24304 sparse matrix of type '<class 'numpy.float64'>'
 	with 1259587 stored elements in Compressed Sparse Row format>)

In [13]:
# For every dataset and fold, print the unique values of (argsort == 0..n-1)
# computed on the stored training indices; a lone True means argsort left the
# order unchanged, i.e. the indices are already stored in ascending order.
for dataset in ("webkb", "acm", "reut", "20ng"):
    sp = pd.read_pickle(f"/home/welton/data/datasets/data/{dataset}/splits/split_10.pkl")
    for fold in range(10):
        train_idxs = np.array(sp.iloc[fold]["train_idxs"])
        x = train_idxs.argsort()
        identity_order = np.arange(x.shape[0])
        print(np.unique(x == identity_order))


[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]


In [32]:
# Shapes of the X_eval / y_eval arrays stored in one eval.npz of mini_20ng (ktr).
L = np.load("/home/welton/data/normal_probas/split_10/mini_20ng/10_folds/ktr/0/0/eval.npz")
L["X_eval"].shape, L["y_eval"].shape

((120, 3), (120,))

In [31]:
# Shapes of the X_test / y_test arrays stored in the matching test.npz of mini_20ng (ktr).
L = np.load("/home/welton/data/normal_probas/split_10/mini_20ng/10_folds/ktr/0/0/test.npz")
L["X_test"].shape, L["y_test"].shape

((600, 3), (600,))

In [2]:
# Demonstrates that np.vstack rejects an empty list: this call raises
# ValueError ("need at least one array to concatenate"), so callers must
# guard against stacking zero arrays.
np.vstack([])

ValueError: need at least one array to concatenate