# Libraries

In [1]:
import os
import copy

import pandas as pd
import numpy as np

from distutils.dir_util import copy_tree
from sklearn.datasets import load_svmlight_file
from itertools import product

In [None]:
# Per-dataset integer that the formatting loop passes as `n_features` when it
# reads the per-fold prediction files (presumably the number of classes of
# each dataset -- TODO confirm).
DATASET_INFO = {
    "20ng": 20,
    "acm": 11,
    "agnews": 4,
    "imdb_reviews": 10,
    "reut": 90,
    "sogou": 5,
    "yahoo": 10,
    "yelp_2015": 5,
    "webkb": 7,
}

# Short model ids follow one scheme: <algorithm letter><representation code>.
# The mapping is generated from the two tables below; iteration order is
# algorithm-major, so the resulting dict has the same keys, values and
# insertion order as a hand-written table would.
_ALGORITHM_PREFIX = {
    "linear_svm": "s",
    "knn": "k",
    "lr": "l",
    "xgboost": "x",
}
_REPRESENTATION_SUFFIX = {
    "fast_text_1/raw_folds": "fr",
    "pte_1/raw_folds": "pr",
    "tf_idf_1/fs": "tr",
    "tf_idf_1/meta_features_1/knn_cos": "tmk",
}

ID_MODEL = {
    f"{algorithm}/{representation}": f"{prefix}{suffix}"
    for (algorithm, prefix), (representation, suffix) in product(
        _ALGORITHM_PREFIX.items(), _REPRESENTATION_SUFFIX.items()
    )
}
# The two transformer models have no algorithm/representation split and use
# their own name as id.
ID_MODEL.update({"xlnet": "xlnet", "bert": "bert"})

# Root of the archived project (read from) and of the new project (written to).
DIR_PROJ_OLD = "/home/christian/arquivado/projeto_stacking"
DIR_PROJ_NEW = "/home/christian/stacking_text_classification"

# Every model path known to ID_MODEL, in declaration order.
MODELS = ID_MODEL.keys()
# Datasets to process.
DATASETS = ["reut", "20ng", "acm", "webkb"]
# Datasets evaluated with 10 folds; any other dataset uses 5 folds.
SMALL_DATASETS = ["20ng", "acm", "webkb", "reut"]

# Formatting Meta-Layer Input

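meta_layer (input files written for each dataset / model / fold):
* test: the model's predicted probabilities for the held-out test fold
* train: the model's predicted probabilities for the training documents, assembled from the test predictions of all the other folds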

In [2]:
def _read_fold_predictions(dataset, model, n_folds, fold_id):
    """Load the stored test predictions of one fold from the old project.

    Reads ``fold_{fold_id}/test.gz`` (svmlight format) below DIR_PROJ_OLD for
    the given dataset / fold count / model path.

    Returns:
        (X, y): X as a dense float64 array whose column count is forced to
        DATASET_INFO[dataset], and y as the label vector stored in the file.

    Raises:
        FileNotFoundError: if the fold file does not exist.
    """
    path = f"{DIR_PROJ_OLD}/output/classification/{dataset}/{n_folds}_folds/{model}/fold_{fold_id}/test.gz"
    X, y = load_svmlight_file(f=path, dtype=np.float64, zero_based=False, n_features=DATASET_INFO[dataset])
    return X.toarray(), y


for dataset, model in product(DATASETS, MODELS):
    # Configs
    model_id = ID_MODEL[model]
    n_folds = 10 if dataset in SMALL_DATASETS else 5
    print(dataset, model, model_id)

    # Execute each fold
    for fold_id in range(n_folds):
        # New files
        dir_fold = f"{DIR_PROJ_NEW}/data/meta_layer_input/{dataset}/{n_folds}_folds/{model_id}/{fold_id}"
        os.makedirs(dir_fold, exist_ok=True)
        file_train_new = f"{dir_fold}/train"
        file_test_new = f"{dir_fold}/test"

        # Skip if already executed
        if os.path.exists(f"{file_train_new}.npz") and os.path.exists(f"{file_test_new}.npz"):
            continue

        # Read test: the predictions stored for the held-out fold itself.
        X_test, y_test = _read_fold_predictions(dataset, model, n_folds, fold_id)

        # Read train: stack the test predictions of every *other* fold, in
        # ascending fold order.
        train_folds = [other for other in range(n_folds) if other != fold_id]
        train_parts = [_read_fold_predictions(dataset, model, n_folds, other) for other in train_folds]
        X_train = np.vstack([X_part for X_part, _ in train_parts])
        y_train = np.hstack([y_part for _, y_part in train_parts])

        # Correct order train
        # NOTE(review): the file is named *.csv but is read with read_pickle;
        # unpickling can execute arbitrary code, so only use trusted files.
        # Presumably one row per fold (row label == fold id) with the
        # documents' original indices in "test_idxs" -- TODO confirm.
        df_split = pd.read_pickle(
            f"{DIR_PROJ_OLD}/input/{dataset}/representations/{n_folds}_folds_splits.csv"
        ).drop(fold_id, axis=0)

        # Indices of the stacked rows, in the (fold-by-fold) order they were
        # stacked above.
        idxs_wrong = [idx for fold_idxs in df_split["test_idxs"].values for idx in fold_idxs.tolist()]
        # Permutation that sorts the stacked rows by those indices; a stable
        # sort keeps the stacking order for equal indices.
        idxs_correct = np.argsort(idxs_wrong, kind="stable")

        X_train = X_train[idxs_correct]
        y_train = y_train[idxs_correct]

        # NOTE(review): nothing is written to file_train_new / file_test_new
        # in this cell, and the two `break`s below stop after the first fold
        # that is not skipped -- looks like leftover debugging; confirm before
        # relying on this loop to produce the .npz files.
        break
    break

20ng linear_svm/fast_text_1/raw_folds sfr
20ng linear_svm/pte_1/raw_folds spr
20ng linear_svm/tf_idf_1/fs str
20ng linear_svm/tf_idf_1/meta_features_1/knn_cos stmk
20ng knn/fast_text_1/raw_folds kfr
20ng knn/pte_1/raw_folds kpr
20ng knn/tf_idf_1/fs ktr
20ng knn/tf_idf_1/meta_features_1/knn_cos ktmk
20ng lr/fast_text_1/raw_folds lfr
20ng lr/pte_1/raw_folds lpr
20ng lr/tf_idf_1/fs ltr
20ng lr/tf_idf_1/meta_features_1/knn_cos ltmk
20ng xgboost/fast_text_1/raw_folds xfr
20ng xgboost/pte_1/raw_folds xpr
20ng xgboost/tf_idf_1/fs xtr
20ng xgboost/tf_idf_1/meta_features_1/knn_cos xtmk
20ng xlnet xlnet
20ng bert bert
acm linear_svm/fast_text_1/raw_folds sfr
acm linear_svm/pte_1/raw_folds spr
acm linear_svm/tf_idf_1/fs str
acm linear_svm/tf_idf_1/meta_features_1/knn_cos stmk
acm knn/fast_text_1/raw_folds kfr
acm knn/pte_1/raw_folds kpr
acm knn/tf_idf_1/fs ktr
acm knn/tf_idf_1/meta_features_1/knn_cos ktmk
acm lr/fast_text_1/raw_folds lfr
acm lr/pte_1/raw_folds lpr
acm lr/tf_idf_1/fs ltr
acm lr/tf

FileNotFoundError: [Errno 2] No such file or directory: '/home/christian/arquivado/projeto_stacking/output/classification/yahoo/5_folds/knn/fast_text_1/raw_folds/fold_1/test.gz'