# Description

Stacking of probabilities + TFIDF representation.

# Libraries/Configs

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import itertools
import warnings

from typing import List, Tuple

import numpy as np
import scipy as scp
import pandas as pd

# Modelling
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from optuna.integration import OptunaSearchCV
from joblib import load, dump

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../../scr")
from constants import DATASETS, REPRESENTATIONS, ALGORITHMS
from models import get_classifier

Execution configurations:

In [4]:
dir_projeto = "/home/christian/Documentos/mestrado/projetos/projeto_stacking2/stacking_text_classification"

dir_cls_input = f"{dir_projeto}/data/classification_input"
dir_meta_input = f"{dir_projeto}/data/meta_layer_input"

dir_output = f"{dir_projeto}/data/stacking_output"

algorithms_execution = list(ALGORITHMS.values())  # All 18 algorithms
datasets_execution = ["webkb", "20ng", "acm", "reut"]
representation = "tmk"
meta_layer = "linear_svm"
n_folds = 10
seed = 42

# Otimization
opt_cv = 5
opt_n_iter = 10
opt_scoring = "f1_macro"

Functions:

In [5]:
def load_x_y(
    file, 
    test_train : str
) -> Tuple[np.ndarray, np.ndarray]:
    loaded = np.load(file, allow_pickle=True)
    
    X = loaded[f"X_{test_train}"]
    y = loaded[f"y_{test_train}"]
    
    if X.size == 1:
        X = X.item()

    return X, y


def read_train_test_meta(
    dir_meta_input: str,
    dataset: str,
    n_folds: int,
    fold_id: int,
    algorithms: List[str]
) -> Tuple[np.ndarray, np.ndarray]:
    Xs_train, Xs_test = [], []

    for alg in algorithms:
        file_train_meta = f"{dir_meta_input}/{dataset}/{n_folds}_folds/{alg}/{fold_id}/train.npz"
        file_test_meta = f"{dir_meta_input}/{dataset}/{n_folds}_folds/{alg}/{fold_id}/test.npz"

        X_train_meta, _ = load_x_y(file_train_meta, 'train')
        X_test_meta, _ = load_x_y(file_test_meta, 'test')

        Xs_train.append(X_train_meta)
        Xs_test.append(X_test_meta)

    X_train_meta = np.hstack(Xs_train)
    X_test_meta = np.hstack(Xs_test)
    
    return X_train_meta, X_test_meta

# Execution

In [6]:
def execute_optimization(
    classifier_name: str, 
    file_model: str,
    opt_cv: int, 
    opt_n_iter: int,
    opt_scoring: str,
    X_train: np.ndarray,
    y_train: np.ndarray,
    seed: int = 42
):
    # Optimization/Training
    classifier, hyperparameters = get_classifier(classifier_name=classifier_name)
    pipeline = Pipeline([
        ("scaler", StandardScaler(with_mean=False)), 
        ("classifier", classifier)
    ])
    hyperparameters = {f"classifier__{k}": v for k, v in hyperparameters.items()}
    
    optuna_search = OptunaSearchCV(
        pipeline,
        hyperparameters,
        cv=StratifiedKFold(opt_cv, shuffle=True, random_state=seed),
        error_score="raise",
        n_trials=opt_n_iter,
        random_state=seed,
        scoring=opt_scoring
    )
    
    os.makedirs(os.path.dirname(file_model), exist_ok=True)
    if os.path.exists(file_model):
        print("\tModel already executed! Loading model...", end="")
        optuna_search = load(file_model)
    else:
        print("\tExecuting model...", end="")
        optuna_search.fit(X_train, y_train)
        dump(optuna_search, file_model)
        
    return optuna_search

In [7]:
iterations = itertools.product(datasets_execution, range(n_folds))
for (dataset, fold_id) in iterations:
    print(f"Dataset {dataset.upper():10s} - Fold {str(fold_id):5s}")
    
    # Reading labels
    file_train_cls = f"{dir_cls_input}/{dataset}/{n_folds}_folds/{representation}/{fold_id}/train.npz"
    file_test_cls = f"{dir_cls_input}/{dataset}/{n_folds}_folds/{representation}/{fold_id}/test.npz"

    _, y_train = load_x_y(file_train_cls, "train")
    _, y_test = load_x_y(file_test_cls, "test")
    
    # Reading meta-layer input (classification probabilities)
    X_train_meta, X_test_meta = read_train_test_meta(
        dir_meta_input, dataset, n_folds, fold_id, algorithms_execution)
    
    # Reading extra features
    X_train_extra = pd.read_pickle("../../data/extra_features/fold_0/webkb/train.csv").values
    X_test_extra = pd.read_pickle("../../data/extra_features/fold_0/webkb/test.csv").values
    
    # Check shapes
    assert len(y_test) == len(X_test_extra), "X meta input differs from y shape."
    assert len(y_train) == len(X_train_extra), "X meta input differs from y shape."
    
    # Concat proba + Extra Fatures
    new_X_train_meta = np.hstack([X_train_meta, X_train_extra])
    new_X_test_meta = np.hstack([X_test_meta, X_test_extra])
    
    # Optimization/Training
    file_model = f"{dir_output}/{dataset}/{n_folds}_folds/{meta_layer}/proba_tmk_input/fold_{fold_id}/model.joblib"
    optuna_search = execute_optimization(
        meta_layer, 
        file_model,
        opt_cv, 
        opt_n_iter,
        opt_scoring,
        new_X_train_meta,
        y_train
    )
    
    # Prediction
    y_pred = optuna_search.predict(new_X_test_meta)
    f1_macro = f1_score(y_test, y_pred, average="macro")
    f1_micro = f1_score(y_test, y_pred, average="micro")
    
    msg = f"""
    \tF1-Macro: {f1_macro:.4f}
    \tF1-Micro: {f1_micro:.4f}
    """
    print(msg)

Dataset WEBKB      - Fold 0    
	Model already executed! Loading model...

ValueError: X has 129 features, but StandardScaler is expecting 336 features as input.

In [8]:
pd.read_pickle("../../data/extra_features/fold_0/webkb/train.csv").fillna(0.)

Unnamed: 0,idf,chi2,docs_length
0,17126.324675,166.120697,77
1,15072.780488,257.762628,123
2,13426.634855,188.803384,241
3,15890.393103,199.406828,145
4,13987.573529,375.363024,68
...,...,...,...
7371,13761.651961,271.388170,204
7372,14769.755725,339.233880,131
7373,13264.870229,411.894652,131
7374,13976.547425,249.872397,369
