In [1]:
%%capture
!pip install git+https://github.com/Tomko10/hiclass.git scikit-learn numpy pandas matplotlib

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### handle_nan.py

In [3]:
from typing import Literal, Union
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer, KNNImputer


def impute_mean(x: pd.DataFrame) -> pd.DataFrame:
    """
    Impute NaN values by simple column mean

    :param x: data to be imputed
    :return: new DataFrame with values imputed
    """
    numeric = x.select_dtypes(include=["number"])
    rest = x.select_dtypes(exclude=["number"])
    imp = SimpleImputer(strategy="mean").fit(numeric)
    return pd.concat([pd.DataFrame(imp.transform(numeric), columns=imp.feature_names_in_), rest], axis=1)


def impute_knn(x: pd.DataFrame, k=5) -> pd.DataFrame:
    """
    Impute NaN values by KNN algorithm

    :param x: data to be imputed
    :param k: number of nearest neighbors
    :return: new DataFrame with values imputed
    """
    numeric = x.select_dtypes(include=["number"])
    rest = x.select_dtypes(exclude=["number"])
    imp = KNNImputer(n_neighbors=k).fit(numeric)
    return pd.concat([pd.DataFrame(imp.transform(numeric), columns=imp.feature_names_in_), rest], axis=1)


# disclaimer: may remove all the rows
def remove_nan(x: pd.DataFrame) -> pd.DataFrame:
    """
    Remove rows with NaN values from the data

    :param x: data, possibly containing NaN values
    :return: features and labels without rows containing NaN
    """
    nan_rows = x[x.isnull().T.any()].index
    return x.drop(labels=nan_rows)


ImputerStrategy = Union[Literal["drop"],
                        Literal["knn"],
                        Literal["mean"],
                        Literal["median"],
                        Literal["most_frequent"],
                        Literal["constant"]]


class NumericImputer(TransformerMixin, BaseEstimator):
    def __init__(self, strategy: ImputerStrategy = 'mean', **kwargs):
        self.strategy = strategy
        self.kwargs = kwargs

    def fit(self, X, y=None):
        self.numeric_columns_ = X.select_dtypes(include=["number"]).columns
        self.rest_columns_ = X.select_dtypes(exclude=["number"]).columns

        if self.strategy == "knn":
            if "n_neighbors" not in self.kwargs:
                self.kwargs["n_neighbors"] = 5
            self.imputer_ = KNNImputer(**self.kwargs)
        elif self.strategy == "drop":
            self.imputer_ = None
        else:
            self.imputer_ = SimpleImputer(strategy=self.strategy,
                                          **self.kwargs)

        self.imputer_.fit(X[self.numeric_columns_])
        return self

    def transform(self, X):
        if self.strategy == "drop":
            return X.dropna()

        numeric = X[self.numeric_columns_]
        rest = X[self.rest_columns_]
        numeric_imputed = pd.DataFrame(self.imputer_.transform(numeric),
                                       columns=self.numeric_columns_)
        return pd.concat([numeric_imputed, rest], axis=1)


### load.py

In [4]:
import zipfile
from os import sep, remove
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


class Dataset:
    """
    Represents ARFF dataset for hierarchical classification.

    Requires directory structure as follows:

        load.py
        datasets_FUN -> XXX_FUN -> XXX_FUN.{train,test,valid}.arff.zip
    """

    def __init__(self, dataset_name: str, nan_strategy: str = "mean", args=None):
        """
        Create Dataset object, consisting of training/testing/validation data

        :param dataset_name: name of the dataset (without _FUN suffix) - one of {cellcycle, church, derisi, eisen,
        expr, gasch1, gasch2, hom, pheno, seq, spo, struc}
        :param nan_strategy: strategy to be used for NaN values - one of "mean", "knn", "remove". If not
        provided or not one of allowed, "mean" is used
        :param args: possible dictionary of arguments to NaN-handling functions

        the hom_FUN dataset is quite large and takes a lot of time to process
        struc_FUN takes moderate amount of time (around 5 minutes on my laptop)
        """
        path = sep.join(["datasets_FUN", f"{dataset_name}_FUN"])

        def _read(which: str) -> pd.DataFrame:
            file = f"{dataset_name}_FUN.{which}.arff"
            p = sep.join([path, file+".zip"])
            zipfile.ZipFile(p).extract(file)

            # Apparently, scipy cannot read hierarchical attributes
            def _read_arff(f: str) -> pd.DataFrame:
                attr_names = []
                with open(f) as arff_file:
                    reading_attrs = True
                    types = {"class": "object"}
                    # gasch1 dataset has two columns of the same name
                    used_names, i = set(), 0
                    while reading_attrs:
                        attr = arff_file.readline().strip().split()
                        if (not attr) or (attr[0].upper() != "@ATTRIBUTE"):
                            if attr and attr[0].upper() == "@DATA":
                                reading_attrs = False
                            continue
                        if attr[1] in used_names:
                            attr[1] += f"_{i}"
                            i += 1
                        attr_names.append(attr[1])
                        used_names.add(attr[1])
                        if attr[2].startswith("{"):
                            types[attr[1]] = "category"

                    d = pd.read_csv(arff_file, names=attr_names, na_values=["?"], dtype=types)

                    # might be a problem when "mean" imputing one-hot encoded category with > 2 levels
                    categories = [column for column, t in types.items() if t == "category"]
                    if categories:
                        enc = OneHotEncoder(sparse_output=False)
                        encoded_columns = enc.fit_transform(d[categories])
                        encoded_df = pd.DataFrame(encoded_columns,
                                                  columns=enc.get_feature_names_out(categories))
                        d = pd.concat([d.drop(columns=categories), encoded_df], axis=1)

                    d["class"] = d["class"].map(lambda x: [label for label in x.split("@")])

                    # touches also test data, but since HiClass classifiers cannot handle NaN values, this has to be
                    # done anyway
                    if nan_strategy == "remove" and which != "test":
                        d = remove_nan(d)
                    elif nan_strategy == "knn":
                        k = 5
                        if args is not None:
                            k = args.get("k", 5)
                        d = impute_knn(d, k)
                    elif nan_strategy == "mean":
                        d = impute_mean(d)

                return d

            data = _read_arff(file)
            remove(file)
            return data

        self.train = _read("train")
        self.test = _read("test")
        self.valid = _read("valid")

    def _x(self, data: pd.DataFrame, expand: bool = False) -> pd.DataFrame:
        df = data.copy()
        if expand:
            df = self.expand_multi_class(df)
        df.pop("class")
        return df

    def _y(self, data: pd.DataFrame, expand: bool = False) -> pd.Series:
        df = data.copy()
        if expand:
            df = self.expand_multi_class(df)
            return df["class"].apply(lambda x: x.split("/"))
        return df["class"].apply(lambda x: list(map((lambda y: y.split("/")), x)))

    def x_train(self, expand: bool = False) -> pd.DataFrame:
        """
        Get features (not classes) of examples in training part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with classes removed
        """
        return self._x(self.train, expand)

    def y_train(self, expand: bool = False) -> pd.Series:
        """
        Get classes (not features) of examples in training part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with features removed
        """
        return self._y(self.train, expand)

    def x_test(self, expand: bool = False) -> pd.DataFrame:
        """
        Get features (not classes) of examples in test part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with classes removed
        """
        return self._x(self.test, expand)

    def y_test(self, expand: bool = False) -> pd.Series:
        """
        Get classes (not features) of examples in test part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with features removed
        """
        return self._y(self.test, expand)

    def x_valid(self, expand: bool = False) -> pd.DataFrame:
        """
        Get features (not classes) of examples in validation part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with classes removed
        """
        return self._x(self.valid, expand)

    def y_valid(self, expand: bool = False) -> pd.Series:
        """
        Get classes (not features) of examples in validation part of dataset

        :param expand: whether to expand multi-label rows to multiple records
        :return: copy of dataframe with features removed
        """
        return self._y(self.valid, expand)

    @staticmethod
    def expand_multi_class(df: pd.DataFrame) -> pd.DataFrame:
        """
        Expand the dataset so that each row has just one class label

        :param df: dataset to be expanded
        :return: the same dataset, but multi-label rows are duplicated for each label
        """
        return df.explode("class", ignore_index=True)


### feature_selection.py

In [5]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_selection import (
    SelectorMixin,
    mutual_info_classif,
    SelectKBest,
)
from math import sqrt, floor
from sklearn.tree import DecisionTreeClassifier
from hiclass.MultiLabelLocalClassifierPerNode import (
    MultiLabelLocalClassifierPerNode,
)
from hiclass.metrics import f1
from random import seed


def fill_reshape(y) -> np.ndarray:
    """
    Transform the multi-label part of the dataset to regular shape, so F1 metric can be used

    :param y: labels (not expanded)
    :return: array of x=hierarchy, y=labels per example, z=examples
    """
    if isinstance(y, pd.Series):
        max_len = y.apply(len).agg("max")
        depth = y.apply(lambda x: max(map(len, x))).agg("max")
    else:
        max_len = max(map(len, y))
        depth = max(map(lambda x: max(map(len, x)), y))

    return np.array([
        [
            list(label) + [""] * (depth - len(label))
            for label in row
        ] + [
            [""] * depth
        ] * (max_len - len(row))
        for row in y
    ])


class ModSelectKBest(SelectorMixin, BaseEstimator):
    """
    Perform "flat" selection of k best parameters based on mutual information. The hierarchy is ignored and labels
    worked with as strings

    Sample usage:
        import load
        import feature_selection
        from sklearn.tree import DecisionTreeClassifier
        from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
        from hiclass.metrics import f1

        dataset = load.Dataset("cellcycle", nan_strategy="mean")
        x_train, y_train = dataset.x_train(), dataset.y_train()
        x_test, y_test = dataset.x_test(), dataset.y_test()

        tree = DecisionTreeClassifier()
        classifier = MultiLabelLocalClassifierPerNode(local_classifier=tree)

        selector = ModSelectKBest().fit(x_train, y_train)
        x_train = selector.transform(x_train)

        classifier.fit(x_train, y_train)

        y_pred = classifier.predict(selector.transform(x_test))
        print(f1(fill_reshape(y_test), y_pred))
        """
    def __init__(self, *, k=10, sqrt_features=False):
        """
        Create ModSelectKBest object

        :param k: number of features to be chosen
        :param sqrt_features: take square root of number of features as k
        """
        self.k = k
        self.sqrt_features = sqrt_features

    def set_params(self, k=10, sqrt_features=False) -> 'ModSelectKBest':
        self.k = k
        self.sqrt_features = sqrt_features
        return self

    def fit(self, x, y):
        self.n_features_in_ = x.shape[1]

        y_pd = pd.DataFrame(y, columns=["class"])
        x_exp = self._expand_multi_class(pd.concat([x, y_pd], axis=1))
        y_exp = x_exp["class"].copy()
        x_exp.drop(columns="class", inplace=True)

        y_exp = y_exp.map(lambda label: "/".join(label))
        if self.sqrt_features:
            self.k = floor(sqrt(x.shape[1]))
        self.selector_ = SelectKBest(mutual_info_classif, k=self.k).fit(x_exp, y_exp)
        self.feature_names_in_ = x.columns
        return self

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.selector_.get_support(False)

    def _more_tags(self):
        return {"requires_y": True}

    @staticmethod
    def _expand_multi_class(df: pd.DataFrame) -> pd.DataFrame:
        """
        Expand the dataset so that each row has just one class label

        :param df: dataset to be expanded
        :return: the same dataset, but multi-label rows are duplicated for each label
        """
        return df.explode("class", ignore_index=True)


class IterativeSelect(SelectorMixin, BaseEstimator):
    """
    Perform iterative selection of k best parameters based on fit to hiclass.MultiLabelLocalClassifierPerNode +
    sklearn.tree.DecisionTreeClassifier measured as F1 score.

    Selection of feature subset for each epoch is (pseudo)random, thus results may vary if seed is not specified.

    Sample usage:
        import load
        import feature_selection
        from sklearn.tree import DecisionTreeClassifier
        from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
        from hiclass.metrics import f1

        dataset = load.Dataset("cellcycle", nan_strategy="mean")

        x_train, y_train = dataset.x_train(), dataset.y_train()
        x_valid, y_valid = dataset.x_valid(), dataset.y_valid()
        x_test, y_test = dataset.x_test(), dataset.y_test()

        tree = DecisionTreeClassifier()
        classifier = MultiLabelLocalClassifierPerNode(local_classifier=tree)

        selector = IterativeSelect(x_valid=x_valid, y_valid=y_valid, r_seed=42).fit(x_train, y_train)
        x_train = selector.transform(x_train)

        classifier.fit(x_train, y_train)

        y_pred = classifier.predict(selector.transform(x_test))
        print(f1(fill_reshape(y_test), y_pred))
    """
    def __init__(self,
                 *,
                 x_valid: pd.DataFrame,
                 y_valid: pd.DataFrame,
                 k=10,
                 sqrt_features=False,
                 epochs=10,
                 r_seed=None,
                 verbose=False):
        """
        Create IterativeSelect object

        :param x_valid: validation data to compare selections - features
        :param y_valid: validation data to compare selections - labels
        :param k: number of features to choose
        :param sqrt_features: choose square root of number of features instead of k
        :param epochs: number of iterations
        :param r_seed: seed to subset generator
        :param verbose: print logs to output
        """
        self.x_valid = x_valid
        self.y_valid = y_valid
        self.k = k
        self.sqrt_features = sqrt_features
        self.epochs = epochs
        self.r_seed = r_seed
        self.verbose = verbose

    def set_params(self,
                   *,
                   x_valid: pd.DataFrame,
                   y_valid: pd.DataFrame,
                   k=10,
                   sqrt_features=False,
                   epochs=10,
                   r_seed=None) -> 'IterativeSelect':
        self.x_valid = x_valid
        self.y_valid = y_valid
        self.k = k
        self.sqrt_features = sqrt_features
        self.epochs = epochs
        self.r_seed = r_seed
        return self

    def _get_support_mask(self):
        check_is_fitted(self)

        mask = np.zeros(len(self.x_valid.columns), dtype=bool)
        mask[self.sample_best_] = True
        return mask

    def fit(self, X, y) -> 'IterativeSelect':
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = X.columns

        if self.sqrt_features:
            self.k = floor(sqrt(X.shape[1]))

        if self.r_seed is not None:
            seed(self.r_seed)

        y_valid_reshaped = fill_reshape(self.y_valid)

        f1_best = 0
        self.sample_best_ = []

        for i in range(self.epochs):
            s = np.zeros(self.n_features_in_, dtype=bool)
            s[np.random.choice(self.n_features_in_,
                               self.k,
                               replace=False)] = True

            tree = DecisionTreeClassifier()
            classifier = MultiLabelLocalClassifierPerNode(
                local_classifier=tree
            )

            classifier.fit(X.loc[:, s], y)

            y_pred = classifier.predict(self.x_valid.loc[:, s])
            score = f1(y_valid_reshaped, y_pred)

            if score > f1_best:
                f1_best = score
                self.sample_best_ = s

            if self.verbose:
                print(f"Epoch {i+1}/{self.epochs}: F1 score "
                      f"on validation set {round(score, 5)}",
                      flush=True)

        return self

    def _more_tags(self):
        return {"requires_y": True}


## Common constants, model definition

In [6]:
from typing import Union, Literal
from sklearn.tree import DecisionTreeClassifier
from hiclass import MultiLabelLocalClassifierPerNode
from sklearn.linear_model import LogisticRegression

FeatureSelector = Union[ModSelectKBest, IterativeSelect]

IMPUTER_STRATEGY = "mean"
IMPUTER_KWARGS = {}
MODEL_STEPS = [
    # ("model", MultiLabelLocalClassifierPerNode(DecisionTreeClassifier())),
    ("model", MultiLabelLocalClassifierPerNode(
        local_classifier=LogisticRegression(
            penalty='l2',
            C=0.01,
            solver='lbfgs',
            max_iter=10000
        )
    ))
]

In [7]:
d = Dataset("cellcycle", nan_strategy="none")

## Common functions (evaluation etc.)

In [8]:
from typing import Callable, Dict, Optional, Tuple, Any


def prep_dataset(dataset: Dataset,
                 imputer_strategy: ImputerStrategy,
                 imputer_kwargs: Optional[Dict[str, Any]] = None,
                 x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
                 ) -> Tuple[Dict[str, pd.DataFrame],
                            NumericImputer]:
    imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))
    data: Dict[str, pd.DataFrame] = {}

    data["x_train"] = dataset.x_train()
    if x_train_prep is not None:
        data["x_train"] = x_train_prep(data["x_train"])
    data["y_train"] = dataset.y_train()
    imputer.fit(data["x_train"], data["y_train"])

    data["x_valid"] = dataset.x_valid()
    data["y_valid"] = dataset.y_valid()
    data["x_valid"] = imputer.transform(data["x_valid"])
    
    data["x_test"] = dataset.x_test()
    data["y_test"] = dataset.y_test()
    data["y_test_reshaped"] = fill_reshape(data["y_test"])
    return data, imputer

In [9]:
from typing import Any, Callable, Dict, List, Optional, Tuple
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from hiclass.metrics import f1

def evaluate(dataset: Optional[Dataset] = None,
             data: Optional[Dict[str, pd.DataFrame]] = None,
             model_steps: List[Tuple[str, BaseEstimator]] = [],
             imputer_strategy: ImputerStrategy = "mean",
             imputer_kwargs: Optional[Dict[str, Any]] = None,
             x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
             name: str = "baseline",
             verbose: bool = True,
             verbose_pipe: bool = False):
    if data is None:
        if dataset is None:
            raise ValueError("Either dataset or data must be provided")
        data, imputer = prep_dataset(dataset,
                                     imputer_strategy,
                                     imputer_kwargs,
                                     x_train_prep)
    else:
        imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))

    pipeline = Pipeline([
        ("imputer", imputer),
        *model_steps,
    ], verbose=verbose_pipe, memory="cache")
    
    pipeline.fit(data["x_train"], data["y_train"])
    y_pred = pipeline.predict(data["x_test"])
    micro_score = f1(data["y_test_reshaped"], fill_reshape(y_pred), "micro")
    macro_score = f1(data["y_test_reshaped"], fill_reshape(y_pred), "macro")
    if verbose:
        print(f"{name}: {micro_score:.4f} | {macro_score:.4f}", flush=True)
    return name, micro_score, macro_score
    

In [10]:
base_name, base_micro_score, base_macro_score = evaluate(
    dataset=d,
    model_steps=MODEL_STEPS,
    imputer_strategy="mean",
    verbose=False,
)
print(f"{base_name}: {base_micro_score:.4f} | {base_macro_score:.4f}", flush=True)


baseline: 0.4073 | 0.4044


## Feature selection

In [11]:
from typing import Any, Dict, Iterable, Optional
from itertools import product
from pprint import PrettyPrinter


pp = PrettyPrinter(indent=4)


def feature_selection(dataset: Dataset,
                      model_steps: List[Tuple[str, BaseEstimator]],
                      imputer_strategy: ImputerStrategy,
                      imputer_kwargs: Optional[Dict[str, Any]] = None,
                      n_feature_splits: Optional[int] = None,
                      n_features: Optional[Iterable[int]] = None,
                      n_epochs: int | Iterable[int] = 100,
                      verbose: bool = True,
                      verbose_pipe: bool = False):
    data, _ = prep_dataset(dataset, imputer_strategy, imputer_kwargs)
    train_n_features = data["x_train"].shape[1]
    
    if n_features is None:
        if n_feature_splits is None:
            n_feature_splits = 5

        n_features = [round(k * train_n_features / (n_feature_splits + 1))
                      for k
                      in range(1, n_feature_splits + 1)]
    elif any(not (0 < k <= train_n_features) for k in n_features):
            raise ValueError("Invalid number of features")
    
    if isinstance(n_epochs, int):
        n_epochs = [n_epochs]
    
    selectors = (
        ("Select K-Best",
         [{"k": k} for k in n_features]),
        ("Iterative Random Select",
         [
             {
                 "k": k,
                 "x_valid": data["x_valid"],
                 "y_valid": data["y_valid"],
                 "epochs": epochs,
                 "verbose": verbose,
             }
             for k, epochs
             in product(n_features, n_epochs)
         ])
    )
    
    for key, kwargs in selectors:
        for kw in kwargs:
            selector = (ModSelectKBest(**kw)
                        if key == "Select K-Best"
                        else IterativeSelect(**kw))
            
            name, micro_score, macro_score = evaluate(
                data=data,
                model_steps=[("selector", selector), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"{key} ({kw.get('k', 'X')})",
                verbose=verbose,
                verbose_pipe=verbose_pipe
            )
            yield name, micro_score, macro_score


In [12]:
feat_sel_results = feature_selection(d,
                                     MODEL_STEPS,
                                     imputer_strategy=IMPUTER_STRATEGY,
                                     imputer_kwargs=IMPUTER_KWARGS,
                                     n_feature_splits=1,
                                     n_epochs=1,
                                     verbose=False)

print(f"{base_name:<20}: {base_micro_score:.4f} | {base_macro_score:.4f}", flush=True)
for sel_name, sel_micro_score, sel_macro_score in feat_sel_results:
    print(f"{sel_name:<20}: {sel_micro_score:.4f} | {sel_micro_score:.4f}", flush=True)

baseline            : 0.4073 | 0.4044


## Noisy data

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from hiclass.metrics import f1
from scipy import stats


def add_noise_to_train(train_df, percentage):
    noisy_train = train_df.copy()
    numeric_columns = train_df.select_dtypes(include=[np.number]).columns
    noise = (train_df[numeric_columns].max() - train_df[numeric_columns].min()) * (percentage / 100)
    noisy_train[numeric_columns] += noise
    return noisy_train

def add_nominal_noise(train_df, percentage):
    noisy_df = train_df.copy()
    nominal_columns = df.select_dtypes(include=['category', 'object']).columns
    for col in nominal_columns:
        if col == 'class':
            continue
        unique_values = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()
        indices = noisy_df.sample(frac=percentage / 100).index
        noisy_df.loc[indices, col] = np.random.choice(unique_values, size=len(indices))
    return noisy_df

class NoisePreprocessor(TransformerMixin, BaseEstimator):
    def __init__(self, scaling_method='standard', outlier_method='zscore', outlier_threshold=3.0):
        self.scaling_method = scaling_method
        self.outlier_method = outlier_method
        self.outlier_threshold = outlier_threshold
        self.scaler_ = None

    def fit(self, X, y=None):
        if self.outlier_method in ['zscore', 'iqr', 'clip', 'mean']:
            X_clean, _ = self._remove_outliers(X, y)
        else:
            X_clean = X
        
        return self

    def transform(self, X, y=None):
        X = X.copy()

        if self.outlier_method in ['zscore', 'iqr', 'clip', 'mean']:
            X, y = self._remove_outliers(X, y)
        
        if y is not None:
            return X, y
        return X

    def _remove_outliers(self, df, y=None):
        if self.outlier_method == 'zscore':
            return self._remove_outliers_zscore(df, y)
        elif self.outlier_method == 'iqr':
            return self._remove_outliers_iqr(df, y)
        elif self.outlier_method == 'clip':
            return self._clip_outliers(df, y)
        elif self.outlier_method == 'mean':
            return self._replace_outliers_with_mean(df, y)
        else:
            raise ValueError(f"Unknown outlier method: {self.outlier_method}")

    def _remove_outliers_zscore(self, df, y=None):
        numeric_cols = df.select_dtypes(include=[np.number])
        z_scores = np.abs(stats.zscore(numeric_cols))
        filtered_entries = (z_scores < self.outlier_threshold).all(axis=1)
        if y is not None:
            return df.loc[filtered_entries], y.loc[filtered_entries]
        return df.loc[filtered_entries], y

    def _remove_outliers_iqr(self, df, y=None):
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        filter = (df >= (Q1 - 1.5 * IQR)) & (df <= (Q3 + 1.5 * IQR))
        filtered_entries = filter.all(axis=1)
        if y is not None:
            return df.loc[filtered_entries], y.loc[filtered_entries]
        return df.loc[filtered_entries], y

    def _clip_outliers(self, df, y=None, lower_percentile=0.01, upper_percentile=0.99):
        lower_bound = df.quantile(lower_percentile)
        upper_bound = df.quantile(upper_percentile)
        df_clipped = df.clip(lower=lower_bound, upper=upper_bound, axis=1)
        return df_clipped, y

    def _replace_outliers_with_mean(self, df, y=None):
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            mean = df[col].mean()
            std = df[col].std()
            outliers = np.abs((df[col] - mean) / std) > self.outlier_threshold
            df.loc[outliers, col] = mean
        return df, y

    def _scale_data(self, df):
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df.loc[:, numeric_cols] = self.scaler_.transform(df[numeric_cols])
        return df


In [None]:
def noisy_data(dataset: Dataset,
                      model_steps: List[Tuple[str, BaseEstimator]],
                      imputer_strategy: ImputerStrategy,
                      imputer_kwargs: Optional[Dict[str, Any]] = None,
                      verbose: bool = False):
    for i in range(-10, 11):
        if i == 0:
            continue
        data, _ = prep_dataset(
                    dataset,
                    imputer_strategy,
                    imputer_kwargs,
                    x_train_prep=lambda x: add_noise_to_train(x, percentage=i)
                )
        
        args = [
            {'outlier_method': 'mean',
             'scaling_method': None
            }
        ]
        
        for kw in args:
            preprocessor = NoisePreprocessor(**kw)
            
            name, score = evaluate(
                data=data,
                model_steps=[("preprocessor", preprocessor), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"numeric {i}%",
                verbose=verbose
            )
            yield name, score

    for i in range(1, 11):
        if i == 0:
            continue
        data, _ = prep_dataset(
                    dataset,
                    imputer_strategy,
                    imputer_kwargs,
                    x_train_prep=lambda x: add_nominal_noise(x, percentage=i)
                )
        
        args = [
            {'outlier_method': 'mean',
             'scaling_method': None
            }
        ]
        
        for kw in args:
            preprocessor = NoisePreprocessor(**kw)
            
            name, score = evaluate(
                data=data,
                model_steps=[("preprocessor", preprocessor), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"nominal {i}%",
                verbose=verbose
            )
            yield name, score

In [None]:
feat_sel_results = noisy_data(d,
                             MODEL_STEPS,
                             imputer_strategy=IMPUTER_STRATEGY,
                             imputer_kwargs=IMPUTER_KWARGS,
                             verbose=False)

print(f"{base_name:<16}: {base_f1_score:.4f}", flush=True)
for sel_name, f1_score in feat_sel_results:
    print(f"{sel_name:<16}: {f1_score:.4f}", flush=True)

baseline        : 0.4073
numeric -10%    : 0.3602
numeric -9%     : 0.3630
numeric -8%     : 0.3649
numeric -7%     : 0.3674



KeyboardInterrupt



## Semi-supervised learning

In [None]:
from sklearn.semi_supervised import SelfTrainingClassifier
from random import sample
from sklearn.tree import DecisionTreeClassifier
from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from hiclass.metrics import f1
import matplotlib.pyplot as plt


def create_classifier():
    """
    Create and configure a MultiLabelLocalClassifierPerNode with a self-training decision tree classifier.

    Returns:
    MultiLabelLocalClassifierPerNode: An instance of MultiLabelLocalClassifierPerNode configured
                                      with a SelfTrainingClassifier that uses a DecisionTreeClassifier
                                      as its base estimator. This setup is particularly suited for scenarios
                                      where multilabel classification is required and some of the training
                                      data might be unlabeled.
    Sample usage:
        import feature_selection
        from hiclass.metrics import f1
        from load import Dataset

        dataset = Dataset("cellcycle", nan_strategy="mean")
        x_test, y_test = dataset.x_test(), dataset.y_test()
        x_train, y_train = dataset.x_train(), dataset.y_train()

        classifier = create_classifier()

        selector = feature_selection.ModSelectKBest().fit(x_train, y_train)
        x_train = selector.transform(x_train)

        classifier.fit(x_train, remove_labels(dataset.y_train(), percentage))

        y_pred = classifier.predict(selector.transform(x_test))
        print(f1(feature_selection.fill_reshape(y_test), y_pred))
    """
    tree = DecisionTreeClassifier()
    self_train = SelfTrainingClassifier(base_estimator=tree)
    return MultiLabelLocalClassifierPerNode(local_classifier=self_train)


def remove_labels(labels, percentage):
    """
    Randomly sets a specified percentage of labels in the input list to [[-1]].

    :param labels (list of list): A list of labels, where each label itself can be a list of items.
    :param percentage (float): The fraction of labels to remove, represented as a float between 0 and 1.
                        For example, 0.2 means 20% of the labels will be set to [[-1]].

    Returns:
    list of list of list: The modified list of labels with a percentage of its elements set to [[-1]].

    Raises:
    - ValueError: If 'percentage' is not between 0 and 1.
    """
    if not (0 <= percentage <= 1):
        raise ValueError("Percentage of unlabeled data must be between 0 and 1.")
    size = len(labels)
    rand_indexes = sample(range(0, size), round(size*percentage))
    for i in rand_indexes:
        labels[i] = [[-1]]
    return labels


def train(dataset: Dataset, percentage: float):
    """
    Create and run semi-supervised learner on the dataset with the percentage of unlabeled data.

    :param dataset: An object representing the dataset.
    :param percentage: The fraction of the training data to be treated as unlabeled, expressed as a
      decimal (e.g., 1.0 for 100%, 0.2 for 20%).
    :param feats: features to be selected from data

    Sample usage:
        from load import Dataset

        train(Dataset("eisen"), 0.1)
    """
    x_test, y_test = dataset.x_test(), dataset.y_test()
    x_train, y_train = dataset.x_train(), dataset.y_train()

    classifier = create_classifier()

    selector = ModSelectKBest().fit(x_train, y_train)
    x_train = selector.transform(x_train)

    classifier.fit(x_train, remove_labels(dataset.y_train(), percentage))

    y_pred = classifier.predict(selector.transform(x_test))
    return f1(fill_reshape(y_test), y_pred)
    
results = []
percentages = [0.01, 0.025, 0.05, 0.075, 0.1]

for percentage in percentages:
    results.append(train(Dataset("derisi"), percentage))

plt.figure()
plt.plot(percentages, results, marker='o')
plt.title('Model Performance vs. Percentage of Unlabeled Data')
plt.xlabel('Percentage of Unlabeled Data')
plt.ylabel('F1 Score')
plt.grid(True)
plt.show()