In [1]:
from load import Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
d = Dataset("cellcycle", nan_strategy="none")

In [3]:
from typing import Union, Literal
from feature_selection import ModSelectKBest, IterativeSelect
from sklearn.tree import DecisionTreeClassifier
from hiclass import MultiLabelLocalClassifierPerNode
from sklearn.linear_model import LogisticRegression

FeatureSelector = Union[ModSelectKBest, IterativeSelect]
# Can't be imported for some reason
ImputerStrategy = Union[Literal["drop"],
                        Literal["knn"],
                        Literal["mean"],
                        Literal["median"],
                        Literal["most_frequent"],
                        Literal["constant"]]

IMPUTER_STRATEGY = "mean"
IMPUTER_KWARGS = {}
MODEL_STEPS = [
    # ("model", MultiLabelLocalClassifierPerNode(DecisionTreeClassifier())),
    ("model", MultiLabelLocalClassifierPerNode(
        local_classifier=LogisticRegression(
            penalty='l2',
            C=0.01,
            solver='lbfgs',
            max_iter=10000
        )
    ))
]

In [4]:
from typing import Callable, Dict, Optional, Tuple, Any

from feature_selection import fill_reshape
from handle_nan import NumericImputer


def prep_dataset(dataset: Dataset,
                 imputer_strategy: ImputerStrategy,
                 imputer_kwargs: Optional[Dict[str, Any]] = None,
                 x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
                 ) -> Tuple[Dict[str, pd.DataFrame],
                            NumericImputer]:
    imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))
    data: Dict[str, pd.DataFrame] = {}

    data["x_train"] = dataset.x_train()
    if x_train_prep is not None:
        data["x_train"] = x_train_prep(data["x_train"])
    data["y_train"] = dataset.y_train()
    imputer.fit(data["x_train"], data["y_train"])

    data["x_valid"] = dataset.x_valid()
    data["y_valid"] = dataset.y_valid()
    data["x_valid"] = imputer.transform(data["x_valid"])
    
    data["x_test"] = dataset.x_test()
    data["y_test"] = dataset.y_test()
    data["y_test_reshaped"] = fill_reshape(data["y_test"])
    return data, imputer

In [5]:
from typing import Any, Callable, Dict, List, Optional, Tuple
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from hiclass.metrics import f1
from handle_nan import NumericImputer
from feature_selection import fill_reshape

def evaluate(dataset: Optional[Dataset] = None,
             data: Optional[Dict[str, pd.DataFrame]] = None,
             model_steps: List[Tuple[str, BaseEstimator]] = [],
             imputer_strategy: ImputerStrategy = "mean",
             imputer_kwargs: Optional[Dict[str, Any]] = None,
             x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
             name: str = "baseline",
             verbose: bool = False):
    if data is None:
        if dataset is None:
            raise ValueError("Either dataset or data must be provided")
        data, imputer = prep_dataset(dataset,
                                     imputer_strategy,
                                     imputer_kwargs,
                                     x_train_prep)
    else:
        imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))

    pipeline = Pipeline([
        ("imputer", imputer),
        *model_steps,
    ], verbose=verbose, memory="cache")
    
    pipeline.fit(data["x_train"], data["y_train"])
    y_pred = pipeline.predict(data["x_test"])
    score = f1(data["y_test_reshaped"], fill_reshape(y_pred))
    if verbose:
        print(f"{name}: {score}", flush=True)
    return name, score
    

In [6]:
base_name, base_f1_score = evaluate(
    dataset=d,
    model_steps=MODEL_STEPS,
    imputer_strategy="mean",
    verbose=True
)
print(f"{base_name:<10}: {base_f1_score:.4f}", flush=True)


[Pipeline] ............. (step 2 of 2) Processing model, total= 1.3min
baseline: 0.4073478144274604
baseline  : 0.4073


In [40]:
from typing import Any, Dict, Iterable, Optional
from feature_selection import ModSelectKBest, IterativeSelect
from itertools import product
from pprint import PrettyPrinter


pp = PrettyPrinter(indent=4)


def feature_selection(dataset: Dataset,
                      model_steps: List[Tuple[str, BaseEstimator]],
                      imputer_strategy: ImputerStrategy,
                      imputer_kwargs: Optional[Dict[str, Any]] = None,
                      n_feature_splits: Optional[int] = None,
                      n_features: Optional[Iterable[int]] = None,
                      n_epochs: int | Iterable[int] = 100,
                      verbose: bool = False):
    data, _ = prep_dataset(dataset, imputer_strategy, imputer_kwargs)
    train_n_features = data["x_train"].shape[1]
    
    if n_features is None:
        if n_feature_splits is None:
            n_feature_splits = 5

        n_features = [round(k * train_n_features / (n_feature_splits + 1))
                      for k
                      in range(1, n_feature_splits + 1)]
    elif any(not (0 < k <= train_n_features) for k in n_features):
            raise ValueError("Invalid number of features")
    
    if isinstance(n_epochs, int):
        n_epochs = [n_epochs]
    
    selectors = (
        ("k_best",
         [{"k": k} for k in n_features]),
        ("iterative",
         [
             {
                 "k": k,
                 "x_valid": data["x_valid"],
                 "y_valid": data["y_valid"],
                 "epochs": epochs,
                 "verbose": verbose,
             }
             for k, epochs
             in product(n_features, n_epochs)
         ])
    )
    
    for key, kwargs in selectors:
        for kw in kwargs:
            selector = ModSelectKBest(**kw) if key == "k_best" else IterativeSelect(**kw)
            
            name, score = evaluate(
                data=data,
                model_steps=[("selector", selector), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"{key}_{kw.get('k', 'X')}",
                verbose=verbose
            )
            yield name, score


In [41]:
feat_sel_results = feature_selection(d,
                                     MODEL_STEPS,
                                     imputer_strategy=IMPUTER_STRATEGY,
                                     imputer_kwargs=IMPUTER_KWARGS,
                                     n_feature_splits=1,
                                     n_epochs=1,
                                     verbose=False)

print(f"{base_name:<16}: {base_f1_score:.4f}", flush=True)
for sel_name, f1_score in feat_sel_results:
    print(f"{sel_name:<16}: {f1_score:.4f}", flush=True)

baseline        : 0.4582
k_best_38       : 0.4600




iterative_38    : 0.4602


In [7]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from hiclass.metrics import f1
from feature_selection import fill_reshape
from scipy import stats


def add_noise_to_train(train_df, percentage):
    noisy_train = train_df.copy()
    numeric_columns = train_df.select_dtypes(include=[np.number]).columns
    noise = (train_df[numeric_columns].max() - train_df[numeric_columns].min()) * (percentage / 100)
    noisy_train[numeric_columns] += noise
    return noisy_train

class NoisePreprocessor(TransformerMixin, BaseEstimator):
    def __init__(self, scaling_method='standard', outlier_method='zscore', outlier_threshold=3.0):
        self.scaling_method = scaling_method
        self.outlier_method = outlier_method
        self.outlier_threshold = outlier_threshold
        self.scaler_ = None

    def fit(self, X, y=None):
        if self.outlier_method in ['zscore', 'iqr', 'clip', 'mean']:
            X_clean, _ = self._remove_outliers(X, y)
        else:
            X_clean = X
        
        return self

    def transform(self, X, y=None):
        X = X.copy()

        if self.outlier_method in ['zscore', 'iqr', 'clip', 'mean']:
            X, y = self._remove_outliers(X, y)
        
        if y is not None:
            return X, y
        return X

    def _remove_outliers(self, df, y=None):
        if self.outlier_method == 'zscore':
            return self._remove_outliers_zscore(df, y)
        elif self.outlier_method == 'iqr':
            return self._remove_outliers_iqr(df, y)
        elif self.outlier_method == 'clip':
            return self._clip_outliers(df, y)
        elif self.outlier_method == 'mean':
            return self._replace_outliers_with_mean(df, y)
        else:
            raise ValueError(f"Unknown outlier method: {self.outlier_method}")

    def _remove_outliers_zscore(self, df, y=None):
        numeric_cols = df.select_dtypes(include=[np.number])
        z_scores = np.abs(stats.zscore(numeric_cols))
        filtered_entries = (z_scores < self.outlier_threshold).all(axis=1)
        if y is not None:
            return df.loc[filtered_entries], y.loc[filtered_entries]
        return df.loc[filtered_entries], y

    def _remove_outliers_iqr(self, df, y=None):
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        filter = (df >= (Q1 - 1.5 * IQR)) & (df <= (Q3 + 1.5 * IQR))
        filtered_entries = filter.all(axis=1)
        if y is not None:
            return df.loc[filtered_entries], y.loc[filtered_entries]
        return df.loc[filtered_entries], y

    def _clip_outliers(self, df, y=None, lower_percentile=0.01, upper_percentile=0.99):
        lower_bound = df.quantile(lower_percentile)
        upper_bound = df.quantile(upper_percentile)
        df_clipped = df.clip(lower=lower_bound, upper=upper_bound, axis=1)
        return df_clipped, y

    def _replace_outliers_with_mean(self, df, y=None):
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            mean = df[col].mean()
            std = df[col].std()
            outliers = np.abs((df[col] - mean) / std) > self.outlier_threshold
            df.loc[outliers, col] = mean
        return df, y

    def _scale_data(self, df):
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df.loc[:, numeric_cols] = self.scaler_.transform(df[numeric_cols])
        return df


In [8]:
def noisy_data(dataset: Dataset,
                      model_steps: List[Tuple[str, BaseEstimator]],
                      imputer_strategy: ImputerStrategy,
                      imputer_kwargs: Optional[Dict[str, Any]] = None,
                      verbose: bool = False):
    for i in range(-10, 11):
        if i == 0:
            continue
        data, _ = prep_dataset(
                    dataset,
                    imputer_strategy,
                    imputer_kwargs,
                    x_train_prep=lambda x: add_noise_to_train(x, percentage=i)
                )
        
        args = [
            {'outlier_method': 'mean',
             'outlier_threshold': 3.0,
             'scaling_method': None
            }
        ]
        
        for kw in args:
            preprocessor = NoisePreprocessor(**kw)
            
            name, score = evaluate(
                data=data,
                model_steps=[("preprocessor", preprocessor), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"numeric {i}%",
                verbose=verbose
            )
            yield name, score

In [9]:
feat_sel_results = noisy_data(d,
                             MODEL_STEPS,
                             imputer_strategy=IMPUTER_STRATEGY,
                             imputer_kwargs=IMPUTER_KWARGS,
                             verbose=False)

print(f"{base_name:<16}: {base_f1_score:.4f}", flush=True)
for sel_name, f1_score in feat_sel_results:
    print(f"{sel_name:<16}: {f1_score:.4f}", flush=True)

baseline        : 0.4073
numeric -10%    : 0.3602
numeric -9%     : 0.3630
numeric -8%     : 0.3649
numeric -7%     : 0.3674



KeyboardInterrupt

