In [1]:
from load import Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
d = Dataset("cellcycle", nan_strategy="none")

In [3]:
from typing import Union, Literal
from feature_selection import ModSelectKBest, IterativeSelect
from sklearn.tree import DecisionTreeClassifier
from hiclass import MultiLabelLocalClassifierPerNode

FeatureSelector = Union[ModSelectKBest, IterativeSelect]
# Can't be imported for some reason
ImputerStrategy = Union[Literal["drop"],
                        Literal["knn"],
                        Literal["mean"],
                        Literal["median"],
                        Literal["most_frequent"],
                        Literal["constant"]]

IMPUTER_STRATEGY = "mean"
IMPUTER_KWARGS = {}
MODEL_STEPS = [
    ("model", MultiLabelLocalClassifierPerNode(DecisionTreeClassifier())),
]

In [5]:
from typing import Any, Callable, Dict, Optional, Tuple

from feature_selection import fill_reshape
from handle_nan import NumericImputer


def prep_dataset(dataset: Dataset,
                 imputer_strategy: ImputerStrategy,
                 imputer_kwargs: Optional[Dict[str, Any]] = None,
                 x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
                 ) -> Tuple[Dict[str, pd.DataFrame],
                            NumericImputer]:
    imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))
    data: Dict[str, pd.DataFrame] = {}

    data["x_train"] = dataset.x_train()
    if x_train_prep is not None:
        data["x_train"] = x_train_prep(data["x_train"])
    data["y_train"] = dataset.y_train()
    imputer.fit(data["x_train"], data["y_train"])

    data["x_valid"] = dataset.x_valid()
    data["y_valid"] = dataset.y_valid()
    data["x_valid"] = imputer.transform(data["x_valid"])
    
    data["x_test"] = dataset.x_test()
    data["y_test"] = dataset.y_test()
    data["y_test_reshaped"] = fill_reshape(data["y_test"])
    return data, imputer

In [6]:
from typing import Any, Callable, Dict, List, Optional, Tuple
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from hiclass.metrics import f1
from handle_nan import NumericImputer
from feature_selection import fill_reshape

def evaluate(dataset: Optional[Dataset] = None,
             data: Optional[Dict[str, pd.DataFrame]] = None,
             model_steps: List[Tuple[str, BaseEstimator]] = [],
             imputer_strategy: ImputerStrategy = "mean",
             imputer_kwargs: Optional[Dict[str, Any]] = None,
             x_train_prep: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
             name: str = "baseline",
             verbose: bool = True,
             verbose_pipe: bool = False):
    if data is None:
        if dataset is None:
            raise ValueError("Either dataset or data must be provided")
        data, imputer = prep_dataset(dataset,
                                     imputer_strategy,
                                     imputer_kwargs,
                                     x_train_prep)
    else:
        imputer = NumericImputer(strategy=imputer_strategy, **(imputer_kwargs or {}))

    pipeline = Pipeline([
        ("imputer", imputer),
        *model_steps,
    ], verbose=verbose_pipe, memory="cache")
    
    pipeline.fit(data["x_train"], data["y_train"])
    y_pred = pipeline.predict(data["x_test"])
    micro_score = f1(data["y_test_reshaped"], fill_reshape(y_pred), "micro")
    macro_score = f1(data["y_test_reshaped"], fill_reshape(y_pred), "macro")
    if verbose:
        print(f"{name}: {micro_score:.4f} | {macro_score:.4f}", flush=True)
    return name, micro_score, macro_score
    

In [7]:
base_name, base_micro_score, base_macro_score = evaluate(
    dataset=d,
    model_steps=MODEL_STEPS,
    imputer_strategy="mean",
    verbose=False,
)
print(f"{base_name}: {base_micro_score:.4f} | {base_macro_score:.4f}", flush=True)


baseline: 0.4629 | 0.4465


In [8]:
from typing import Any, Dict, Iterable, Optional
from feature_selection import ModSelectKBest, IterativeSelect
from itertools import product
from pprint import PrettyPrinter


pp = PrettyPrinter(indent=4)


def feature_selection(dataset: Dataset,
                      model_steps: List[Tuple[str, BaseEstimator]],
                      imputer_strategy: ImputerStrategy,
                      imputer_kwargs: Optional[Dict[str, Any]] = None,
                      n_feature_splits: Optional[int] = None,
                      n_features: Optional[Iterable[int]] = None,
                      n_epochs: int | Iterable[int] = 100,
                      verbose: bool = True,
                      verbose_pipe: bool = False):
    data, _ = prep_dataset(dataset, imputer_strategy, imputer_kwargs)
    train_n_features = data["x_train"].shape[1]
    
    if n_features is None:
        if n_feature_splits is None:
            n_feature_splits = 5

        n_features = [round(k * train_n_features / (n_feature_splits + 1))
                      for k
                      in range(1, n_feature_splits + 1)]
    elif any(not (0 < k <= train_n_features) for k in n_features):
            raise ValueError("Invalid number of features")
    
    if isinstance(n_epochs, int):
        n_epochs = [n_epochs]
    
    selectors = (
        ("Select K-Best",
         [{"k": k} for k in n_features]),
        ("Iterative Random Select",
         [
             {
                 "k": k,
                 "x_valid": data["x_valid"],
                 "y_valid": data["y_valid"],
                 "epochs": epochs,
                 "verbose": verbose,
             }
             for k, epochs
             in product(n_features, n_epochs)
         ])
    )
    
    for key, kwargs in selectors:
        for kw in kwargs:
            selector = (ModSelectKBest(**kw)
                        if key == "Select K-Best"
                        else IterativeSelect(**kw))
            
            name, micro_score, macro_score = evaluate(
                data=data,
                model_steps=[("selector", selector), *model_steps],
                imputer_strategy=imputer_strategy,
                imputer_kwargs=imputer_kwargs,
                name=f"{key} ({kw.get('k', 'X')})",
                verbose=verbose,
                verbose_pipe=verbose_pipe
            )
            yield name, micro_score, macro_score


In [9]:
feat_sel_results = feature_selection(d,
                                     MODEL_STEPS,
                                     imputer_strategy=IMPUTER_STRATEGY,
                                     imputer_kwargs=IMPUTER_KWARGS,
                                     n_feature_splits=1,
                                     n_epochs=1,
                                     verbose=False)

print(f"{base_name:<20}: {base_micro_score:.4f} | {base_macro_score:.4f}", flush=True)
for sel_name, sel_micro_score, sel_macro_score in feat_sel_results:
    print(f"{sel_name:<20}: {sel_micro_score:.4f} | {sel_micro_score:.4f}", flush=True)

baseline            : 0.4629 | 0.4465
Select K-Best (38)  : 0.4519 | 0.4519
