### Feature Selection

Imports

In [1]:
import numpy as np
import pandas as pd
import sys
from typing import Tuple, Union
sys.path.append('..')
from Aula1.Dataset import Dataset
sys.path.remove('..')
from typing import Callable
from scipy import stats
from f_classification import FClassification
from f_regression import FRegression

Get the data

In [2]:
def read_csv(filename: str,
             sep: str = ',',
             features: bool = False,
             label: bool = False) -> Dataset:
    """
    Read a delimited text file (data file) into a Dataset object.

    The file is parsed with ``pandas.read_csv``; any feature/label names are
    taken from the column names of the resulting DataFrame.

    Parameters
    ----------
    filename: str
        Path of the file, forwarded to ``pandas.read_csv``.
    sep: str
        Column separator, forwarded to ``pandas.read_csv``.
    features: bool
        When true, the column names are forwarded to the Dataset as
        feature names; otherwise ``features=None`` is passed.
    label: bool
        When true, the last column is split off as ``y``; otherwise every
        column is kept in ``X`` and ``y`` is None.

    Returns
    -------
    Dataset
        Built as ``Dataset(X, y, features=..., label=...)``.
    """
    data = pd.read_csv(filename, sep=sep)

    # Split the values: the last column is the target only when requested.
    if label:
        X = data.iloc[:, :-1].to_numpy()
        y = data.iloc[:, -1].to_numpy()
    else:
        X = data.to_numpy()
        y = None

    # Resolve the names separately from the boolean flags so that a flag is
    # never handed to Dataset in place of a name.
    if features and label:
        feature_names = data.columns[:-1]
        label_name = data.columns[-1]
    elif features:
        feature_names = data.columns
        # No label column was requested: pass None (previously the boolean
        # flag value False leaked through as the label name).
        label_name = None
    else:
        # Without feature names the label name is not forwarded either
        # (behaviour kept from the original implementation).
        feature_names = None
        label_name = None

    return Dataset(X, y, features=feature_names, label=label_name)

In [3]:
# Datasets
# Iris data loaded from CSV: with features=True and label=True the column
# names become the feature names and the last column is split off as y.
iris_dataset = read_csv('../../datasets/iris.csv', features=True, label=True)

# Dataset produced by Dataset.from_random.
# NOTE(review): the positional arguments (100, 10, 5) are presumably
# n_samples, n_features, n_classes -- confirm against Dataset.from_random.
random_dataset = Dataset.from_random(100, 10,5)
# Overwrite column index 2 with zeros so that column is constant (zero
# variance); presumably a fixture for VarianceThreshold -- confirm.
random_dataset.X[:, 2] = 0

### Variance Threshold

In [4]:
import numpy as np
import sys
sys.path.append('..')
from Aula1.Dataset import Dataset


class VarianceThreshold:
    """
    Feature selection by variance.

    Features whose variance in the fitted (training) dataset is not strictly
    greater than ``threshold`` are removed from datasets passed to
    ``transform``.
    """

    def __init__(self, threshold: float = 0.0):
        """
        Create the selector.

        Parameters
        ----------
        threshold: float
            Non-negative cut-off; only features with variance strictly
            greater than this value are kept.

        Raises
        ------
        ValueError
            If ``threshold`` is negative.
        """
        if threshold < 0:
            raise ValueError("Threshold must be non-negative")

        # parameters
        self.threshold = threshold

        # attributes
        # Per-feature variances; stays None until fit() is called.
        self.variance = None

    def fit(self, dataset: 'Dataset') -> 'VarianceThreshold':
        """
        Compute the variance of every column of ``dataset.X``.

        Returns
        -------
        VarianceThreshold
            The fitted selector itself, so calls can be chained.
        """
        self.variance = np.var(dataset.X, axis=0)
        return self

    def transform(self, dataset: 'Dataset') -> 'Dataset':
        """
        Remove the features whose fitted variance is not above the threshold.

        Returns
        -------
        Dataset
            New Dataset with the kept columns of ``dataset.X``, the matching
            feature names (or None when the input has none), and the
            original ``y`` and ``label``.

        Raises
        ------
        ValueError
            If the selector has not been fitted yet.
        """
        # Fail with a clear message instead of an obscure None comparison.
        if self.variance is None:
            raise ValueError("VarianceThreshold must be fitted before calling transform")

        features_mask = self.variance > self.threshold
        X = dataset.X[:, features_mask]

        # Feature names may be absent; only filter them when present.
        features = dataset.features
        if features is not None:
            features = list(np.array(features)[features_mask])

        return Dataset(X=X, y=dataset.y, features=features, label=dataset.label)

    def fit_transform(self, dataset: 'Dataset') -> 'Dataset':
        """
        Fit on ``dataset`` and then transform the same dataset.
        """
        self.fit(dataset)
        return self.transform(dataset)

#### Test: VarianceThreshold

In [5]:
# Keep only the iris features whose variance is above 0.5.
var_threshold = VarianceThreshold(threshold=0.5)
# fit() returns the selector itself, so transform() can be chained onto it.
transformed_dataset = var_threshold.fit(iris_dataset).transform(iris_dataset)
transformed_dataset.shape()

(150, 3)

In [6]:
# Display the feature matrix of the reduced dataset.
transformed_dataset.X

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1],
       [5.4, 1.5, 0.2],
       [4.8, 1.6, 0.2],
       [4.8, 1.4, 0.1],
       [4.3, 1.1, 0.1],
       [5.8, 1.2, 0.2],
       [5.7, 1.5, 0.4],
       [5.4, 1.3, 0.4],
       [5.1, 1.4, 0.3],
       [5.7, 1.7, 0.3],
       [5.1, 1.5, 0.3],
       [5.4, 1.7, 0.2],
       [5.1, 1.5, 0.4],
       [4.6, 1. , 0.2],
       [5.1, 1.7, 0.5],
       [4.8, 1.9, 0.2],
       [5. , 1.6, 0.2],
       [5. , 1.6, 0.4],
       [5.2, 1.5, 0.2],
       [5.2, 1.4, 0.2],
       [4.7, 1.6, 0.2],
       [4.8, 1.6, 0.2],
       [5.4, 1.5, 0.4],
       [5.2, 1.5, 0.1],
       [5.5, 1.4, 0.2],
       [4.9, 1.5, 0.1],
       [5. , 1.2, 0.2],
       [5.5, 1.3, 0.2],
       [4.9, 1.5, 0.1],
       [4.4, 1.3, 0.2],
       [5.1, 1.5, 0.2],
       [5. , 1.3, 0.3],
       [4.5, 1.3

### Select K Best

In [7]:
class SelectKBest:
    """
    Select the features with the k highest scores.

    Features are ranked by the scores that a `scoring function` computes for
    each of them:
        - f_classification: ANOVA F-value between label/feature for classification tasks.
        - f_regression: F-value obtained from F-value of r's pearson correlation coefficients for regression tasks.
    """

    def __init__(self, score_func: Callable = FClassification.fit_transform, k: int = 10):
        """
        Create the selector.

        Parameters
        ----------
        score_func: Callable
            Called as ``score_func(dataset)``; must return a pair ``(F, p)``.
            NOTE(review): the default is an attribute looked up on the
            FClassification class itself -- confirm it can be called with
            just a dataset.
        k: int
            Number of top-scoring features to keep; must be at least 1.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        # With k == 0 the slice [-k:] would keep every feature, and a
        # negative k would keep an arbitrary tail, so reject both up front.
        if k < 1:
            raise ValueError("k must be a positive integer")

        self.k = k
        self.score_func = score_func
        # Scores and p-values; both stay None until fit() is called.
        self.F = None
        self.p = None

    def fit(self, dataset: Dataset) -> 'SelectKBest':
        """
        Compute and store the scores (``F``) and p-values (``p``) of ``dataset``.

        Returns
        -------
        SelectKBest
            The fitted selector itself, so calls can be chained.
        """
        self.F, self.p = self.score_func(dataset)
        return self

    def transform(self, dataset: Dataset) -> Dataset:
        """
        Keep the k features with the highest fitted scores.

        Returns
        -------
        Dataset
            New Dataset with the selected columns of ``dataset.X`` in
            ascending score order, the matching feature names (or None when
            the input has none), and the original ``y`` and ``label``.

        Raises
        ------
        ValueError
            If the selector has not been fitted yet.
        """
        if self.F is None:
            raise ValueError("SelectKBest must be fitted before calling transform")

        # argsort is ascending, so the last k positions hold the k largest scores.
        idxs = np.argsort(self.F)[-self.k:]

        # Feature names may be absent; only select them when present.
        features = dataset.features
        if features is not None:
            features = list(np.array(features)[idxs])

        return Dataset(X=dataset.X[:, idxs], y=dataset.y, features=features, label=dataset.label)

    def fit_transform(self, dataset: Dataset) -> Dataset:
        """
        Fit on ``dataset`` and then transform the same dataset.
        """
        self.fit(dataset)
        return self.transform(dataset)
    

#### Test: Select K Best

In [8]:
def f_classification(dataset: Dataset) -> Union[Tuple[np.ndarray, np.ndarray],
                                                Tuple[float, float]]:
    """
    Score the features of a classification dataset with one-way ANOVA.

    The rows of ``dataset.X`` are split into one group per class returned by
    ``dataset.get_classes()`` (rows where ``dataset.y`` equals that class) and
    the groups are handed to ``scipy.stats.f_oneway``. The F-value tells
    whether the group means differ significantly.

    Returns the pair ``(F, p)`` produced by ``scipy.stats.f_oneway``.
    """
    samples_by_class = []
    for class_value in dataset.get_classes():
        # Boolean mask picks the samples that belong to this class.
        belongs_to_class = dataset.y == class_value
        samples_by_class.append(dataset.X[belongs_to_class])

    F, p = stats.f_oneway(*samples_by_class)
    return F, p

In [9]:
# Select K Best with f_classification
# Rank the iris features with the f_classification scores and keep the 2
# highest-scoring ones; the trailing expression displays their names.
k_best = SelectKBest(score_func=f_classification, k=2)
k_best.fit_transform(iris_dataset).features

['petal_width', 'petal_length']