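This notebook presents an example in which `hisel` performs an exact selection, whereas marginal methods such as [sklearn.feature_selection.mutual_info_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html) do not. The target is `y = 1` exactly when `x0 == x1`, so it depends on the two features jointly, while each of them taken on its own is independent of `y`.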

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
from sklearn.metrics import adjusted_mutual_info_score


from hisel import select, hsic, categorical

In [2]:
# Experiment configuration.
k = 5     # number of categories of the two informative features x0 and x1
n = 2000  # number of samples
d = 20    # total number of features: 2 informative ones plus d-2 noise features

# Seed NumPy's global RNG so that Restart & Run All reproduces the same
# np.random draws (synthetic data, column permutation, random comparison subset).
seed = 0
np.random.seed(seed)

In [3]:
# Two informative categorical features, each drawn uniformly from {0, ..., k-1}.
x0 = np.random.randint(k, size=(n, 1))
x1 = np.random.randint(k, size=(n, 1))

# d-2 noise features; feature j takes values in {0, ..., ms[j]-1}, with ms[j] drawn from [2, 20).
ms = np.random.randint(low=2, high=20, size=d - 2)
others = [np.random.choice(m, size=(n, 1)) for m in ms]
all_ = np.concatenate([x0, x1, *others], axis=1)

# The target is 1 exactly when x0 and x1 agree, so it depends on the pair jointly.
# (An alternative multi-class target would be `k + x0 - x1`.)
y = np.asarray(x0 == x1, dtype=int)

# Shuffle the columns with a random permutation matrix so that the informative
# features do not sit at the fixed positions 0 and 1.
permuter = np.random.permutation(np.eye(d, dtype=int).T).T
x = np.array(all_ @ permuter, dtype=int)

# Row i of `permuter` has its single 1 in the column of `x` that receives column i of `all_`.
# Cast to built-in int so the indices print as plain numbers: NumPy >= 2 renders
# NumPy scalars inside a list as `np.int64(...)`.
expected_features = [
    int(np.argmax(permuter[0, :])),
    int(np.argmax(permuter[1, :])),
]

In [4]:
# Sanity check: the columns of `x` at the expected indices are exactly x0 and x1.
assert np.array_equal(x[:, expected_features[0]], x0[:, 0])
assert np.array_equal(x[:, expected_features[1]], x1[:, 0])

In [5]:
# Optional visual check of the target against the difference x0 - x1 (left disabled).
# sns.scatterplot(x = x0[:, 0] - x1[:, 0], y = y[:, 0])

In [6]:
# Wrap the arrays in pandas objects for the selection routines.
# Column `x<i>` is column i of the permuted matrix `x` (not the arrays x0 / x1 above).
xdf = pd.DataFrame(data=x, columns=['x' + str(i) for i in range(d)])
ydf = pd.Series(data=y[:, 0], name='y')

### Selection with marginal 1D KSG mutual information

Under the hood, this method calls 
[sklearn.feature_selection.mutual_info_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html).

In [7]:
# Marginal mutual-information selection.
# NOTE(review): the exact meaning of `threshold` is defined by hisel's select.ksgmi — confirm in its docs.
ksgselection, mis = select.ksgmi(
    xdf,
    ydf,
    threshold=0.01,
)
# Selected names look like 'x<i>'; keep the integer that follows the last 'x'.
ksg_selection = [int(name.rpartition('x')[2]) for name in ksgselection]

ksg-mi preprocessing: 20 features are pre-selected


In [8]:
# Compare the ground-truth indices with the marginal KSG selection.
print(
    f'Expected features: {sorted(expected_features)}',
    f'Marginal KSG selection: {sorted(ksg_selection)}',
    sep='\n',
)

Expected features: [4, 19]
Marginal KSG selection: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


## Selection with HISEL

In [9]:
# Run HISEL's categorical selection on the same data.
# NOTE(review): num_permutations / max_iter / parallel are hisel options — see its docs for their semantics.
results = categorical.select(
    xdf,
    ydf,
    num_permutations=80,
    max_iter=1,
    parallel=True,
)
# Presumably the positional indices of the selected columns (they are compared
# with `expected_features` in the next cell).
hisel_selection = [*results.indexes]

Number of categorical features: 20


100%|█████████████████████████████████████| 152/152 [00:00<00:00, 472247.56it/s]


Number of selected categorical features: 2


In [10]:
# Compare the ground-truth indices with the HISEL selection.
print(
    f'Expected features: {sorted(expected_features)}',
    f'HISEL selection: {sorted(hisel_selection)}',
    sep='\n',
)

Expected features: [4, 19]
HISEL selection: [4, 19]


### Confirm that HSIC_b assigns a higher dependence to the correct selection than to a random one

In [11]:
# HSIC_b between the two informative columns and the target, scaled by n^2.
correct_dependence = n ** 2 * hsic.hsic_b(x[:, list(expected_features)], y)

# Baseline: a random subset of between 1 and d-1 distinct columns.
nsel = np.random.randint(low=1, high=d)
random_selection = np.random.choice(np.arange(d), size=nsel, replace=False)
random_dependence = n ** 2 * hsic.hsic_b(x[:, list(random_selection)], y)

In [12]:
# Report both scaled HSIC estimates side by side.
print(
    f'HSIC-estimated dependence between correct selection and target: {correct_dependence}',
    f'HSIC-estimated dependence between random selection and target: {random_dependence}',
    sep='\n',
)

HSIC-estimated dependence between correct selection and target: 1.0000000000000002
HSIC-estimated dependence between random selection and target: 0.25537749584046077
