In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
from sklearn.metrics import adjusted_mutual_info_score


from hisel import select, hsic
from hisel.select import FeatureType, HSICSelector as Selector

In [None]:
# Dimensions of the synthetic benchmark used throughout the notebook.
n = 10_000  # number of samples (rows) drawn for every feature
d = 30  # total number of feature columns, relevant plus distractors
k = 5  # number of categories of each of the two relevant features

In [None]:
# Build a synthetic classification problem in which the target depends on
# exactly two of the `d` discrete features.

# The two relevant features: each uniform over {0, ..., k-1}, shape (n, 1).
x0 = np.random.randint(k, size=(n, 1))
x1 = np.random.randint(k, size=(n, 1))

# The d-2 distractors: column j is uniform over {0, ..., ms[j]-1}, where the
# number of categories ms[j] is itself drawn from {2, ..., 19}.
ms = np.random.randint(low=2, high=20, size=d - 2)
others = [np.random.choice(m, size=(n, 1)) for m in ms]
all_ = np.concatenate(
    [x0, x1] + others,
    axis=1
)

# Binary target: 1 where the two relevant features coincide, 0 otherwise.
y = np.asarray(x0 == x1, dtype=int)

# Shuffle the columns with a random permutation matrix so that the relevant
# features do not sit at the fixed positions 0 and 1.
permuter = np.random.permutation(np.eye(d, dtype=int).T).T
x = np.array(all_ @ permuter, dtype=int)

# Row i of `permuter` has its single 1 in the column of `x` that receives
# original column i.  Store plain Python ints so that printing the list shows
# `[3, 17]` rather than `[np.int64(3), np.int64(17)]` under NumPy >= 2.
expected_features = [
    int(np.argmax(permuter[0, :])),
    int(np.argmax(permuter[1, :])),
]

In [None]:
# Sanity check: the columns recorded in `expected_features` must reproduce the
# two relevant features exactly after the column shuffle.
assert (x[:, expected_features[0]] == x0[:, 0]).all()
assert (x[:, expected_features[1]] == x1[:, 0]).all()

In [None]:
# Plot the target against the difference of the two relevant features.
sns.scatterplot(
    x=(x0 - x1)[:, 0],
    y=y[:, 0],
)

In [None]:
# Wrap the arrays in pandas containers: one named column per feature
# ('x0', 'x1', ...) and a named series for the target.
xdf = pd.DataFrame(
    data=x,
    columns=[f'x{i}' for i in range(d)],
)
ydf = pd.Series(data=y[:, 0], name='y')

### Selection with marginal 1D KSG mutual information

In [None]:
# Marginal screening with hisel's `select.ksgmi`.
# NOTE(review): presumably each feature is scored on its own against the
# target and kept when its score exceeds `threshold` -- confirm against hisel.
ksgselection, mis = select.ksgmi(
    xdf,
    ydf,
    threshold=0.05,
)

In [None]:
# Display the second value returned by `select.ksgmi` (presumably the
# per-feature mutual-information estimates -- confirm against hisel).
mis

In [None]:
# Compare the ground-truth column indices with the marginal KSG selection.
print(f'Expected features: {sorted(expected_features)}')
print(f'Marginal KSG selection: {sorted(ksgselection)}')

In [None]:
# `mis` is displayed a second time here, directly after the printed selection.
mis

### Selection with HSIC Lasso

In [None]:
# Build the HSIC selector on the raw arrays, tagging both the features and the
# target with `FeatureType.DISCR` (presumably "discrete" -- confirm in hisel).
selector = Selector(
    x,
    y,
    xfeattype=FeatureType.DISCR,
    yfeattype=FeatureType.DISCR,
)

In [None]:
# Settings for the HSIC Lasso run below.
device = None  # run on CPU
threshold = 0.0  # NOTE(review): not passed to `selector.select` in this notebook
number_of_epochs = 3
minibatch_size = 200
batch_size = n // 10  # one tenth of the number of samples

In [None]:
# Ask the HSIC selector for two features, using the settings defined above.
hsiclasso_selection = selector.select(
    number_of_features=2,
    device=device,
    number_of_epochs=number_of_epochs,
    minibatch_size=minibatch_size,
    batch_size=batch_size,
)

In [None]:
# Compare the ground-truth column indices with the HSIC Lasso selection.
print(f'Expected features: {sorted(expected_features)}')
print(f'HSIC Lasso selection: {sorted(hsiclasso_selection)}')

### Confirm that HSIC_b correctly assigns highest dependence to the correct selection

In [None]:
# HSIC_b estimate, scaled by n^2, between the ground-truth pair of columns and
# the target.
correct_dependence = n ** 2 * hsic.hsic_b(x[:, expected_features], y)

# Baseline: the same quantity for a random subset of distinct columns whose
# size is drawn uniformly from {1, ..., d-1}.
nsel = np.random.randint(low=1, high=d)
random_selection = np.random.choice(np.arange(d), size=nsel, replace=False)
random_dependence = n ** 2 * hsic.hsic_b(x[:, random_selection], y)

In [None]:
# Report the two scaled HSIC_b estimates computed above.
print(f'HSIC-estimated dependence between correct selection and target: {correct_dependence}')
print(f'HSIC-estimated dependence between random selection and target: {random_dependence}')

### Selection with 2D discrete mutual information

In [None]:
def onedimlabel(x):
    """Collapse each row of a 2-D array of category codes into one label.

    The columns are read as the digits of a mixed-radix number, so two rows
    receive the same label exactly when they agree in every column.

    Parameters
    ----------
    x : numpy.ndarray
        Two-dimensional array of non-negative integer codes, one row per
        sample and one column per variable.  It is not modified.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding one label per row, with the dtype of
        `x`.  The labels are unique per distinct row only as long as they fit
        in that dtype.
    """
    assert x.ndim == 2
    ns = np.amax(x, axis=0)
    res = np.array(x[:, 0], copy=True)
    m = 1
    for i in range(1, x.shape[1]):
        # The place value of column i is the product of the radices of all
        # earlier columns.  Each radix is 1 + max(1, ns[j]), strictly larger
        # than every code in column j, which is what makes the encoding
        # collision-free.  (Multiplying the maxima first and adding 1
        # afterwards is only correct for two columns.)
        m *= 1 + max(1, ns[i - 1])
        res += m * x[:, i]
    return res

In [None]:
# Score every subset of `l` columns: collapse the chosen columns into a single
# joint label and take its adjusted mutual information with the target.
l = 2
miscores = {
    subset: adjusted_mutual_info_score(
        onedimlabel(x[:, list(subset)]),
        y[:, 0],
    )
    for subset in itertools.combinations(range(d), l)
}

In [None]:
# Pick the column pair with the largest score in `miscores`.  The defaults keep
# the pair (0, 1) when no score is strictly positive.
s = (0, 1)
mi = 0
# The loop variables must not be named `k`: that name holds the number of
# categories set at the top of the notebook, and rebinding it to a tuple would
# break any later re-run of the data-generation cell.
for subset, score in miscores.items():
    if score > mi:
        s = subset
        mi = score
twod_mi_selection = s

In [None]:
# Compare the ground-truth column indices with the best-scoring column pair.
print(f'Expected features: {sorted(expected_features)}')
print(f'2D discrete MI selection: {sorted(twod_mi_selection)}')

### Selection with Boruta

In [None]:
from arfs.feature_selection import allrelevant
from arfs.feature_selection.allrelevant import Leshy
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyper-parameters forwarded by keyword to the Leshy (Boruta) selector below.
# NOTE(review): their exact meaning is defined by arfs -- confirm in its docs.
importance = "shap"
n_estimators = 'auto'
max_iter = 100
perc = 95
alpha = 0.05
two_step = True
keep_weak = False
random_state = None  # no seed is supplied
verbose = 0

In [None]:
# Rebuild the feature frame with column names 'f0', 'f1', ...; the 'f' prefix
# is stripped again when the selected names are turned back into indices.
xdf = pd.DataFrame(
    data=x,
    columns=[f'f{i}' for i in range(d)],
)
yser = pd.Series(data=y[:, 0], name='y')

In [None]:
# Base estimator handed to the Boruta selector: a random forest with trees
# capped at depth 8, with n_jobs=-1 (all available cores in scikit-learn).
rf = RandomForestClassifier(
    max_depth=8,
    n_jobs=-1,
)

In [None]:
# Configure the Leshy (Boruta) selector around the random forest, passing every
# hyper-parameter defined above by keyword.
leshy = Leshy(
    rf,
    n_estimators=n_estimators,
    perc=perc,
    alpha=alpha,
    importance=importance,
    two_step=two_step,
    max_iter=max_iter,
    random_state=random_state,
    verbose=verbose,
    keep_weak=keep_weak,
)

In [None]:
# Run the selector, then map the selected column names ('f<i>') back to their
# integer column indices.
leshy.fit(xdf, yser)
leshy_selection = [
    int(name.replace('f', ''))
    for name in leshy.selected_features_
]

In [None]:
# Compare the ground-truth column indices with the Boruta (Leshy) selection.
print(f'Expected features: {sorted(expected_features)}')
print(f'Boruta selection: {sorted(leshy_selection)}')