In [None]:
import numpy as np
import pandas as pd
from scipy.stats import special_ortho_group
import matplotlib.pyplot as plt

from hisel import select

In [None]:
# Problem dimensions of the synthetic feature-selection benchmark.
n = 5000        # number of samples
n_cat = 10      # total number of categorical features
n_cont = 30     # total number of continuous features
n_relcat = 2    # categorical features that enter the target
n_relcont = 8   # continuous features that enter the target
dim_y = 1       # dimension of the target

# The relevant features are built as a subset of the generated ones, so the
# relevant counts cannot exceed the totals.
assert 0 <= n_relcat <= n_cat, 'n_relcat must not exceed n_cat'
assert 0 <= n_relcont <= n_cont, 'n_relcont must not exceed n_cont'

# Seed NumPy's global generator so that "Restart & Run All" regenerates the
# same synthetic dataset. The data cells draw from `np.random.*`, and
# `special_ortho_group.rvs` falls back to the same global generator when no
# `random_state` is passed.
# NOTE(review): randomness inside `hisel` itself may not be covered - confirm.
seed = 0
np.random.seed(seed)

In [None]:
# Number of levels of each categorical feature, drawn from {5, 6, 7}.
ms = np.random.randint(5, 8, size=n_cat)
# One integer-coded column of shape (n, 1) per feature, with codes in [0, m).
cats = [np.random.randint(0, m, size=(n, 1)) for m in ms]
# Stack the columns side by side: (n, n_cat).
cat = np.hstack(cats)
# Trailing singleton axis, (n, n_cat, 1), for the batched matmul used later.
cat_ = cat[:, :, np.newaxis]
catdf = pd.DataFrame(cat, columns=[f'cat{j}' for j in range(n_cat)])

In [None]:
# Random +/-1 sign for each relevant categorical feature.
cat_signs = np.random.choice([-1, 1], size=n_relcat)
# [diag(signs) | 0]: an (n_relcat, n_cat) integer matrix whose non-zero
# columns are, for now, the first n_relcat ones.
cat_loading = np.hstack(
    (np.diag(cat_signs), np.zeros((n_relcat, n_cat - n_relcat), dtype=int))
)
# `permutation` shuffles along the first axis, so transpose, shuffle and
# transpose back in order to shuffle the columns.
acat = np.random.permutation(cat_loading.T).T
# Column sums of `acat` as a (1, 1, n_cat) row vector.
tcat = (np.ones((1, n_relcat), dtype=int) @ acat)[np.newaxis]
# Exactly n_relcat columns of `acat` are non-zero, so their indices (already
# in increasing order) are the relevant categorical features.
relevant_cats = np.flatnonzero(np.abs(acat).sum(axis=0))

In [None]:
# Continuous features: i.i.d. uniform on [-1, 1), shape (n, n_cont).
cont = np.random.uniform(-1, 1, size=(n, n_cont))
# Trailing singleton axis, (n, n_cont, 1), for the batched matmul used later.
cont_ = cont[..., np.newaxis]
contdf = pd.DataFrame(cont, columns=[f'cont{j}' for j in range(n_cont)])

In [None]:
# Full design matrix: categorical columns first, then the continuous ones,
# aligned row by row on the shared index.
xdf = catdf.merge(contdf, left_index=True, right_index=True)

In [None]:
# Two independent random rotations of the relevant continuous subspace.
u1 = special_ortho_group.rvs(n_relcont)
u2 = special_ortho_group.rvs(n_relcont)
# [I | 0]: an (n_relcont, n_cont) matrix whose non-zero columns are, for now,
# the first n_relcont ones.
cont_loading = np.hstack(
    (np.eye(n_relcont), np.zeros((n_relcont, n_cont - n_relcont)))
)
# Shuffle the columns (transpose, shuffle the rows, transpose back).
acont = np.random.permutation(cont_loading.T).T
# The two rotated loadings, each with a leading broadcast axis:
# (1, n_relcont, n_cont).
ct1 = (u1 @ acont)[np.newaxis]
ct2 = (u2 @ acont)[np.newaxis]
# Exactly n_relcont columns of `acont` are non-zero; offset their indices by
# n_cat because the continuous columns come after the categorical ones in xdf.
relevant_conts = n_cat + np.flatnonzero(np.abs(acont).sum(axis=0))

In [None]:
# Ground truth for the joint problem: indices of all relevant columns of xdf.
relevant_features = np.sort(np.hstack((relevant_cats, relevant_conts)))

In [None]:
# Random read-out weights mapping the n_relcont mixed coordinates to the
# dim_y targets; the leading axis of size 1 broadcasts over samples.
t = np.random.uniform(-1, 1, size=(1, dim_y, n_relcont))

In [None]:
# Per-sample signed sum of the relevant categorical codes, shape (n, 1, 1).
chooser = tcat @ cat_
# The median of the chooser splits the samples into two regimes.
q = np.quantile(chooser, .5)
above = chooser > q
not_above = chooser <= q
# Samples above the median use the loading ct1, the others use ct2.
mixed_above = (above * ct1) @ cont_
mixed_other = (not_above * ct2) @ cont_
# Read out the target and drop the singleton axes.
y = np.squeeze(t @ (mixed_above + mixed_other))

In [None]:
# Target as a DataFrame with one column per output dimension: y0, y1, ...
ydf = pd.DataFrame(y, columns=[f'y{k}' for k in range(dim_y)])

## Selection of categorical features

In [None]:
# Run the selection on the categorical features only, using the whole sample
# as a single batch and without the pre-selection step.
cat_select_options = dict(
    hsic_threshold=0.01,
    batch_size=n,
    minibatch_size=200,
    number_of_epochs=3,
    use_preselection=False,
)
cat_selection = select.select(catdf, ydf, **cat_select_options)

In [None]:
# Compare the selected categorical features with the ground truth.
# `.tolist()` converts NumPy integer scalars to plain Python ints, so the
# printed lists read `[1, 5]` instead of `[np.int64(1), np.int64(5)]` under
# NumPy >= 2.
expected = sorted(np.asarray(relevant_cats).tolist())
selected = sorted(np.asarray(cat_selection.hsic_selection).tolist())
# Relevant features that the selection missed.
leftout = sorted(set(expected) - set(selected))
print(f'Expected features:\n{expected}')
print(f'Selected features:\n{selected}')
print(f'Left-out features:\n{leftout}')

## Selection of continuous features 

In [None]:
# Run the selection on the continuous features only, using the whole sample
# as a single batch and without the pre-selection step.
cont_select_options = dict(
    hsic_threshold=0.01,
    batch_size=n,
    minibatch_size=200,
    number_of_epochs=3,
    use_preselection=False,
)
cont_selection = select.select(contdf, ydf, **cont_select_options)

In [None]:
# Compare the selected continuous features with the ground truth. Indices
# returned for `contdf` are shifted by n_cat so that they refer to columns of
# the joint frame, like `relevant_conts` does.
# `.tolist()` converts NumPy integer scalars to plain Python ints, so lists and
# dict keys print as `10` instead of `np.int64(10)` under NumPy >= 2.
expected = sorted(np.asarray(relevant_conts).tolist())
selected = sorted((n_cat + np.asarray(cont_selection.hsic_selection)).tolist())
# Relevant features that the selection missed.
leftout = sorted(set(expected) - set(selected))
of = n_cat + np.asarray(cont_selection.hsic_ordered_features)
# Position of every feature in the importance ordering (0 = first). One pass
# over the ordering replaces a scan per feature; a relevant feature absent
# from the ordering is reported as None instead of raising.
rank_of = {feat: rank for rank, feat in enumerate(of.tolist())}
impgrade_of_relconts = {feat: rank_of.get(feat) for feat in expected}
print(f'Expected features:\n{expected}')
print(f'Importance grade of relevant features:\n{impgrade_of_relconts}')
print(f'Selected features:\n{selected}')
print(f'Left-out features:\n{leftout}')

In [None]:
# Plot the `regcurve` of the continuous selection against a 1-based position.
curve = cont_selection.regcurve
positions = 1 + np.arange(len(curve))
plt.plot(positions, curve)

# HSIC selection 

In [None]:
# Run the selection on all features jointly (categorical and continuous),
# without the pre-selection step. This is the longest cell to run.
joint_select_options = dict(
    hsic_threshold=0.0075,
    batch_size=n,
    minibatch_size=200,
    number_of_epochs=3,
    use_preselection=False,
)
selection = select.select(xdf, ydf, **joint_select_options)

In [None]:
# Compare the jointly selected features with the ground truth.
# `.tolist()` converts NumPy integer scalars to plain Python ints, so lists and
# dict keys print as `10` instead of `np.int64(10)` under NumPy >= 2.
expected = sorted(np.asarray(relevant_features).tolist())
selected = sorted(np.asarray(selection.hsic_selection).tolist())
# Relevant features that the selection missed.
leftout = sorted(set(expected) - set(selected))
of = selection.hsic_ordered_features
orderedfeats = np.asarray(of).tolist()
# Position of every feature in the importance ordering (0 = first). One pass
# over the ordering replaces a scan per feature; a relevant feature absent
# from the ordering is reported as None instead of raising.
# Despite its name, this dict covers all relevant features, not only the
# continuous ones.
rank_of = {feat: rank for rank, feat in enumerate(orderedfeats)}
impgrade_of_relconts = {feat: rank_of.get(feat) for feat in expected}
print(f'Expected features:\n{expected}')
print(f'Importance grade of relevant features:\n{impgrade_of_relconts}')
print(f'Selected features:\n{selected}')
print(f'Left-out features:\n{leftout}')
print(f'Features in decreasing order of importance:\n{orderedfeats}')

## KSG selection

In [None]:
# KSG mutual-information selection on the joint feature set; the call returns
# the selected features together with a second value, kept here as `ksgmis`.
ksg_result = select.ksgmi(xdf, ydf, threshold=0.01)
ksgfeatures, ksgmis = ksg_result

In [None]:
# Compare the KSG-selected features with the ground truth.
# `.tolist()` converts NumPy integer scalars to plain Python ints, so the
# printed lists read `[1, 5]` instead of `[np.int64(1), np.int64(5)]` under
# NumPy >= 2.
expected = sorted(np.asarray(relevant_features).tolist())
selected = sorted(np.asarray(ksgfeatures).tolist())
# Relevant features that the selection missed.
leftout = sorted(set(expected) - set(selected))
print(f'Expected features:\n{expected}')
print(f'Selected features:\n{selected}')
print(f'Left-out features:\n{leftout}')

# HSIC selection with pre-selection 

In [None]:
# Run the selection on all features jointly, this time with the
# pre-selection step switched on.
presel_select_options = dict(
    mi_threshold=0.0,
    hsic_threshold=0.0095,
    batch_size=n,
    minibatch_size=200,
    number_of_epochs=3,
    use_preselection=True,
)
selection = select.select(xdf, ydf, **presel_select_options)

In [None]:
# Compare both stages (pre-selection and final selection) with the ground
# truth.
# `.tolist()` converts NumPy integer scalars to plain Python ints, so the
# printed lists read `[1, 5]` instead of `[np.int64(1), np.int64(5)]` under
# NumPy >= 2.
expected = sorted(np.asarray(relevant_features).tolist())
preselected = sorted(np.asarray(selection.preselection).tolist())
selected = sorted(np.asarray(selection.hsic_selection).tolist())
# Relevant features lost at the pre-selection stage and after the final one.
preleftout = sorted(set(expected) - set(preselected))
leftout = sorted(set(expected) - set(selected))
orderedfeats = np.asarray(selection.hsic_ordered_features).tolist()
mi_orderedfeats = np.asarray(selection.mi_ordered_features).tolist()
print(f'Expected features:\n{expected}')
print(f'Pre-selected features:\n{preselected}')
print(f'Pre-leftout features:\n{preleftout}')
print(f'Selected features:\n{selected}')
print(f'Left-out features:\n{leftout}')
print(f'Features in decreasing order of importance:\n{orderedfeats}')
print(f'Features in decreasing order of MI:\n{mi_orderedfeats}')