In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import special_ortho_group
from hisel.select import HSICSelector as Selector, FeatureType

# Continuous features, discrete target

In [None]:
# Problem dimensions for the synthetic experiment.
d = 50  # total number of features to select from
n = 3000  # sample size
n_features = 8  # relevant features for the classification

# Seed NumPy's global random state so that a fresh "Restart & Run All"
# regenerates the same synthetic data. Both the np.random.* calls and
# scipy's `special_ortho_group.rvs` (when no random_state is passed) draw
# from this global state.
SEED = 0
np.random.seed(SEED)

### Generate synthetic data

In [None]:
# n samples of d features, each drawn uniformly from [0, 1).
x = np.random.uniform(size=(n, d))

# Column indices of the relevant features, drawn without replacement.
features = list(np.random.choice(d, replace=False, size=n_features))
z = x[:, features]

# Mix the relevant columns with a random special-orthogonal matrix.
# Broadcasting: (1, k, k) @ (n, k, 1) -> (n, k, 1); the trailing axis is
# dropped again so that u has one row per sample.
rotation = special_ortho_group.rvs(n_features)
u = (rotation[np.newaxis, :, :] @ z[:, :, np.newaxis])[..., 0]

# Discrete target: for every sample, count the adjacent column pairs of u
# whose left entry exceeds the right one. Shape (n, 1), integer dtype.
y = (u[:, :-1] > u[:, 1:]).sum(axis=1, keepdims=True, dtype=int)

In [None]:
# Continuous inputs, discrete (class-count) target.
selector = Selector(
    x,
    y,
    xfeattype=FeatureType.CONT,
    yfeattype=FeatureType.DISCR,
)
# Names the selector assigned to the columns that actually drive y;
# used further down to check what the selection recovered.
expected_features = list(np.take(np.array(selector.feature_names), features))

### Run selection algorithm

In [None]:
# Settings forwarded to selector.select / selector.autoselect below.
device = None  # run on CPU
number_of_epochs = 3
minibatch_size = 500
batch_size = n // 2  # half of the sample
threshold = 0.075  # only passed to autoselect

#### Selection: We ask for the `n_features` most relevant features for the classification task

In [None]:
# Request exactly as many features as were used to build the target.
# The result is printed next to `expected_features` further down.
selection = selector.select(
    number_of_features=n_features,
    device=device,
    number_of_epochs=number_of_epochs,
    minibatch_size=minibatch_size,
    batch_size=batch_size,
)

#### Auto-selection: we do not prescribe the number of features to select

In [None]:
# No feature count is given here; `threshold` is passed instead and the
# selector decides how many features to keep.
autoselection = selector.autoselect(
    threshold=threshold,
    device=device,
    number_of_epochs=number_of_epochs,
    minibatch_size=minibatch_size,
    batch_size=batch_size,
)

In [None]:
# Show the ground truth next to both selections, each sorted so the three
# lists can be compared by eye.
print(
    f'Expected features:\n{sorted(expected_features)}',
    f'Selected features:\n{sorted(selection)}',
    f'Auto-selected features:\n{sorted(autoselection)}',
    sep='\n',
)

#### Visualise importance curve as more features are selected

In [None]:
# Importance curve: take the last row of the lasso path, sort its entries
# in decreasing order and accumulate them, so that position k shows the
# total contributed by the k highest-ranked features.
paths = selector.lasso_path()
curve = np.cumsum(np.sort(paths.iloc[-1, :])[::-1])

# Explicit figure/axes with a title and labels so the plot stands alone
# when the notebook is skimmed; the trailing `;` hides the text repr.
fig, ax = plt.subplots()
ax.plot(np.arange(1, 1 + len(curve)), curve)
ax.set(
    title='Importance curve',
    xlabel='Number of features (ranked by decreasing final path value)',
    ylabel='Cumulative importance',
);

#### Visualise lasso paths of relevant features

In [None]:
# Lasso paths of the columns that were used to build the target.
paths = selector.lasso_path()

# Scale so that the largest entry of the whole frame equals 1. Rebinding
# (rather than `paths /= ...`) leaves the object returned by lasso_path()
# untouched; an in-place division would also rescale it, which matters if
# the selector keeps a reference to that frame (not verifiable from here).
paths = paths / paths.max().max()

ax = paths.iloc[:, features].plot(figsize=(16, 10))
ax.set(
    title='Lasso paths of the relevant features',
    ylabel='Path value (normalised by the global maximum)',
);

# Discrete features, discrete target

In [None]:
# Problem dimensions for the synthetic experiment.
d = 50  # total number of features to select from
n = 3000  # sample size
n_features = 8  # relevant features for the classification

# Seed NumPy's global random state so that the np.random.* draws in the
# data-generation cell below are the same on every fresh run of this
# section.
SEED = 0
np.random.seed(SEED)

### Generate synthetic data

In [None]:
# Number of distinct levels per feature, drawn from [2, 2 * n_features).
cardinalities = np.random.randint(low=2, high=2 * n_features, size=(d,))

# One integer column per feature, with values in [0, m) for its own m.
# Columns are drawn one at a time, in feature order.
columns = [np.random.randint(m, size=(n, 1)) for m in cardinalities]
x = np.hstack(columns)

# Column indices of the relevant features, drawn without replacement.
features = list(np.random.choice(d, replace=False, size=n_features))

# Discrete target: for every sample, count the consecutive pairs of
# relevant columns (in the order of `features`) whose first value exceeds
# the second. Shape (n, 1), integer dtype.
relevant = x[:, features]
y = (relevant[:, :-1] > relevant[:, 1:]).sum(axis=1, keepdims=True, dtype=int)

In [None]:
# x is integer-valued but is deliberately declared continuous: per the
# original author's note, this improves accuracy.
selector = Selector(
    x,
    y,
    yfeattype=FeatureType.DISCR,
    xfeattype=FeatureType.CONT,
)
# Names the selector assigned to the columns that actually drive y;
# used further down to check what the selection recovered.
expected_features = list(np.take(np.array(selector.feature_names), features))

### Run selection algorithm

In [None]:
# Settings forwarded to selector.select / selector.autoselect below.
device = None  # run on CPU
number_of_epochs = 3
minibatch_size = 500
batch_size = n // 2  # half of the sample
threshold = 0.075  # only passed to autoselect

#### Selection: We ask for the `n_features` most relevant features for the classification task

In [None]:
# Request exactly as many features as were used to build the target.
# The result is printed next to `expected_features` further down.
selection = selector.select(
    number_of_features=n_features,
    device=device,
    number_of_epochs=number_of_epochs,
    minibatch_size=minibatch_size,
    batch_size=batch_size,
)

#### Auto-selection: we do not prescribe the number of features to select

In [None]:
# No feature count is given here; `threshold` is passed instead and the
# selector decides how many features to keep.
autoselection = selector.autoselect(
    threshold=threshold,
    device=device,
    number_of_epochs=number_of_epochs,
    minibatch_size=minibatch_size,
    batch_size=batch_size,
)

In [None]:
# Show the ground truth next to both selections, each sorted so the three
# lists can be compared by eye.
print(
    f'Expected features:\n{sorted(expected_features)}',
    f'Selected features:\n{sorted(selection)}',
    f'Auto-selected features:\n{sorted(autoselection)}',
    sep='\n',
)

#### Visualise importance curve as more features are selected

In [None]:
# Importance curve: take the last row of the lasso path, sort its entries
# in decreasing order and accumulate them, so that position k shows the
# total contributed by the k highest-ranked features.
paths = selector.lasso_path()
curve = np.cumsum(np.sort(paths.iloc[-1, :])[::-1])

# Explicit figure/axes with a title and labels so the plot stands alone
# when the notebook is skimmed; the trailing `;` hides the text repr.
fig, ax = plt.subplots()
ax.plot(np.arange(1, 1 + len(curve)), curve)
ax.set(
    title='Importance curve',
    xlabel='Number of features (ranked by decreasing final path value)',
    ylabel='Cumulative importance',
);

#### Visualise lasso paths of relevant features

In [None]:
# Lasso paths of the columns that were used to build the target.
paths = selector.lasso_path()

# Scale so that the largest entry of the whole frame equals 1. Rebinding
# (rather than `paths /= ...`) leaves the object returned by lasso_path()
# untouched; an in-place division would also rescale it, which matters if
# the selector keeps a reference to that frame (not verifiable from here).
paths = paths / paths.max().max()

ax = paths.iloc[:, features].plot(figsize=(16, 10))
ax.set(
    title='Lasso paths of the relevant features',
    ylabel='Path value (normalised by the global maximum)',
);