In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
import category_encoders as ce
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from hisel import select
from hisel.select import HSICSelector, FeatureType



In [None]:
from tests.select_test import pyhsiclasso

In [None]:
# Show every column when a DataFrame is rendered: a value of None lifts
# pandas' limit on the number of displayed columns.
pd.options.display.max_columns = None

In [None]:
# Load the example training feature vectors; the path is relative to the
# directory the notebook is run from.
training_csv = '../tests/data/featvector_training_example.csv'
df = pd.read_csv(training_csv)

### Data preprocessing

1. Drop columns whose entries are almost all NaN (more than 95% missing)

In [None]:
# Share of missing rows above which a column is discarded. Named so the
# cut-off is visible and tunable instead of being buried in the comparison.
NA_FRACTION_THRESHOLD = .95

# Number of missing values in each column.
cnt = df.isna().sum()
# Boolean mask over columns: True where more than the threshold share of rows is NaN.
idxallna = cnt > NA_FRACTION_THRESHOLD * len(df)
allnacols = list(cnt.loc[idxallna].index)
# Reassign rather than drop with `inplace=True`, so this cell produces a new
# frame instead of mutating the object created by the loading cell.
df = df.drop(columns=allnacols)
print(f'Dimensionality of dataset after dropping: {df.shape}')

2. Drop rows that don't have `sum_volume_1m_cc`

In [None]:
# Flag the rows where `sum_volume_1m_cc` is missing ...
idxnov1m = df['sum_volume_1m_cc'].isna()
# ... and keep only the rows that are not flagged (all columns retained).
df = df[~idxnov1m]
print(f'Dimensionality of dataset after dropping: {df.shape}')

3. Fill the remaining NaNs with zero

In [None]:
# Report the 20 columns with the most missing values before imputing.
na_per_column = df.isna().sum()
worst_na = na_per_column.sort_values(ascending=False).head(20)
print(f'20 features with the most NaNs:\n{worst_na}')
# Replace every remaining NaN with 0.0; this is applied to all columns,
# whatever their dtype.
df = df.fillna(.0)

4. Ordinal encoding of categorical features

In [None]:
# Columns stored with dtype `object` are the ones handed to the encoder.
cattypes = ['object']
catcols = df.select_dtypes(include=cattypes).columns.tolist()
# Fit the ordinal encoder on the whole frame, then transform that same frame.
ordinal_encoder = ce.OrdinalEncoder(cols=catcols)
df = ordinal_encoder.fit(df).transform(df)

5. Train-test split

In [None]:
# Hold out 30% of the rows as a test set; `random_state` is fixed so the
# split is the same on every run.
# `columns=` already identifies the axis for `drop`, so no `axis` argument
# is passed alongside it.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['guest_ltv_3m']),  # features: every column except the target
    df[['guest_ltv_3m']],               # target, kept as a one-column DataFrame
    test_size=0.3,
    random_state=40,
)

In [None]:
# Print the shape of each piece of the split, in the order the split returned them.
for _label, _part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{_label}.shape: {_part.shape}')

In [None]:
# df_ = df
# ydf = df_[['guest_ltv_3m']].copy()
# xdf = df_.drop(columns = ['guest_ltv_3m']).astype(float)
# x = xdf.values
# y = ydf.values

In [None]:
# pyhsiclasso_selection = pyhsiclasso(
#     x, 
#     y, 
#     xfeattype=FeatureType.CONT, 
#     yfeattype=FeatureType.DISCR,
#     n_features=100,
#     batch_size=400
# )

In [None]:
# print(sorted(pyhsiclasso_selection))

In [None]:
# Separate the full preprocessed frame into features (`xdf`) and the target
# (`ydf`, kept as a one-column DataFrame).
target_column = 'guest_ltv_3m'
xdf = df.drop(columns=[target_column])
ydf = df.loc[:, [target_column]]

In [None]:
# Settings passed as keyword arguments to `select.select` in the selection cell.
device = None  # run on CPU
mi_threshold = 0.09
hsic_threshold = 0.01
number_of_epochs = 3
minibatch_size = 200
batch_size = df.shape[0]  # a single batch as large as the whole frame

In [None]:
# Gather the keyword arguments first so the call itself stays short; the
# values come from the configuration cell.
selection_kwargs = dict(
    mi_threshold=mi_threshold,
    hsic_threshold=hsic_threshold,
    batch_size=batch_size,
    minibatch_size=minibatch_size,
    number_of_epochs=number_of_epochs,
    device=device,
)
# Run the feature selection on the full feature frame and target frame.
selection = select.select(xdf, ydf, **selection_kwargs)

In [None]:
# Plot `selection.regcurve` against 1-based positions on the x-axis.
curve = selection.regcurve
positions = np.arange(len(curve)) + 1
plt.plot(positions, curve)

In [None]:
# Draw `selection.lassopaths` with its own `.plot` method on an 18 x 12 figure.
# NOTE(review): presumably a pandas DataFrame, given the `figsize` keyword — confirm.
lasso_figsize = (18, 12)
paths = selection.lassopaths
paths.plot(figsize=lasso_figsize)

In [None]:
# Display the `mi_ordered_features` attribute of the selection result.
# NOTE(review): presumably the features ranked by mutual information — confirm against hisel.
selection.mi_ordered_features

In [None]:
# Display the `hsic_ordered_features` attribute of the selection result.
# NOTE(review): presumably the features ranked by the HSIC-based selection — confirm against hisel.
selection.hsic_ordered_features