In [1]:
import numpy as np
import pandas as pd
from tablebench.core.discretization import KBinsDiscretizer
from sklearn.preprocessing import KBinsDiscretizer as SKD


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Seeded legacy generator so every run draws the same sample.
rnd = np.random.RandomState(seed=42)
# 20 uniform draws between -3 and 3, laid out as a single feature column.
X = rnd.uniform(low=-3, high=3, size=(20, 1))

def _vc(x):
    return pd.Series(x.ravel()).value_counts().sort_index()

In [6]:
# Baseline: discretize the data with scikit-learn's KBinsDiscretizer.
enc_baseline = SKD(encode="ordinal", n_bins=10)
X_baseline = enc_baseline.fit(X).transform(X)
_vc(X_baseline)

BIN EDGES: [array([-2.87649303, -2.12277942, -1.94001779, -1.77804762, -1.20577799,
        -0.58054459,  0.32590352,  0.79921369,  1.51250209,  2.24777977,
         2.81945911])                                                   ]


0.0    2
1.0    2
2.0    2
3.0    2
4.0    2
5.0    2
6.0    2
7.0    2
8.0    2
9.0    2
dtype: int64

In [7]:
# Same data, this time through the project's own KBinsDiscretizer,
# so the bin counts can be compared with the scikit-learn baseline.
enc = KBinsDiscretizer(encode="ordinal", n_bins=10)
X_binned = enc.fit_transform(X)
_vc(X_binned)

0.0    2
1.0    2
2.0    2
3.0    2
4.0    2
5.0    2
6.0    2
7.0    2
8.0    2
9.0    2
dtype: int64

In [8]:
# A larger sample with some NaN values injected.
# This should have a similar distribution over the bins, plus a '-1' bin for the NaNs.
Xm = rnd.uniform(-3, 3, size=100).reshape(-1,1)
Xm[:9] = np.nan  # first 9 rows become missing

# Fit the encoder created for this data; refitting `enc` here would make the
# cell depend on (and mutate) an estimator defined in an earlier cell.
encm = KBinsDiscretizer(n_bins=10, encode="ordinal")
Xm_binned = encm.fit_transform(Xm)

_vc(Xm_binned)

-1.0     9
 0.0     9
 1.0     9
 2.0     9
 3.0     9
 4.0     9
 5.0     9
 6.0     9
 7.0     9
 8.0     9
 9.0    10
dtype: int64

In [9]:
# Put each raw value next to its bin, ordered by raw value, to confirm that
# the NaN rows were assigned bin -1.
pd.set_option("display.max_rows", 101)  # raise the row limit so the table is not truncated
pd.DataFrame(np.column_stack([Xm, Xm_binned])).sort_values(by=0)

Unnamed: 0,0,1
52,-2.966867,0.0
78,-2.847485,0.0
80,-2.811425,0.0
22,-2.793669,0.0
38,-2.728636,0.0
9,-2.721298,0.0
63,-2.61865,0.0
12,-2.60969,0.0
57,-2.555732,0.0
48,-2.552696,1.0


In [11]:
# Edge case: a single column made up entirely of NaN.
A = np.full((10, 1), np.nan)
enc = KBinsDiscretizer(encode="ordinal", n_bins=10)
A_binned = enc.fit_transform(A)

_vc(A_binned)



-1.0    10
dtype: int64

In [None]:
# TODO: test a constant vector (every value identical)