In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sys
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

sys.path.append('../')

In [3]:
from src.layers import (
    NumericalFeatureEmbedding,
    CategoricalFeatureEmbedding, 
    FeatureEmbedding, 
    TransformerEncoder, 
    MLP
)

### Numerical Features Embeddings

In [29]:

num_features = 2
dim_token = 6

x_num = np.random.random(size=(100, 2))

d_sqrt_inv = 1 / np.sqrt(dim_token)

# Learnable weights
w = np.random.uniform(low=-d_sqrt_inv, high=d_sqrt_inv, size=(num_features,dim_token))
b = np.random.uniform(low=-d_sqrt_inv, high=d_sqrt_inv, size=(num_features,dim_token))

output = w[np.newaxis] *x_num[...,np.newaxis] + b

print("weights shape: ",w[np.newaxis].shape)
print("x shape: ",x_num[...,np.newaxis].shape)
print("output shape: ",(w[np.newaxis] *x_num[...,np.newaxis]).shape)

weights shape:  (1, 2, 6)
x shape:  (100, 2, 1)
output shape:  (100, 2, 6)


In [33]:
from sklearn.datasets import make_classification

num_features = 10
dim_token = 32

X, y = make_classification(
    n_samples=1000, n_features=num_features, 
    n_informative=num_features, n_redundant=0, 
    n_repeated=0, n_classes=2, 
    n_clusters_per_class=2,
    shuffle=True, random_state=123
)

ne = NumericalFeatureEmbedding(num_features=num_features,dim_token=dim_token)

ne(X)

In [None]:
# Basic MLP classification with NumericalFeatureEmbedding 



### Categorical Features Embeddings

In [45]:
cardinalities = [5, 10, 20, 10,5]
num_features = len(cardinalities)
dim_token = 6

x_cat = np.concatenate([np.random.randint(low=0,high=c,size=(100,1)) for c in cardinalities], axis=1)

offsets = np.cumsum([0] + cardinalities[:-1], axis=0)

total_tokens = sum(cardinalities)

# # Learnable weights
d_sqrt_inv = 1 / np.sqrt(dim_token)

emb = keras.layers.Embedding(input_dim=total_tokens, output_dim=dim_token)
b = np.random.uniform(low=-d_sqrt_inv, high=d_sqrt_inv, size=(num_features,dim_token))
output = emb(x_cat + offsets) + b

print("input shape: ",x_num.shape)
print("output shape: ",output.shape)

input shape:  (100, 2)
output shape:  (100, 5, 6)


In [119]:

cardinalities = [3, 5, 10, 15, 20, 25, 30, 35]
num_features = len(cardinalities)
dim_token = 32

X, y = make_classification(
    n_samples=1000, n_features=num_features, 
    n_informative=num_features, n_redundant=0, 
    n_repeated=0, n_classes=2, 
    n_clusters_per_class=2,
    shuffle=True, random_state=123
)

# convert continuous to cat with quantils
X_cat = np.concatenate([
    np.digitize(X[:,i], bins=np.quantile(X[:,i], np.linspace(0,1,c+1))[1:-1])[...,np.newaxis]
    for i, c in enumerate(cardinalities)
], axis=1)


ne = CategoricalFeatureEmbedding(cardinalities=cardinalities,dim_token=dim_token)

ne(X_cat)

<tf.Tensor: shape=(1000, 8, 32), dtype=float32, numpy=
array([[[-0.21157132,  0.06224575, -0.16173288, ..., -0.21423268,
         -0.12774879, -0.15862733],
        [-0.09974983, -0.07074656, -0.06262894, ...,  0.08302263,
          0.2019066 , -0.09884878],
        [-0.15838726,  0.02515485, -0.07299703, ..., -0.26734942,
         -0.06252657,  0.03538641],
        ...,
        [ 0.08510815,  0.21257526, -0.03557715, ..., -0.15214893,
          0.12951714, -0.17795222],
        [ 0.15064844,  0.27025256,  0.04708365, ...,  0.02033691,
         -0.02297193,  0.05043186],
        [ 0.10367307, -0.12014414, -0.26733568, ..., -0.068225  ,
         -0.01493998,  0.04364815]],

       [[-0.08266047,  0.23557323, -0.01562801, ..., -0.05817087,
         -0.23796213,  0.05267194],
        [-0.22358014,  0.01417179, -0.08427122, ..., -0.00529899,
         -0.11553805, -0.02467071],
        [-0.24210581,  0.16111772, -0.09392573, ...,  0.03025427,
         -0.02894174,  0.10917954],
        ...,

In [None]:
# Basic MLP classification with CategoricalFeatureEmbedding 