# TODO : 
- como pasar al embedding que las categoricas de la misma columna son del mismo grupo(?)

In [6]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sys

sys.path.append('../')

from src.models import build_tab_transformer, build_tab_transformer_v2, build_tab_transformer_v3

from src.layers import CategoricalFeatureEmbedding, NumericalFeatureEmbedding, FeatureEmbedding, TransformerEncoder

In [1]:
# Load a Dataset

# https://epistasislab.github.io/
from pmlb import fetch_data

data = fetch_data('adult')

In [2]:
# Prepare Dataset
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


target_column = 'target'
categorical_features = data.drop(columns=[target_column]).select_dtypes(include=np.int64).columns.to_list()
numeric_features = data.select_dtypes(include=np.float64).columns.to_list()

features = data[numeric_features+categorical_features]
target = data[target_column]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
categorical_transformer = OrdinalEncoder(dtype=np.int64,handle_unknown="use_encoded_value",unknown_value=999)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=True,
)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [3]:
X_train_df = pd.DataFrame(X_train_transformed,columns = [name.split("__")[1] for name in preprocessor.get_feature_names_out()])
X_test_df = pd.DataFrame(X_test_transformed,columns = [name.split("__")[1] for name in preprocessor.get_feature_names_out()])

In [9]:
def build_baseline_model(categories,num_continuous):
    num_categories = len(categories)
    cat_input = keras.layers.Input(shape=(num_categories,), name='cat_inputs')
    num_input = keras.layers.Input(shape=(num_continuous,), name='num_inputs')
    x=keras.layers.Dense(256, input_shape=(num_continuous,), activation='relu')(num_input)
    x=keras.layers.Dense(128, input_shape=(num_continuous,), activation='relu')(x)
    x=keras.layers.Dense(64, input_shape=(num_continuous,), activation='relu')(x)
    x=keras.layers.Dense(32, activation='relu')(x)
    outputs = keras.layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=[num_input, cat_input], outputs=outputs)
    
    return model

In [19]:
tf.keras.backend.clear_session()

categories = preprocessor.transformers_[1][1].categories_
categories = [len(c)+1 for c in categories]
num_continuous = len(numeric_features)   

# model = build_tab_transformer(categories,num_continuous, dim=32,
#             dim_out=1, depth=6, heads=8, dense_dim=16, mlp_hidden_mults = (4, 2),
#             mlp_act = keras.layers.ReLU(),attn_dropout = 0.1,ff_dropout = 0.1)

# base_model = build_baseline_model(categories,num_continuous)

modelv2 = build_tab_transformer_v2(categories,num_continuous, dim=32,
            dim_out=1, depth=6, heads=8, ff_dim=16, mlp_hidden_mults = (4, 2),
            mlp_act = keras.layers.ReLU(),attn_dropout = 0.1,ff_dropout = 0.1)



In [8]:
modelv2.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cat_inputs (InputLayer)        [(None, 8)]          0           []                               
                                                                                                  
 categorical_feature_embedding   (None, 8, 32)       3488        ['cat_inputs[0][0]']             
 (CategoricalFeatureEmbedding)                                                                    
                                                                                                  
 transformer_encoder (Transform  (None, 8, 32)       34768       ['categorical_feature_embedding[0
 erEncoder)                                                      ][0]']                           
                                                                                              

In [9]:
# model(X_train.values)
import tensorflow as tf

def map_func(x,y):
    return {'num_inputs': x[0],'cat_inputs': x[1]}, y
    
X_ds = tf.data.Dataset.from_tensor_slices((X_train[numeric_features].values, X_train[categorical_features].values))
y_ds = tf.data.Dataset.from_tensor_slices(y_train.values)
train_ds = tf.data.Dataset.zip((X_ds, y_ds))

X_val_ds = tf.data.Dataset.from_tensor_slices((X_test[numeric_features].values,X_test[categorical_features].values))
y_val_ds = tf.data.Dataset.from_tensor_slices(y_test.values)
val_ds = tf.data.Dataset.zip((X_val_ds, y_val_ds))


batch_size = 256
train_ds = train_ds.map(map_func).batch(batch_size)
val_ds = val_ds.map(map_func).batch(batch_size)

In [11]:


def compile_and_fit_model(model, train_ds, val_ds, epochs=1, lr=1e-4):
    opt = keras.optimizers.Adam(learning_rate=lr)
    loss = tf.keras.losses.CategoricalCrossentropy()
    metrics = []
    model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

    early_stop_callback = keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5
    )
    callbacks = [early_stop_callback]

    metrics = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.Accuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
    ]

    history = model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=callbacks, metrics=metrics)
    return history

In [20]:
compile_and_fit_model(modelv2, train_ds, val_ds)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40


<keras.callbacks.History at 0x1f14d7376d0>

In [22]:
modelv2.save("../models/tab-transformer-v2-21_epochs")





INFO:tensorflow:Assets written to: ../models/tab-transformer-v2-21_epochs\assets


INFO:tensorflow:Assets written to: ../models/tab-transformer-v2-21_epochs\assets


In [556]:
opt = keras.optimizers.Adam(learning_rate=1e-5, decay=1e-3 / 200)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

early_stop_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5
)

model.fit(train_ds, validation_data=val_ds, epochs=40, callbacks=[early_stop_callback])


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1948275ff40>

In [None]:
model.save

In [557]:
model.save("../models/tab-transformer-40_epochs")



INFO:tensorflow:Assets written to: ../models/tab-transformer-40_epochs\assets


INFO:tensorflow:Assets written to: ../models/tab-transformer-40_epochs\assets


In [559]:
# model.summary()

loaded_model = tf.keras.models.load_model('../models/tab-transformer-40_epochs')


In [560]:
# base_preds = base_model.predict(x=[X_test[numeric_features].values,X_test[categorical_features].values])

loaded_model.predict(x=[X_test[numeric_features].values,X_test[categorical_features].values])

# preds = model(inputs=[X_test[numeric_features].values,X_test[categorical_features].values])




array([[0.09662233],
       [0.7259842 ],
       [0.6931739 ],
       ...,
       [0.71905434],
       [0.9878825 ],
       [0.9762927 ]], dtype=float32)

In [533]:
# import matplotlib.pyplot as plt

# plt.plot(base_preds)
# plt.show()

base_preds[base_preds==1]

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [561]:
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer = tf.keras.layers.StringLookup()
layer.adapt(data)
layer.get_vocabulary()

['[UNK]', 'd', 'z', 'c', 'b', 'a']

In [576]:
# layer(data)

df_cat_std = X_train[categorical_features]

In [577]:
for col in df_cat_std.columns:
    df_cat_std.loc[:,col]=df_cat_std.loc[:,col].apply(lambda x: col+"_"+str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_std.loc[:,col]=df_cat_std.loc[:,col].apply(lambda x: col+"_"+str(x))


In [580]:

data = df_cat_std.values
layer = tf.keras.layers.StringLookup()
layer.adapt(data)
layer.get_vocabulary()

['[UNK]',
 'native-country_39',
 'race_4',
 'workclass_4',
 'sex_1',
 'marital-status_2',
 'relationship_0',
 'sex_0',
 'marital-status_4',
 'education_11',
 'relationship_1',
 'education_15',
 'education_9',
 'relationship_3',
 'marital-status_0',
 'occupation_10',
 'occupation_3',
 'occupation_4',
 'occupation_1',
 'occupation_12',
 'relationship_4',
 'occupation_8',
 'race_2',
 'workclass_6',
 'workclass_2',
 'occupation_7',
 'occupation_0',
 'workclass_0',
 'education_12',
 'occupation_14',
 'relationship_5',
 'occupation_6',
 'education_8',
 'workclass_7',
 'education_1',
 'workclass_5',
 'education_7',
 'marital-status_5',
 'marital-status_6',
 'race_1',
 'relationship_2',
 'occupation_5',
 'occupation_13',
 'workclass_1',
 'education_0',
 'occupation_11',
 'education_5',
 'native-country_26',
 'native-country_0',
 'education_14',
 'education_6',
 'education_2',
 'marital-status_3',
 'education_10',
 'education_4',
 'race_0',
 'race_3',
 'native-country_30',
 'education_3',
 'occ

In [588]:

print(layer(data[:3]))
print(data[:3])

tf.Tensor(
[[ 3 34  8 19 13  2  7  1]
 [ 3  9  5 18 30  2  7  1]
 [27 12  5 26  6  2  4  1]], shape=(3, 8), dtype=int64)
[['workclass_4' 'education_1' 'marital-status_4' 'occupation_12'
  'relationship_3' 'race_4' 'sex_0' 'native-country_39']
 ['workclass_4' 'education_11' 'marital-status_2' 'occupation_1'
  'relationship_5' 'race_4' 'sex_0' 'native-country_39']
 ['workclass_0' 'education_9' 'marital-status_2' 'occupation_0'
  'relationship_0' 'race_4' 'sex_1' 'native-country_39']]
