<a href="https://colab.research.google.com/github/tdubon/TF-Decision-Forest/blob/main/TF_Decision_Trees_Trained_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification with TF Decision Trees
Source code from https://keras.io/examples/structured_data/classification_with_tfdf/

In [None]:
!pip install huggingface_hub

In [None]:
!pip install -U tensorflow_decision_forests

In [None]:
!pip install tensorflow==2.7.0



In [None]:
!pip install ipykernel==4.10

In [None]:
!apt-get install -y git-lfs

In [None]:
from huggingface_hub import notebook_login
from huggingface_hub import push_to_hub_keras

# Authenticate with the Hugging Face Hub; the saved output below reports that
# the token was stored at /root/.huggingface/token.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
import math
import urllib
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow import keras
from tensorflow.keras import layers




In [None]:
# Base URL of the dataset files; the loading cell appends ".names", ".data.gz"
# and ".test.gz" to it.
input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
# Name given to the label column: it is appended to the CSV header and the
# column with this name is later mapped to integer class indices.
input_column_header = "income_level"


In [None]:
# Load data

BASE_PATH = input_path

# Derive the column names from the dataset's ".names" file: lines starting with
# "|" are skipped, the text before the first ":" on each remaining line is kept,
# and spaces are replaced with underscores. The first two kept entries are
# dropped (presumably preamble lines rather than columns — TODO confirm).
# The response is opened in a `with` block so the connection is always closed.
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_file:
    CSV_HEADER = [
        line.decode("utf-8").split(":")[0].replace(" ", "_")
        for line in names_file
        if not line.startswith(b"|")
    ][2:]

# The label column is not taken from the ".names" file, so add it explicitly.
CSV_HEADER.append(input_column_header)

# Neither CSV has a header row; both splits share the same column names.
train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)

In [None]:
# Preview the first rows of the training data. `head` has to be called:
# printing the bare attribute shows the bound method's repr instead.
train_data.head()

In [None]:
# Convert the label column from strings to integer class indices.
# The position of each string in `target_labels` becomes its class:
# " - 50000." -> 0 and " 50000+." -> 1.
target_labels = [" - 50000.", " 50000+."]
# `list.index` raises ValueError for any value that is not in `target_labels`,
# so re-running this cell once the columns already hold integers fails.
train_data[input_column_header] = train_data[input_column_header].map(target_labels.index)
test_data[input_column_header] = test_data[input_column_header].map(target_labels.index)

In [None]:
# Observe the shape of the training and test data, then show the first training
# rows transposed so that each column is listed on its own line.
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(train_data.head().T)

In [None]:
# Define metadata

# Target column name.
TARGET_COLUMN_NAME = "income_level"
# Weight column name.
WEIGHT_COLUMN_NAME = "instance_weight"
# Numeric feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Columns that are not categorical features: the numeric features, the instance
# weight and the target. Built once here instead of once per column inside the
# comprehension below.
NON_CATEGORICAL_COLUMN_NAMES = set(
    NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_COLUMN_NAME]
)

# Categorical features and their vocabulary lists: every remaining column,
# mapped to the sorted string forms of its distinct training values.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(str(value) for value in train_data[feature_name].unique())
    for feature_name in CSV_HEADER
    if feature_name not in NON_CATEGORICAL_COLUMN_NAMES
}
# All feature names: numeric first, then categorical in CSV column order.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(CATEGORICAL_FEATURES_WITH_VOCABULARY)

Configure hyperparameters for the tree model.

In [None]:
# Each constant below is forwarded by `create_gbt_with_preprocessor` to
# `tfdf.keras.GradientBoostedTreesModel` under the keyword named in its comment.
# Passed as `growing_strategy`.
GROWING_STRATEGY = "BEST_FIRST_GLOBAL"
# Passed as `num_trees`.
NUM_TREES = 250
# Passed as `min_examples`.
MIN_EXAMPLES = 6
# Passed as `max_depth`.
MAX_DEPTH = 5
# Passed as `subsample`.
SUBSAMPLE = 0.65
# NOTE(review): not referenced by any other cell shown in this notebook —
# confirm whether it was meant to be passed to the model as well.
SAMPLING_METHOD = "RANDOM"
# Passed as `validation_ratio`.
VALIDATION_RATIO = 0.1

In [None]:
#Implement training & evaluation procedure
def prepare_sample(features, target, weight):
    """Cast non-string categorical features to strings.

    Args:
        features: Dict mapping feature names to tensors; updated in place.
        target: Label tensor, returned unchanged.
        weight: Example-weight tensor, returned unchanged.

    Returns:
        The `(features, target, weight)` triple, with every feature listed in
        CATEGORICAL_FEATURES_WITH_VOCABULARY held as a string tensor.
    """
    for feature_name in features:
        values = features[feature_name]
        is_categorical = feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        if is_categorical and values.dtype != tf.dtypes.string:
            # Categorical columns that were read as numbers become strings.
            features[feature_name] = tf.strings.as_string(values)
    return features, target, weight


def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    """Train `model` on `train_data` and print its accuracy on `test_data`.

    Args:
        model: Compiled model exposing Keras-style `fit` and `evaluate`;
            `evaluate` must return two values, the second being the accuracy.
        train_data: DataFrame holding the feature columns plus the
            TARGET_COLUMN_NAME and WEIGHT_COLUMN_NAME columns.
        test_data: DataFrame with the same columns, used for evaluation only.
        num_epochs: Number of epochs passed to `model.fit`.
        batch_size: Optional number of examples per training batch. When None,
            the batching produced by `pd_dataframe_to_tf_dataset` is kept.
    """

    def to_dataset(dataframe, examples_per_batch=None):
        # Convert a DataFrame into a dataset of (features, label, weight).
        dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
            dataframe, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
        )
        if examples_per_batch is not None:
            # The converted dataset is already batched, and `Model.fit` does
            # not apply `batch_size` to tf.data inputs, so the requested size
            # has to be applied to the dataset itself.
            dataset = dataset.unbatch().batch(examples_per_batch)
        return dataset.map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = to_dataset(train_data, batch_size)
    test_dataset = to_dataset(test_data)

    model.fit(train_dataset, epochs=num_epochs)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    

In [None]:
#Create model inputs

def create_model_inputs():
    """Build one scalar Keras input per entry of FEATURE_NAMES.

    Returns:
        Dict mapping each feature name to a `layers.Input` of shape `()`:
        float32 for names in NUMERIC_FEATURE_NAMES, string for all others.
    """
    numeric_names = set(NUMERIC_FEATURE_NAMES)
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if feature_name in numeric_names else tf.string,
        )
        for feature_name in FEATURE_NAMES
    }

## Decision Forests for Target Encoding
 

*   Convert each categorical feature to a numerical representation based on the binary target
*   For every feature value, extract `positive_frequency`, `negative_frequency`, and `positive_probability`




In [None]:
#Implement Binary Target Encoder
class BinaryTargetEncoding(layers.Layer):
    """Layer that replaces integer feature values with binary-target statistics.

    After `adapt`, calling the layer returns three float32 columns per input
    value: how many adapted examples had that value with a positive target,
    how many had it with a negative target, and the positive count divided by
    the sum of both counts.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Start in the "not adapted" state so that `call` can raise its
        # ValueError instead of failing with an AttributeError.
        self.reset_state()

    @staticmethod
    def _count_by_feature_value(feature_values, selected, num_values):
        """Count, per feature value, the examples where `selected` is True.

        Args:
            feature_values: int64 tensor of feature values, one per example.
            selected: bool tensor marking the examples to count.
            num_values: Number of rows in the returned table.

        Returns:
            int32 tensor of shape [num_values, 1]; row `v` is the count for
            feature value `v`.
        """
        selected_feature_values = tf.gather_nd(
            params=feature_values, indices=tf.where(condition=selected)
        )
        # Summing a column of ones grouped by feature value yields the counts.
        return tf.math.unsorted_segment_sum(
            data=tf.ones(
                shape=(selected_feature_values.shape[0], 1), dtype=tf.dtypes.int32
            ),
            segment_ids=selected_feature_values,
            num_segments=num_values,
        )

    def adapt(self, data):
        """Compute the positive and negative frequency tables from `data`.

        Args:
            data: Integer array or tensor of shape [num_examples, 2]. Column 0
                holds the feature values and column 1 the target values.
        """
        # Convert the data to a tensor.
        data = tf.convert_to_tensor(data)
        # Separate the feature values and target values.
        feature_values = tf.cast(data[:, 0], tf.dtypes.int64)
        target_values = tf.cast(data[:, 1], tf.dtypes.bool)

        print("Target encoding: Computing unique feature values...")
        # The number of distinct values sizes both tables. Feature values are
        # used directly as row indices, so they are assumed to be the
        # contiguous indices 0..num_values-1.
        num_values = tf.unique(feature_values).y.shape[0]

        print(
            "Target encoding: Computing frequencies for feature values with positive targets..."
        )
        positive_frequency = self._count_by_feature_value(
            feature_values, target_values, num_values
        )

        print(
            "Target encoding: Computing frequencies for feature values with negative targets..."
        )
        negative_frequency = self._count_by_feature_value(
            feature_values, tf.math.logical_not(target_values), num_values
        )

        print("Target encoding: Storing target encoding statistics...")
        self.positive_frequency_lookup = tf.constant(positive_frequency)
        self.negative_frequency_lookup = tf.constant(negative_frequency)

    def reset_state(self):
        """Discard the adapted statistics; `adapt` must be called again."""
        self.positive_frequency_lookup = None
        self.negative_frequency_lookup = None

    def call(self, inputs):
        """Look up the target statistics for `inputs`.

        Args:
            inputs: Integer array or tensor of shape [num_examples, 1] holding
                feature values.

        Returns:
            float32 tensor with three columns per example: positive frequency,
            negative frequency and positive probability. A value whose two
            frequencies are both zero yields 0 / 0 for the probability.

        Raises:
            ValueError: If `adapt` has not been called.
        """
        if (
            self.positive_frequency_lookup is None
            or self.negative_frequency_lookup is None
        ):
            raise ValueError(
                "You need to call the adapt method to compute target encoding statistics."
            )

        # Convert the inputs to an int64 tensor usable as table indices.
        inputs = tf.convert_to_tensor(inputs)
        inputs = tf.cast(inputs, tf.dtypes.int64)
        # Look up positive frequencies for the input feature values.
        positive_frequency = tf.cast(
            tf.gather_nd(self.positive_frequency_lookup, inputs),
            dtype=tf.dtypes.float32,
        )
        # Look up negative frequencies for the input feature values.
        negative_frequency = tf.cast(
            tf.gather_nd(self.negative_frequency_lookup, inputs),
            dtype=tf.dtypes.float32,
        )
        # Compute the positive probability for the input feature values.
        positive_probability = positive_frequency / (
            positive_frequency + negative_frequency
        )
        # Concatenate and return the looked-up statistics.
        return tf.concat(
            [positive_frequency, negative_frequency, positive_probability], axis=1
        )

In [None]:
# Test the binary target encoder on a small hand-made sample.
# Each row is [feature_value, target]. In this sample value 0 occurs 6 times
# with target 1 and never with target 0, value 1 occurs 4 times with target 1
# and 3 times with target 0, and value 2 occurs once with target 1 and 5 times
# with target 0.
data = tf.constant(
    [
        [0, 1],
        [2, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [2, 0],
        [1, 0],
        [0, 1],
        [2, 1],
        [1, 0],
        [0, 1],
        [2, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [2, 0],
        [1, 0],
        [0, 1],
        [2, 0],
    ]
)

binary_target_encoder = BinaryTargetEncoding()
binary_target_encoder.adapt(data)
# Encode the three feature values; each output row holds the positive
# frequency, negative frequency and positive probability for one value.
print(binary_target_encoder([[0], [1], [2]]))

Target encoding: Computing unique feature values...
Target encoding: Computing frequencies for feature values with positive targets...
Target encoding: Computing frequencies for feature values with negative targets...
Target encoding: Storing target encoding statistics...
tf.Tensor(
[[6.         0.         1.        ]
 [4.         3.         0.5714286 ]
 [1.         5.         0.16666667]], shape=(3, 3), dtype=float32)


In [None]:
#Implement a feature encoding with target encoding

def create_target_encoder():
    """Build a Keras model that target-encodes the categorical features.

    Each categorical input is mapped to an integer index and then encoded by a
    `BinaryTargetEncoding` layer adapted on the matching `train_data` column
    and the training targets. Numeric inputs only gain a trailing axis.

    Returns:
        A `keras.Model` from the dict of feature inputs to one tensor holding
        all encoded features concatenated along axis 1.
    """
    model_inputs = create_model_inputs()
    training_targets = train_data[[TARGET_COLUMN_NAME]].to_numpy()

    def encode_categorical(feature_name, symbolic_input):
        # No mask token and no out-of-vocabulary bucket: every value is
        # expected to be in the vocabulary.
        string_to_index = layers.StringLookup(
            vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name],
            mask_token=None,
            num_oov_indices=0,
        )
        symbolic_indices = string_to_index(symbolic_input)
        # Pair the training column's indices with the targets to adapt on.
        print("### Adapting target encoding for:", feature_name)
        training_strings = train_data[[feature_name]].to_numpy().astype(str)
        training_indices = string_to_index(training_strings)
        adapt_data = tf.concat([training_indices, training_targets], axis=1)
        target_encoder = BinaryTargetEncoding()
        target_encoder.adapt(adapt_data)
        # Encode the symbolic indices with the adapted statistics.
        return target_encoder(tf.expand_dims(symbolic_indices, -1))

    encoded_columns = [
        encode_categorical(feature_name, symbolic_input)
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        else tf.expand_dims(symbolic_input, -1)
        for feature_name, symbolic_input in model_inputs.items()
    ]
    return keras.Model(
        inputs=model_inputs, outputs=tf.concat(encoded_columns, axis=1)
    )

In [None]:
#Gradient Boosted Tree Model with preprocessor

def create_gbt_with_preprocessor(preprocessor):
    """Create a compiled gradient boosted trees classifier.

    Args:
        preprocessor: Object passed to the model as its `preprocessing` step.

    Returns:
        A `tfdf.keras.GradientBoostedTreesModel` configured from the notebook's
        hyperparameter constants and compiled with a binary accuracy metric
        named "accuracy".
    """
    hyperparameters = {
        "growing_strategy": GROWING_STRATEGY,
        "num_trees": NUM_TREES,
        "max_depth": MAX_DEPTH,
        "min_examples": MIN_EXAMPLES,
        "subsample": SUBSAMPLE,
        "validation_ratio": VALIDATION_RATIO,
    }
    model = tfdf.keras.GradientBoostedTreesModel(
        preprocessing=preprocessor,
        task=tfdf.keras.Task.CLASSIFICATION,
        **hyperparameters,
    )
    model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return model

In [None]:
# Train and evaluate a gradient boosted trees model that uses the target
# encoder as its preprocessing step.

gbt_model = create_gbt_with_preprocessor(create_target_encoder())
run_experiment(gbt_model, train_data, test_data)

# Decision Forests with trained embeddings

1.   Train the embedding encoder together with a linear model using backpropagation
2.   Use the trained encoder to preprocess the input features of the Gradient Boosted Trees model


In [None]:
#Implement feature encoding with embeddings
def create_embedding_encoder():
    """Build a Keras model that embeds the categorical features.

    Each categorical input is mapped to an integer index and then to an
    embedding whose width is the integer part of the square root of the
    vocabulary size. Numeric inputs only gain a trailing axis.

    Returns:
        A `keras.Model` from the dict of feature inputs to one tensor holding
        all encoded features concatenated along axis 1.
    """
    model_inputs = create_model_inputs()

    def embed_categorical(feature_name, symbolic_input):
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # No mask token and no out-of-vocabulary bucket: every value is
        # expected to be in the vocabulary.
        string_to_index = layers.StringLookup(
            vocabulary=vocabulary, mask_token=None, num_oov_indices=0
        )
        symbolic_indices = string_to_index(symbolic_input)
        vocabulary_size = len(vocabulary)
        embedding = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=int(math.sqrt(vocabulary_size)),
        )
        return embedding(symbolic_indices)

    encoded_columns = [
        embed_categorical(feature_name, symbolic_input)
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        else tf.expand_dims(symbolic_input, -1)
        for feature_name, symbolic_input in model_inputs.items()
    ]
    return keras.Model(
        inputs=model_inputs, outputs=layers.concatenate(encoded_columns, axis=1)
    )

In [None]:
#Build linear model to train embeddings
def create_linear_model(encoder):
    """Wrap `encoder` in a single-unit sigmoid classifier.

    Args:
        encoder: Callable applied to the dict of model inputs; its output is
            fed to the dense layer.

    Returns:
        A `keras.Model` compiled with the Adam optimizer, binary cross-entropy
        loss and a binary accuracy metric named "accuracy".
    """
    feature_inputs = create_model_inputs()
    encoded_features = encoder(feature_inputs)
    probability = layers.Dense(units=1, activation="sigmoid")(encoded_features)

    model = keras.Model(inputs=feature_inputs, outputs=probability)
    compile_options = {
        "optimizer": keras.optimizers.Adam(),
        "loss": keras.losses.BinaryCrossentropy(),
        "metrics": [keras.metrics.BinaryAccuracy("accuracy")],
    }
    model.compile(**compile_options)
    return model


# Keep a reference to the encoder: it is trained here as part of the linear
# model and reused afterwards as the preprocessor of the tree model.
embedding_encoder = create_embedding_encoder()
# Train the linear model for 3 epochs, requesting a batch size of 256, and
# print its test accuracy.
run_experiment(
    create_linear_model(embedding_encoder),
    train_data,
    test_data,
    num_epochs=3,
    batch_size=256,
)

# Train and evaluate GB Tree model with embeddings

In [None]:
# Build a new gradient boosted trees model whose preprocessor is the embedding
# encoder trained with the linear model, then train and evaluate it.
# This rebinds `gbt_model`, replacing the target-encoding model created earlier.
gbt_model = create_gbt_with_preprocessor(embedding_encoder)
run_experiment(gbt_model, train_data, test_data)

In [None]:
# Display the value returned by the model's `get_weights()` as the cell output.
gbt_model.get_weights()