<a href="https://colab.research.google.com/github/tdubon/Forest_Cover/blob/master/TFDecisionTrees_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification with TF Decision Trees
Source code from https://keras.io/examples/structured_data/classification_with_tfdf/

In [None]:
# Install the Hugging Face Hub client, used below for login and for pushing the model.
!pip install huggingface_hub

In [14]:
# Pin NumPy to 1.20.
# NOTE(review): the reason for this pin is not recorded in the notebook — confirm it is still required.
!pip install numpy==1.20



In [None]:
# Pin folium to 0.2.1. folium is not imported anywhere in the visible cells.
# NOTE(review): presumably a workaround for a dependency conflict in the hosting environment — confirm.
!pip install folium==0.2.1

In [None]:
# Pin imgaug to 0.2.6. imgaug is not imported anywhere in the visible cells.
# NOTE(review): presumably a workaround for a dependency conflict in the hosting environment — confirm.
!pip install imgaug==0.2.6

In [None]:
# Pin TensorFlow to 2.8.0; TensorFlow Decision Forests is installed in a later cell.
!pip install tensorflow==2.8.0

In [None]:
!pip install -U tensorflow_decision_forests

In [None]:
# Pin ipykernel to 4.10.
# NOTE(review): the reason for this pin is not recorded in the notebook — confirm it is still required.
!pip install ipykernel==4.10

In [None]:
# Install git-lfs system-wide (-y answers the apt prompt automatically).
# NOTE(review): presumably needed for pushing model files to the Hub — confirm.
!apt-get install -y git-lfs

In [None]:
# Install wurlitzer, which provides the `sys_pipes` helper imported further down.
!pip install wurlitzer 

In [21]:
from huggingface_hub import notebook_login
from huggingface_hub.keras_mixin import push_to_hub_keras

In [12]:
# Authenticate against the Hugging Face Hub; on success the token is saved locally
# (the recorded output shows it written under /root/.huggingface/token).
notebook_login()

ERROR:root:HfApi.login: This method is deprecated in favor of `set_access_token`.


Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [1]:
import math
import os
import tempfile
import urllib
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow import keras
from tensorflow.keras import layers




In [40]:
# Create a fresh scratch directory for notebook artifacts.
# tempfile.mkdtemp() leaves cleanup to the caller; nothing in this notebook deletes it.
tmpdir = tempfile.mkdtemp()

In [2]:
# Import `sys_pipes` from wurlitzer, falling back to Colab's CaptureLog (bound to
# the same name) when wurlitzer is not installed.
# Only ImportError triggers the fallback: a bare `except` would also swallow
# KeyboardInterrupt and any unrelated error raised while importing wurlitzer.
try:
  from wurlitzer import sys_pipes
except ImportError:
  from colabtools.googlelog import CaptureLog as sys_pipes

In [3]:
# URL prefix shared by the dataset files; the loading cell appends ".names",
# ".data.gz" and ".test.gz" to it.
input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
# Name given to the label column; it is appended to the parsed column header.
input_column_header = "income_level"


In [4]:
# Load data
#
# All dataset files share the `input_path` URL prefix: "<prefix>.names" describes
# the columns, "<prefix>.data.gz" and "<prefix>.test.gz" hold the rows.
BASE_PATH = input_path

# Build the column names from the ".names" file: skip lines starting with "|",
# keep the text before the first ":" with spaces replaced by underscores, and
# drop the first two remaining entries.
# The response is opened in a `with` block so the HTTP connection is closed as
# soon as the header has been parsed.
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_file:
  CSV_HEADER = [
    line.decode("utf-8").split(":")[0].replace(" ", "_")
    for line in names_file
    if not line.startswith(b"|")
  ][2:]

# The label is not described in the ".names" listing parsed above, so add it last.
CSV_HEADER.append(input_column_header)

# The CSV files carry no header row; column names come from CSV_HEADER.
train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)

In [5]:
# Recode the " ?" placeholder in the training set's MSA-migration column as "Unansw".
# Series.replace matches whole values, so only cells equal to " ?" change.
msa_column = "migration_code-change_in_msa"
train_data[msa_column] = train_data[msa_column].replace(" ?", "Unansw")

In [6]:
# Recode the " ?" placeholder in the test set's MSA-migration column as "Unansw".
# Series.replace matches whole values, so only cells equal to " ?" change.
msa_column = "migration_code-change_in_msa"
test_data[msa_column] = test_data[msa_column].replace(" ?", "Unansw")

In [7]:
# Show the distinct values left in the column after recoding " ?" to "Unansw".
print(train_data["migration_code-change_in_msa"].unique())

['Unansw' ' MSA to MSA' ' Nonmover' ' NonMSA to nonMSA' ' Not in universe'
 ' Not identifiable' ' Abroad to MSA' ' MSA to nonMSA' ' Abroad to nonMSA'
 ' NonMSA to MSA']


In [8]:
# Replace the four header names that contain "'" or "-" with plain
# underscore-only names. Any other header entry is left unchanged.
header_renames = {
  "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
  "migration_code-change_in_msa": "migration_code_chx_in_msa",
  "migration_code-change_in_reg": "migration_code_chx_in_reg",
  "migration_code-move_within_reg": "migration_code_move_within_reg",
}
# Slice assignment updates the existing list object in place.
CSV_HEADER[:] = [header_renames.get(column, column) for column in CSV_HEADER]

In [9]:
# Inspect the distinct values of the label column ("income_level", the same
# name stored in input_column_header) before they are converted to integers.
classes = train_data["income_level"].unique().tolist()
print(f"Label classes: {classes}")

Label classes: [' - 50000.', ' 50000+.']


In [10]:
# Rename columns containing invalid characters ("'" and "-").
# The same mapping is applied to both frames.
column_renames = {
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
}
train_data = train_data.rename(columns=column_renames)
# Rename the test frame from itself. Deriving it from train_data (as before)
# silently replaced the held-out rows with the training rows, so the reported
# "test" accuracy was measured on data the model had been trained on.
test_data = test_data.rename(columns=column_renames)

In [12]:
# Convert the string labels to integers (Keras expects integer class labels).
# Each label becomes its position in target_labels: " - 50000." -> 0, " 50000+." -> 1.
# list.index raises ValueError for any value not in target_labels, which is also
# what happens if this cell is run a second time on already-converted labels.
target_labels = [" - 50000.", " 50000+."]
for frame in (train_data, test_data):
    frame[input_column_header] = frame[input_column_header].map(target_labels.index)

In [None]:
# Observe the shape of the training and test data, then show the first rows of
# the training frame transposed (one line per column).
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(train_data.head().T)

In [14]:
#define metadata

# Name of the label column.
TARGET_COLUMN_NAME = "income_level"
# Name of the per-example weight column.
WEIGHT_COLUMN_NAME = "instance_weight"
# Names of the features treated as numeric.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Every header column that is not numeric, the weight, or the label is treated
# as categorical. Its vocabulary is the sorted list of distinct training values,
# each converted to a string. Key order follows CSV_HEADER.
non_categorical_columns = set(NUMERIC_FEATURE_NAMES) | {
    WEIGHT_COLUMN_NAME,
    TARGET_COLUMN_NAME,
}
CATEGORICAL_FEATURES_WITH_VOCABULARY = {}
for feature_name in CSV_HEADER:
    if feature_name in non_categorical_columns:
        continue
    distinct_values = train_data[feature_name].unique()
    CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] = sorted(
        str(value) for value in distinct_values
    )

# All feature names: numeric first, then categorical.
FEATURE_NAMES = [*NUMERIC_FEATURE_NAMES, *CATEGORICAL_FEATURES_WITH_VOCABULARY]

Configure hyperparameters for the tree model.

In [15]:
# Hyperparameters forwarded to tfdf.keras.GradientBoostedTreesModel in create_gbt_model().
GROWING_STRATEGY = "BEST_FIRST_GLOBAL"  # passed as growing_strategy
NUM_TREES = 250  # passed as num_trees
MIN_EXAMPLES = 6  # passed as min_examples
MAX_DEPTH = 5  # passed as max_depth
SUBSAMPLE = 0.65  # passed as subsample
# NOTE(review): SAMPLING_METHOD is defined here but is not passed to the model
# in create_gbt_model() — confirm whether that is intentional.
SAMPLING_METHOD = "RANDOM"
VALIDATION_RATIO = 0.1  # passed as validation_ratio

In [16]:
#Implement training & evaluation procedure
def prepare_sample(features, target, weight):
    """Cast non-string categorical features to strings.

    Args:
        features: Mapping from feature name to tensor; updated in place.
        target: Label value(s), returned unchanged.
        weight: Example weight(s), returned unchanged.

    Returns:
        The (features, target, weight) triple, where every feature listed in
        CATEGORICAL_FEATURES_WITH_VOCABULARY has string dtype.
    """
    # Snapshot the items so entries can be reassigned while looping.
    for name, tensor in list(features.items()):
        is_categorical = name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        if is_categorical and tensor.dtype != tf.dtypes.string:
            features[name] = tf.strings.as_string(tensor)
    return features, target, weight


def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    """Train `model` on `train_data` and print its accuracy on `test_data`.

    Args:
        model: Compiled model exposing `fit` and `evaluate`; `evaluate` must
            return a (loss, accuracy) pair.
        train_data: pandas DataFrame with the feature columns plus the label
            (TARGET_COLUMN_NAME) and weight (WEIGHT_COLUMN_NAME) columns.
        test_data: pandas DataFrame with the same columns, used for evaluation.
        num_epochs: Forwarded to `model.fit` as `epochs`.
        batch_size: Forwarded to `model.fit` as `batch_size`.

    Returns:
        None. The test accuracy is printed as a percentage with two decimals.
    """

    def to_dataset(frame):
        # Both splits get the same conversion: DataFrame -> tf.data.Dataset with
        # label and weight columns split out, then categorical features cast
        # to strings by prepare_sample.
        return tfdf.keras.pd_dataframe_to_tf_dataset(
            frame, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
        ).map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = to_dataset(train_data)
    test_dataset = to_dataset(test_data)

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    # evaluate() returns (loss, accuracy); only the accuracy is reported.
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    

In [17]:
#Create model inputs

def create_model_inputs():
    """Build one scalar Keras Input per feature in FEATURE_NAMES.

    Returns:
        Dict mapping feature name to a `layers.Input` with shape `()`:
        float32 for names in NUMERIC_FEATURE_NAMES, string for all others.
    """
    return {
        name: layers.Input(
            name=name,
            shape=(),
            dtype=tf.float32 if name in NUMERIC_FEATURE_NAMES else tf.string,
        )
        for name in FEATURE_NAMES
    }

# Experiment 1: Decision Forests with raw features

In [18]:
#Decision Forest with raw features
def specify_feature_usages(inputs):
    """Describe each model input as a TF-DF feature usage.

    Args:
        inputs: Mapping from feature name to an object with a `dtype` attribute
            (for example the dict returned by create_model_inputs).

    Returns:
        List of `tfdf.keras.FeatureUsage`, one per key of `inputs` in iteration
        order: NUMERICAL for float32 inputs, CATEGORICAL for everything else.
    """

    def semantic_for(name):
        if inputs[name].dtype == tf.dtypes.float32:
            return tfdf.keras.FeatureSemantic.NUMERICAL
        return tfdf.keras.FeatureSemantic.CATEGORICAL

    return [
        tfdf.keras.FeatureUsage(name=name, semantic=semantic_for(name))
        for name in inputs
    ]
  

In [19]:
#Create GB trees model
def create_gbt_model():
    """Create and compile the gradient boosted trees classifier.

    The model only uses the features returned by specify_feature_usages
    (exclude_non_specified_features=True) and takes its hyperparameters from
    the notebook-level constants.

    Returns:
        A compiled `tfdf.keras.GradientBoostedTreesModel` that reports binary
        accuracy under the metric name "accuracy".
    """
    feature_usages = specify_feature_usages(create_model_inputs())
    hyperparameters = dict(
        growing_strategy=GROWING_STRATEGY,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
    )
    model = tfdf.keras.GradientBoostedTreesModel(
        features=feature_usages,
        exclude_non_specified_features=True,
        task=tfdf.keras.Task.CLASSIFICATION,
        loss="DEFAULT",
        **hyperparameters,
    )

    accuracy_metric = keras.metrics.BinaryAccuracy(name="accuracy")
    model.compile(metrics=[accuracy_metric])
    return model

In [None]:
# Train and evaluate the model: build a fresh gradient boosted trees model, fit
# it on train_data, and print its accuracy on test_data.
gbt_model = create_gbt_model()
run_experiment(gbt_model, train_data, test_data)

In [None]:
#Inspect the model: Model type, mask, input features, feature importance
# summary() prints the report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
gbt_model.summary()

In [None]:
# Create an inspector for the trained model and list its public attributes
# (every name that does not start with an underscore).
inspector = gbt_model.make_inspector()
[field for field in dir(inspector) if not field.startswith("_")]

In [None]:
# Plot the first tree of the model (tree_idx=0), limited to max_depth=3, using
# TF-DF's Colab plotter.
tfdf.model_plotter.plot_model_in_colab(gbt_model, tree_idx=0, max_depth=3)

In [None]:
# Display the variable importances reported by the inspector.
inspector.variable_importances()

In [None]:
# Print a short overview of the trained model as reported by the inspector.
print("Model type:", inspector.model_type())
print("Number of trees:", inspector.num_trees())
print("Objective:", inspector.objective())
print("Input features:", inspector.features())

In [None]:
# Display the model's input features as the cell result (the same call is also
# printed in the overview cell).
inspector.features()

In [45]:
# Save the trained model under the notebook's temporary directory rather than a
# hard-coded, user-specific absolute path, which only exists on one machine.
save_path = os.path.join(tmpdir, "TF_Model")
gbt_model.save(save_path)



INFO:tensorflow:Assets written to: /Users/tdubon/assets


INFO:tensorflow:Assets written to: /Users/tdubon/assets


# Creating HF Space

In [29]:
# NOTE(review): push_to_hub_keras is already imported near the top of the
# notebook, and KerasModelHubMixin is not used in any visible cell.
from huggingface_hub import KerasModelHubMixin
from huggingface_hub.keras_mixin import push_to_hub_keras
# Push the trained model to the Hub repository given by repo_url; the recorded
# output shows the repo being cloned locally and the model files uploaded.
push_to_hub_keras(gbt_model, repo_url="https://huggingface.co/keras-io/TF_Decision_Trees")

Cloning https://huggingface.co/keras-io/TF_Decision_Trees into local empty directory.


INFO:tensorflow:Assets written to: TF_Decision_Trees/assets


INFO:tensorflow:Assets written to: TF_Decision_Trees/assets


Upload file saved_model.pb:   1%|          | 3.39k/532k [00:00<?, ?B/s]

Upload file assets/gradient_boosted_trees_header.pb:  36%|###5      | 3.39k/9.45k [00:00<?, ?B/s]

Upload file assets/data_spec.pb:  25%|##4       | 3.39k/13.8k [00:00<?, ?B/s]

Upload file assets/header.pb: 100%|##########| 1.98k/1.98k [00:00<?, ?B/s]

Upload file keras_metadata.pb:  22%|##1       | 3.39k/15.4k [00:00<?, ?B/s]

To https://huggingface.co/keras-io/TF_Decision_Trees
   2f13260..9b5e7cb  main -> main

   2f13260..9b5e7cb  main -> main



'https://huggingface.co/keras-io/TF_Decision_Trees/commit/9b5e7cbcae57c3091b6c498aceb9e59eb47ec724'

In [None]:
#Clone and configure
!git clone https://tdubon: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@huggingface.co/tdubon/TF_Decision_Trees

!cd TFClassificationForest
!git config --global user.email "tdubon6@gmail.com"
# Tip: using the same email than for your huggingface.co account will link your commits to your profile
!git config --global user.name "tdubon"

In [None]:
# Stage everything in the current working directory, commit it, and push.
# NOTE(review): these commands act on whatever directory the kernel is in when
# the cell runs — confirm it is the cloned repository.
!git add .
!git commit -m "Initial commit"
!git push

In [None]:
# Save the trained model with tf.keras.models.save_model, overwriting any
# existing save at the target path.
# NOTE(review): the target is a hard-coded, user-specific absolute path, and the
# keyword arguments appear to match the function's documented defaults — confirm.
tf.keras.models.save_model(
    gbt_model, "/Users/tdubon/TFClassificationForest", overwrite=True, include_optimizer=True, save_format=None,
    signatures=None, options=None, save_traces=True)

In [None]:
# Export the model's inspector data as TensorBoard logs under /tmp/tb_logs/model_1.
gbt_model.make_inspector().export_to_tensorboard("/tmp/tb_logs/model_1")

# Load the TensorBoard notebook extension and open it on the parent log directory.
%load_ext tensorboard
%tensorboard --logdir "/tmp/tb_logs"