# Titanic and boosted trees

Following https://www.tensorflow.org/tutorials/estimator/boosted_trees. Will then try to do it differently

In [1]:
import numpy as np
import pandas as pd

import IPython.display as ipyd

import matplotlib as mpl
import matplotlib.pyplot as pp
import matplotlib.cm as cm

import typing as tp

import tensorflow as tf
tf.random.set_seed(42)

## Load dataset and define schema (feature columns)

In [2]:
TRAIN_DF = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
EVAL_DF = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')

total_count = len(TRAIN_DF) + len(EVAL_DF)

print(f'Train size = {len(TRAIN_DF)} ({100*len(TRAIN_DF)/total_count:.1f}%) of the full number of records')

LABEL_NAME = 'survived'
TRAIN_LABEL = TRAIN_DF.pop(LABEL_NAME)
#
EVAL_LABEL = EVAL_DF.pop(LABEL_NAME)

Train size = 627 (70.4%) of the full number of records


In [3]:
TRAIN_DF.head(3)

Unnamed: 0,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,female,26.0,0,0,7.925,Third,unknown,Southampton,y


In [4]:
CATEGORICAL_COLUMNS = [
    'sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone'
]
#
NUMERIC_COLUMNS = ['age', 'fare']

#######

def one_hot_cat_column(
    feature_name: str, 
    vocab_list: tp.List[tp.Union[str, int]]
)->tf.feature_column.indicator_column:
    """
    Given the name of the categorical feature and its vocabulary use one-hot
    encoding to convert this feature to indicator_column
    """
    
    # basic wrapper for one-hot encoded column
    cat_feat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature_name,
        vocabulary_list=vocab_list
    )
    
    # multi-hot representation. not quite sure why this extra layer is needed
    return tf.feature_column.indicator_column(cat_feat_col)

#######

FEATURE_COLUMNS = [] # list of feature columns, essentially schema

# add categorical columns
for fn in CATEGORICAL_COLUMNS:
    cur_vocab = TRAIN_DF[fn].unique()
    FEATURE_COLUMNS.append(one_hot_cat_column(fn, cur_vocab))
    
# add numeric columns
for fn in NUMERIC_COLUMNS:
    FEATURE_COLUMNS.append(tf.feature_column.numeric_column(fn, dtype=tf.float32))

## Data ingestion

Suitable for small datasets.

In [5]:
NUM_EXAMPLES = len(TRAIN_LABEL)

#######

def make_input_fn(
    features_df:  pd.DataFrame,
    labels_srs:   pd.Series,
    epoch_count:  tp.Optional[int]=None,
    shuffle:      tp.Optional[bool]=True,
    num_examples: tp.Optional[int]=NUM_EXAMPLES
    #
)->tp.Callable[[], tf.data.Dataset]:
    ###
    def input_fn()->tf.data.Dataset:
        # dict(TRAIN_DF) essentially converts dataframe into {column_name: column-value-list} dictonary
        # createing dataset like this means that the basic records of the dataset will 
        # be tuples of: (<feature values>, <label_value>)
        dataset = tf.data.Dataset.from_tensor_slices( (dict(TRAIN_DF), TRAIN_LABEL) )
        
        if shuffle:
            dataset = dataset.shuffle(buffer_size=num_examples)
            
        # setting epochs to none will keep to indefinte cycling
        # setting to 1 will lead to going over data once
        # former is better for training, latter for eval/test
        dataset = dataset.repeat(count=epoch_count)
            
        # batch data to have multiple rows included as one
        # in the dataset
        dataset = dataset.batch(num_examples)
        
        return dataset
        
    ###
    
    return input_fn
    
# prepare training and eval data functions
TRAIN_INPUT_FN = make_input_fn(TRAIN_DF, TRAIN_LABEL)
EVAL_INPUT_FN = make_input_fn(EVAL_DF, EVAL_LABEL, shuffle=False, epoch_count=1)

## Linear classifier

## Boosted trees classifier

In [6]:
N_BATCHES = 1 
BT_EST = tf.estimator.BoostedTreesClassifier(FEATURE_COLUMNS, n_batches_per_layer=N_BATCHES)

# train
BT_EST.train(TRAIN_INPUT_FN, max_steps=100)

# evaluate result
BT_RESULT = BT_EST.evaluate(EVAL_INPUT_FN)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpx4hmmmt4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs in

In [21]:
print(BT_RESULT)

{'accuracy': 0.93939394, 'accuracy_baseline': 0.6124402, 'auc': 0.97687864, 'auc_precision_recall': 0.9722654, 'average_loss': 0.21375903, 'label/mean': 0.3875598, 'loss': 0.21375903, 'precision': 0.95555556, 'prediction/mean': 0.38759562, 'recall': 0.8847737, 'global_step': 100}


### Boosted Trees Algorithm

https://arxiv.org/abs/1710.11555
https://medium.com/tensorflow/how-to-train-boosted-trees-models-in-tensorflow-ca8466a53127
