In [1]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras, feature_column
from tensorflow.keras import layers

# Helper libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Make float64 the default floating-point type for Keras layers.
tf.keras.backend.set_floatx('float64')
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.0.0


In [2]:
# Load the feature table and the label column; both files are indexed by building_id.
X = pd.read_csv("../data/train_values.csv", index_col = 'building_id')
y = pd.read_csv("../data/train_labels.csv", index_col = 'building_id').pop('damage_grade')

# train_test_split pairs features and labels by row position, so the two
# files must list the buildings in the same order. Fail loudly otherwise
# instead of silently training on mismatched labels.
assert X.index.equals(y.index), "train_values.csv and train_labels.csv are not row-aligned"

# Adjust percentage value to range [0, 1]
for header in ["area_percentage", "height_percentage"]:
    X[header] = X[header] / 100.
# Adjust label to range [0, 2]
y -= 1

# Fixed seed so the train/test split (and therefore the reported metrics)
# is reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)

In [3]:
# Report the number of feature rows and labels in each split; the two counts
# on a line should always be equal.
for split_name, split_X, split_y in (('training', X_train, y_train),
                                     ('test', X_test, y_test)):
    print(len(split_X), len(split_y), split_name + ' examples')

208480 208480 training examples
52121 52121 test examples


In [138]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, labels, shuffle=True, batch_size=32):
    """Wrap a pandas DataFrame and its labels in a batched ``tf.data.Dataset``.

    Args:
        dataframe: Feature table. It is converted with ``dict(dataframe)``,
            so every column becomes one named entry of the feature dict.
        labels: Targets, sliced element-by-element together with the features.
        shuffle: When true, shuffle the examples using a buffer as large as
            ``len(dataframe)`` before batching.
        batch_size: Number of examples per emitted batch.

    Returns:
        A dataset yielding ``(feature_dict, labels)`` batches.
    """
    columns = dict(dataframe)
    dataset = tf.data.Dataset.from_tensor_slices((columns, labels))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(dataframe)).batch(batch_size)

In [139]:
# Build a small, unshuffled dataset purely for inspection. `train_ds` was not
# defined anywhere else in the notebook, so this cell previously failed on a
# fresh kernel. Shuffling is skipped and the batch is kept tiny because we
# only want to peek at a single batch.
train_ds = df_to_dataset(X_train, y_train, shuffle=False, batch_size=5)

for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  # The label now names the column that is actually printed.
  print('A batch of geo_level_3_id:', feature_batch['geo_level_3_id'])
  print('A batch of targets:', label_batch )

Every feature: ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'legal_ownership_status', 'count_families', 'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel', 'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school', 'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police

In [140]:
# Columns fed to the network as plain floating-point values.
NUMERIC_HEADERS = ["area_percentage", "height_percentage"]

# Integer-valued columns as (column name, max_value) pairs. Each one gets
# max_value + 2 identity buckets, with the last bucket (max_value + 1) used
# as the default value.
IDENTITY_EMBEDDED_HEADERS = [
    ("age", 995),
    ("count_floors_pre_eq", 9),
    ("count_families", 9),
    ("geo_level_1_id", 30),
    ("geo_level_2_id", 1427),
    ("geo_level_3_id", 12567),
]

# 0/1 flag columns, one-hot encoded over two buckets.
BINARY_HEADERS = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

# Categorical columns that are hashed into buckets and then embedded.
HASHED_EMBEDDED_HEADERS = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    'legal_ownership_status',
]

# Assemble the list group by group; the order of the groups and of the
# columns inside each group is kept as listed above.
feature_columns = [feature_column.numeric_column(name) for name in NUMERIC_HEADERS]

# numeric embedding cols
for name, largest in IDENTITY_EMBEDDED_HEADERS:
    identity_ids = feature_column.categorical_column_with_identity(
        name, num_buckets=largest + 2, default_value=largest + 1)
    feature_columns.append(
        feature_column.embedding_column(identity_ids, dimension=4))

# binary cols
feature_columns.extend(
    feature_column.indicator_column(
        feature_column.categorical_column_with_identity(
            name, num_buckets=2, default_value=0))
    for name in BINARY_HEADERS)

# categorical embedding cols
for name in HASHED_EMBEDDED_HEADERS:
    hashed_ids = feature_column.categorical_column_with_hash_bucket(
        name, hash_bucket_size=100)
    feature_columns.append(
        feature_column.embedding_column(hashed_ids, dimension=4))

In [141]:
# Display the assembled feature column list for a visual check.
feature_columns

[NumericColumn(key='area_percentage', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='height_percentage', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='age', number_buckets=997, default_value=996), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1a92594650>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='count_floors_pre_eq', number_buckets=11, default_value=10), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1a9256e310>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='count_families', number_buckets=11, default_value=10), dimension=4, combiner='m

In [161]:
# Options for the canned deep classifier: five hidden layers, the Adam
# optimizer, and three output classes.
classifier_options = dict(
    feature_columns=feature_columns,
    hidden_units=[1024, 512, 512, 256, 256],
    optimizer="Adam",
    n_classes=3,
)
estimator = tf.estimator.DNNClassifier(**classifier_options)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/xx/j9szjz0n3bn5nhwq1ms96lgjqp577y/T/tmp1gcey8mg', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a8e4c1cd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [162]:
def train_input_fn():
    """Return the shuffled training dataset in batches of 32."""
    return df_to_dataset(X_train, y_train, shuffle=True, batch_size=32)

# Run 900 training steps; the call stays last so the cell still shows the
# returned estimator.
estimator.train(input_fn=train_input_fn, steps=900)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/xx/j9szjz0n3bn5nhwq1ms96lgjqp577y/T/tmp1gcey8mg/model.ckpt.
INFO:tensorflow:loss = 1.0641019, step = 0
INFO:tensorflow:global_step/sec: 9.42335
INFO:tensorflow:loss = 0.8811115, step = 100 (10.613 sec)
INFO:tensorflow:global_step/sec: 19.9382
INFO:tensorflow:loss = 0.7375692, step = 200 (5.015 sec)
INFO:tensorflow:global_step/sec: 19.7179
INFO:tensorflow:loss = 0.8317565, step = 300 (5.072 sec)
INFO:

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x1a8e4c1390>

In [163]:
def eval_input_fn():
    """Return the held-out test dataset, unshuffled, in batches of 32."""
    return df_to_dataset(X_test, y_test, shuffle=False, batch_size=32)

# Evaluate the trained estimator on the test split and keep the metrics dict.
metrics = estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-01T16:17:23Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xx/j9szjz0n3bn5nhwq1ms96lgjqp577y/T/tmp1gcey8mg/model.ckpt-900
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-01-16:17:58
INFO:tensorflow:Saving dict for global step 900: accuracy = 0.6734521593983231, average_loss = 0.7105239680559127, global_step = 900, loss = 0.71051645
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 900: /var/folders/xx/j9szjz0n3bn5nhwq1ms96lgjqp577y/

In [164]:
# Show the metrics dict returned by estimator.evaluate on the test split.
metrics

{'accuracy': 0.6734521593983231,
 'average_loss': 0.7105239680559127,
 'loss': 0.71051645,
 'global_step': 900}