In [1]:
# Notebook-wide settings. The next cell copies them into os.environ so that
# shell cells can reference them as ${PROJECT}, ${BUCKET}, ${REGION}, ${TFVERSION}.
PROJECT = "qwiklabs-gcp-ml-49b827b781ab"  # Replace with your PROJECT
BUCKET = "qwiklabs-gcp-ml-49b827b781ab"  # Replace with your BUCKET (name only; the shell cell adds the gs:// prefix)
REGION = "us-central1"            # Choose an available region for Cloud MLE
TFVERSION = "1.14"                # TF version for CMLE to use
# NOTE(review): the saved output of the import cell shows the local TensorFlow
# as 1.15.0 — confirm that "1.14" is the intended version here.

In [2]:
import os

# Mirror the notebook settings into the process environment so that shell
# cells (%%bash and `!` commands) can read them as ${BUCKET}, ${PROJECT}, etc.
os.environ.update(
    BUCKET = BUCKET,
    PROJECT = PROJECT,
    REGION = REGION,
    TFVERSION = TFVERSION,
)

In [3]:
%%bash
# Create the bucket in ${REGION} unless `gsutil ls` already lists it.
gsutil ls | grep -q gs://${BUCKET}/ || gsutil mb -l ${REGION} gs://${BUCKET}

In [4]:
# Copy the small sample CSV files from the public bucket into the working
# directory, then list them to confirm the download.
# NOTE(review): these are taxifare files (taxi-train/valid/test.csv), whereas the
# cells below declare babyweight columns and read "train.csv" / "eval.csv" —
# confirm that this is the intended data source for this notebook.
!gsutil cp gs://cloud-training-demos/taxifare/small/*.csv .
!ls -l *.csv

Copying gs://cloud-training-demos/taxifare/small/taxi-test.csv...
Copying gs://cloud-training-demos/taxifare/small/taxi-train.csv...              
Copying gs://cloud-training-demos/taxifare/small/taxi-valid.csv...              
- [3 files][ 10.9 MiB/ 10.9 MiB]                                                
Operation completed over 3 objects/10.9 MiB.                                     
-rw-r--r-- 1 jupyter jupyter 1799474 Dec  6 11:48 taxi-test.csv
-rw-r--r-- 1 jupyter jupyter 7986353 Dec  6 11:48 taxi-train.csv
-rw-r--r-- 1 jupyter jupyter 1673742 Dec  6 11:48 taxi-valid.csv


# Create a TensorFlow model using TensorFlow's Estimator API

In [6]:
import shutil
import numpy as np
import tensorflow as tf
# Show which TensorFlow version the kernel is running; later cells call
# tf.placeholder and tf.estimator directly.
print(tf.__version__)

1.15.0


In [7]:
# Names given, in order, to the fields of every CSV line
CSV_COLUMNS = [
    "weight_pounds",
    "is_male",
    "mother_age",
    "plurality",
    "gestation_weeks",
]

# Column the model is trained to predict
LABEL_COLUMN = "weight_pounds"

# Record defaults passed to the CSV decoder, one entry per CSV column
DEFAULTS = [
    [0.0],     # weight_pounds
    ["null"],  # is_male
    [0.0],     # mother_age
    ["null"],  # plurality
    [0.0],     # gestation_weeks
]

# Value used as max_steps when training
TRAIN_STEPS = 1000

# Create the input function

In [8]:
def read_dataset(filename_pattern, mode, batch_size = 512):
    """Build an input function that reads batches of examples from CSV files.

    Args:
        filename_pattern: Glob pattern selecting the CSV files to read.
        mode: A tf.estimator.ModeKeys value. In TRAIN mode the data is
            shuffled and repeated indefinitely; in any other mode it is read
            exactly once.
        batch_size: Number of examples per batch.

    Returns:
        A zero-argument function that returns a tf.data.Dataset of
        (features, label) batches. features is a dict keyed by the names in
        CSV_COLUMNS other than LABEL_COLUMN; label is the LABEL_COLUMN value.

    Raises:
        ValueError: Raised by the returned function (not by read_dataset
            itself) when no file matches filename_pattern.
    """
    def _input_fn():
        def decode_csv(value_column):
            # Parse CSV text into one tensor per column, using DEFAULTS as
            # the per-column record defaults
            columns = tf.io.decode_csv(records = value_column, record_defaults = DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))
            label = features.pop(LABEL_COLUMN)
            return features, label

        # Create list of files that match pattern
        file_list = tf.io.gfile.glob(filename_pattern)
        if not file_list:
            # Fail with a clear message instead of handing an empty file
            # list on to TextLineDataset
            raise ValueError("No files found matching pattern: {!r}".format(filename_pattern))

        # Create dataset from file list
        dataset = (tf.data.TextLineDataset(filenames = file_list)  # Read text file
                     .map(map_func = decode_csv))  # Transform each elem by applying decode_csv fn

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # indefinitely
            dataset = dataset.shuffle(buffer_size = 10 * batch_size)
        else:
            num_epochs = 1 # end-of-input after this

        dataset = dataset.repeat(count = num_epochs).batch(batch_size = batch_size)
        return dataset
    return _input_fn

# Create the feature columns

In [10]:
def get_wide_deep():
    """Build the feature columns for the wide-and-deep model.

    Returns:
        A (wide, deep) tuple of lists. wide holds the two vocabulary columns
        and the two bucketized columns; deep holds the two numeric columns
        and an embedding of the cross of all wide columns.
    """
    # Categorical inputs with fixed vocabularies
    is_male = tf.feature_column.categorical_column_with_vocabulary_list(
        key = "is_male",
        vocabulary_list = ["True", "False", "Unknown"])
    plurality = tf.feature_column.categorical_column_with_vocabulary_list(
        key = "plurality",
        vocabulary_list = [
            "Single(1)", "Twins(2)", "Triplets(3)",
            "Quadruplets(4)", "Quintuplets(5)", "Multiple(2+)"])

    # Numeric inputs
    mother_age = tf.feature_column.numeric_column(key = "mother_age")
    gestation_weeks = tf.feature_column.numeric_column(key = "gestation_weeks")

    # Bucketized versions of the numeric inputs, with one boundary per
    # integer: 15..44 for mother_age and 17..46 for gestation_weeks
    age_buckets = tf.feature_column.bucketized_column(
        source_column = mother_age,
        boundaries = list(range(15, 45)))
    gestation_buckets = tf.feature_column.bucketized_column(
        source_column = gestation_weeks,
        boundaries = list(range(17, 47)))

    # Wide part of the model: the categorical and bucketized columns
    wide = [is_male, plurality, age_buckets, gestation_buckets]

    # Cross every wide column into 20000 hash buckets, then embed the cross
    # in 3 dimensions
    crossed = tf.feature_column.crossed_column(keys = wide, hash_bucket_size = 20000)
    embedded_cross = tf.feature_column.embedding_column(categorical_column = crossed, dimension = 3)

    # Deep part of the model: the raw numeric columns plus the embedded cross
    deep = [mother_age, gestation_weeks, embedded_cross]

    return wide, deep

# Create the Serving Input function

In [11]:
def serving_input_fn():
    """Describe the inputs that the exported model accepts at serving time.

    One 1-D placeholder of unspecified length is created per model input.
    The features handed to the model are those placeholders with a trailing
    axis added by tf.expand_dims.

    Returns:
        A tf.estimator.export.ServingInputReceiver whose receiver tensors are
        the raw placeholders and whose features are the expanded tensors.
    """
    # (input name, dtype) for every model input, in creation order
    input_dtypes = [
        ("is_male", tf.string),
        ("mother_age", tf.float32),
        ("plurality", tf.string),
        ("gestation_weeks", tf.float32),
    ]

    receiver_tensors = {}
    for name, dtype in input_dtypes:
        receiver_tensors[name] = tf.placeholder(dtype = dtype, shape = [None])

    # Append a trailing axis to each placeholder
    features = {}
    for name, placeholder in receiver_tensors.items():
        features[name] = tf.expand_dims(input = placeholder, axis = -1)

    return tf.estimator.export.ServingInputReceiver(
        features = features,
        receiver_tensors = receiver_tensors)

# Create the model and run training and evaluation

In [12]:
def train_and_evaluate(output_dir):
    """Train and evaluate the wide-and-deep regressor, exporting it for serving.

    Training reads "train.csv" and runs for at most TRAIN_STEPS steps;
    evaluation reads "eval.csv". Both paths are relative to the working
    directory. The exporter named "exporter" uses serving_input_fn as its
    serving input receiver.

    Args:
        output_dir: Directory passed to the estimator as its model_dir.
    """
    # A single interval, in seconds, used both for saving checkpoints and
    # for throttling evaluation
    interval_secs = 300

    linear_columns, dnn_columns = get_wide_deep()

    model = tf.estimator.DNNLinearCombinedRegressor(
        model_dir = output_dir,
        linear_feature_columns = linear_columns,
        dnn_feature_columns = dnn_columns,
        dnn_hidden_units = [64, 32],
        config = tf.estimator.RunConfig(
            save_checkpoints_secs = interval_secs,
            keep_checkpoint_max = 3))

    training = tf.estimator.TrainSpec(
        input_fn = read_dataset("train.csv", mode = tf.estimator.ModeKeys.TRAIN),
        max_steps = TRAIN_STEPS)

    evaluation = tf.estimator.EvalSpec(
        input_fn = read_dataset("eval.csv", mode = tf.estimator.ModeKeys.EVAL),
        steps = None,
        start_delay_secs = 60,  # wait this many seconds before the first evaluation
        throttle_secs = interval_secs,  # leave at least this many seconds between evaluations
        exporters = tf.estimator.LatestExporter(
            name = "exporter",
            serving_input_receiver_fn = serving_input_fn))

    tf.estimator.train_and_evaluate(model, training, evaluation)

In [13]:
# Run the model
# The directory removed here is the same one passed to train_and_evaluate as
# output_dir; ignore_errors = True lets the removal pass when it does not exist yet.
shutil.rmtree(path = "babyweight_trained_wd", ignore_errors = True) # start fresh each time
train_and_evaluate("babyweight_trained_wd")

INFO:tensorflow:Using config: {'_experimental_distribute': None, '_save_checkpoints_steps': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f22ac9b1940>, '_tf_random_seed': None, '_model_dir': 'babyweight_trained_wd', '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_task_id': 0, '_save_checkpoints_secs': 300, '_protocol': None, '_session_creation_timeout_secs': 7200, '_global_id_in_cluster': 0, '_num_ps_replicas': 0, '_evaluation_master': '', '_master': '', '_log_step_count_steps': 100, '_num_worker_replicas': 1, '_service': None, '_save_summary_steps': 100, '_device_fn': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_experimental_max_worker_delay_secs': None, '_train_distribute': None, '_eval_distribute': None, '_task_type': 'worker'}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and ev