In [1]:
# Google Cloud settings shared by every cell in this notebook.
PROJECT = "qwiklabs-gcp-ml-49b827b781ab"  # Replace with your GCP project ID
BUCKET = "qwiklabs-gcp-ml-49b827b781ab"   # Replace with your GCS bucket name
REGION = "us-central1"   # Region used for the bucket and AI Platform jobs
TFVERSION = "1.14"       # TensorFlow runtime version for AI Platform jobs

In [2]:
import os
# Export the settings as environment variables so the %%bash cells can read
# them as $BUCKET, $PROJECT, $REGION and $TFVERSION.
os.environ.update(
    BUCKET=BUCKET,
    PROJECT=PROJECT,
    REGION=REGION,
    TFVERSION=TFVERSION,
)

In [3]:
%%bash
# Point the gcloud CLI at the project and region exported from Python above.
gcloud config set project ${PROJECT}
gcloud config set compute/region ${REGION}

Updated property [core/project].
Updated property [compute/region].


In [4]:
%%bash
# Make sure the preprocessed babyweight data exists in our bucket.
# Listing only the preproc/ prefix avoids walking every object in the bucket
# and cannot be fooled by sibling prefixes that merely start with "preproc".
if ! gsutil ls "gs://${BUCKET}/babyweight/preproc/" > /dev/null 2>&1; then
    # Create the bucket only when it does not exist yet (mb errors otherwise).
    if ! gsutil ls -b "gs://${BUCKET}" > /dev/null 2>&1; then
        gsutil mb -l "${REGION}" "gs://${BUCKET}"
    fi
    # copy canonical set of preprocessed files if you didn't do previous notebook
    gsutil -m cp -R gs://cloud-training-demos/babyweight "gs://${BUCKET}"
fi

In [5]:
%%bash
# List the first shard of each preprocessed split to confirm the data is there.
gsutil ls gs://$BUCKET/babyweight/preproc/*-00000*

gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/preproc/eval.csv-00000-of-00009
gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/preproc/train.csv-00000-of-00067


# Train on Cloud AI Platform

In [7]:
%%bash
# Create the trainer package directory if it is missing (touch alone fails on a
# fresh checkout), then add the __init__.py that marks it as a Python package.
mkdir -p babyweight/trainer
touch babyweight/trainer/__init__.py

In [8]:
%%writefile babyweight/trainer/task.py
import argparse
import json
import os

from . import model

import tensorflow as tf


if __name__ == "__main__":
    # Command-line entry point of the trainer package: parse the flags, copy
    # them onto the `model` module's globals, then start train_and_evaluate.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bucket",
        help="GCS path to data. We assume that data is in \
        gs://BUCKET/babyweight/preproc/",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        required=True
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=512
    )
    parser.add_argument(
        "--job-dir",
        help="this model ignores this field, but it is required by gcloud",
        default="junk"
    )
    parser.add_argument(
        "--nnsize",
        help="Hidden layer sizes to use for DNN feature columns -- provide \
        space-separated layers",
        nargs="+",
        type=int,
        default=[128, 32, 4]
    )
    parser.add_argument(
        "--nembeds",
        help="Embedding size of a cross of n key real-valued parameters",
        type=int,
        default=3
    )
    parser.add_argument(
        "--train_examples",
        help="Number of examples (in thousands) to run the training job over. \
        If this is more than actual # of examples available, it cycles \
        through them. So specifying 1000 here when you have only 100k \
        examples makes this 10 epochs.",
        type=int,
        default=5000
    )
    parser.add_argument(
        "--pattern",
        help="Specify a pattern that has to be in input files. \
        For example 00001-of \
        will process only one shard",
        default="of"
    )
    parser.add_argument(
        "--eval_steps",
        help="Positive number of steps for which to evaluate model. \
        Default to None, which means to evaluate until \
        input_fn raises an end-of-input exception",
        type=int,
        default=None
    )

    # Parse arguments
    args = parser.parse_args()
    arguments = args.__dict__

    # Drop the flag that only exists to satisfy gcloud. argparse stores
    # "--job-dir" under the dest "job_dir" (hyphens become underscores), so
    # that is the key that has to be popped.
    arguments.pop("job_dir", None)

    # Assign the arguments to the model variables
    output_dir = arguments.pop("output_dir")
    model.BUCKET = arguments.pop("bucket")
    model.BATCH_SIZE = arguments.pop("batch_size")
    # --train_examples is given in thousands. Floor division keeps the step
    # count an integer on Python 3 as well as Python 2.
    model.TRAIN_STEPS = (
        arguments.pop("train_examples") * 1000) // model.BATCH_SIZE
    model.EVAL_STEPS = arguments.pop("eval_steps")
    print("Will train for {} steps using batch_size={}".format(
        model.TRAIN_STEPS, model.BATCH_SIZE))
    model.PATTERN = arguments.pop("pattern")
    model.NEMBEDS = arguments.pop("nembeds")
    model.NNSIZE = arguments.pop("nnsize")
    print("Will use DNN size of {}".format(model.NNSIZE))

    # Append trial_id to path if we are doing hptuning
    # This code can be removed if you are not using hyperparameter tuning
    # (without a trial id in TF_CONFIG this appends "" and leaves the path
    # with just a trailing separator).
    output_dir = os.path.join(
        output_dir,
        json.loads(
            os.environ.get("TF_CONFIG", "{}")
        ).get("task", {}).get("trial", "")
    )

    # Run the training job
    model.train_and_evaluate(output_dir)

Writing babyweight/trainer/task.py


In [9]:
%%writefile babyweight/trainer/model.py
import shutil

import numpy as np
import tensorflow as tf

# Set TensorFlow's logging threshold to INFO.
tf.logging.set_verbosity(tf.logging.INFO)

# --- Input location (overridden by task.py before training starts) ---------
BUCKET = None   # GCS bucket holding babyweight/preproc/
PATTERN = "of"  # substring an input shard's name must contain

# --- CSV schema --------------------------------------------------------------
# Column order of the preprocessed CSV files; the first column is the label.
CSV_COLUMNS = [
    "weight_pounds", "is_male", "mother_age", "plurality", "gestation_weeks",
]
LABEL_COLUMN = CSV_COLUMNS[0]
# One default per CSV column, in the same order as CSV_COLUMNS.
DEFAULTS = [
    [0.0],     # weight_pounds
    ["null"],  # is_male
    [0.0],     # mother_age
    ["null"],  # plurality
    [0.0],     # gestation_weeks
]

# --- Training hyperparameters (overridden by task.py) ------------------------
TRAIN_STEPS = 10000
EVAL_STEPS = None
BATCH_SIZE = 512
NEMBEDS = 3
NNSIZE = [64, 16, 4]


def read_dataset(filename_pattern, mode, batch_size):
    """Build an Estimator input function over the preprocessed CSV shards.

    Args:
        filename_pattern: File-name prefix of the shards to read
            (train_and_evaluate passes "train" and "eval").
        mode: A tf.estimator.ModeKeys value. In TRAIN mode the data is
            shuffled and repeated indefinitely; otherwise it is read once.
        batch_size: Number of rows per batch.

    Returns:
        A zero-argument function that returns a tf.data.Dataset of
        (features dict, label) batches.
    """

    def _parse_line(line):
        # Split one CSV line into named columns and pull the label out.
        fields = tf.decode_csv(records=line, record_defaults=DEFAULTS)
        features = dict(zip(CSV_COLUMNS, fields))
        label = features.pop(LABEL_COLUMN)
        return features, label

    def _input_fn():
        # BUCKET and PATTERN are read here, at call time, so values assigned
        # by task.py after import are honoured.
        glob_expr = "gs://{}/babyweight/preproc/{}*{}*".format(
            BUCKET, filename_pattern, PATTERN)
        shards = tf.gfile.Glob(filename=glob_expr)

        rows = tf.data.TextLineDataset(filenames=shards)
        dataset = rows.map(map_func=_parse_line)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Shuffle, then repeat indefinitely (count=None).
            dataset = dataset.shuffle(buffer_size=10*batch_size)
            dataset = dataset.repeat(count=None)
        else:
            # Single pass over the data.
            dataset = dataset.repeat(count=1)

        return dataset.batch(batch_size=batch_size)

    return _input_fn


def get_wide_deep():
    """Build the feature columns for the wide-and-deep regressor.

    The embedding size of the crossed feature is taken from the module-level
    NEMBEDS, so the --nembeds flag (and hyperparameter tuning of it) takes
    effect instead of being silently ignored.

    Returns:
        A (wide, deep) tuple of feature-column lists. `wide` holds the
        categorical and bucketized columns; `deep` holds the two raw numeric
        columns plus an embedding of the cross of all wide columns.
    """

    fc_is_male = tf.feature_column.categorical_column_with_vocabulary_list(
        key="is_male",
        vocabulary_list=["True", "False", "Unknown"]
    )

    fc_plurality = tf.feature_column.categorical_column_with_vocabulary_list(
        key="plurality",
        vocabulary_list=[
            "Single(1)",
            "Twins(2)",
            "Triplets(3)",
            "Quadruplets(4)",
            "Quintuplets(5)",
            "Multiple(2+)"
        ]
    )

    fc_mother_age = tf.feature_column.numeric_column("mother_age")

    fc_gestation_weeks = tf.feature_column.numeric_column("gestation_weeks")

    # One-unit-wide buckets with boundaries 15..44 for mother_age.
    fc_age_buckets = tf.feature_column.bucketized_column(
        source_column=fc_mother_age,
        boundaries=np.arange(start=15, stop=45, step=1).tolist()
    )

    # One-unit-wide buckets with boundaries 17..46 for gestation_weeks.
    fc_gestation_buckets = tf.feature_column.bucketized_column(
        source_column=fc_gestation_weeks,
        boundaries=np.arange(start=17, stop=47, step=1).tolist()
    )

    wide = [
        fc_is_male,
        fc_plurality,
        fc_age_buckets,
        fc_gestation_buckets
    ]

    # Feature cross all the wide columns and embed into a lower dimension
    crossed = tf.feature_column.crossed_column(
        keys=wide, hash_bucket_size=20000
    )
    fc_embed = tf.feature_column.embedding_column(
        categorical_column=crossed,
        dimension=NEMBEDS  # was hard-coded to 3, which ignored --nembeds
    )

    # Continuous columns are deep, have a complex relationship with the output
    deep = [
        fc_mother_age,
        fc_gestation_weeks,
        fc_embed
    ]

    return wide, deep


def serving_input_fn():
    """Describe the inputs the exported model accepts at prediction time.

    Returns:
        A tf.estimator.export.ServingInputReceiver whose receiver tensors are
        one 1-D batch placeholder per input feature, and whose features are
        those placeholders with a trailing axis added.
    """
    # (feature name, dtype) for every input feature; the label is not an input.
    placeholder_specs = [
        ("is_male", tf.string),
        ("mother_age", tf.float32),
        ("plurality", tf.string),
        ("gestation_weeks", tf.float32),
    ]

    receiver_tensors = {}
    for name, dtype in placeholder_specs:
        receiver_tensors[name] = tf.placeholder(dtype=dtype, shape=[None])

    # Reshape each [batch] placeholder to [batch, 1].
    features = {}
    for name, tensor in receiver_tensors.items():
        features[name] = tf.expand_dims(input=tensor, axis=-1)

    return tf.estimator.export.ServingInputReceiver(
        features=features,
        receiver_tensors=receiver_tensors
    )


def my_rmse(labels, predictions):
    """Extra evaluation metric: root-mean-squared error.

    Args:
        labels: Label tensor.
        predictions: Dict of model outputs; the "predictions" entry is used.

    Returns:
        A dict mapping "rmse" to the result of
        tf.metrics.root_mean_squared_error.
    """
    rmse_metric = tf.metrics.root_mean_squared_error(
        labels=labels,
        predictions=predictions["predictions"]
    )
    return {"rmse": rmse_metric}


def train_and_evaluate(output_dir):
    """Train the wide-and-deep regressor, evaluating and exporting as it goes.

    Reads the module-level NNSIZE, BATCH_SIZE, TRAIN_STEPS and EVAL_STEPS,
    which task.py sets from the command line before calling this.

    Args:
        output_dir: Directory (local or GCS) for checkpoints, evaluation
            summaries and exported models.
    """
    wide, deep = get_wide_deep()
    # Used both as the checkpoint period and as the evaluation throttle.
    EVAL_INTERVAL = 300  # seconds

    run_config = tf.estimator.RunConfig(
        save_checkpoints_secs=EVAL_INTERVAL,
        keep_checkpoint_max=3)

    estimator = tf.estimator.DNNLinearCombinedRegressor(
        model_dir=output_dir,
        linear_feature_columns=wide,
        dnn_feature_columns=deep,
        dnn_hidden_units=NNSIZE,
        config=run_config)

    # tf.estimator.add_metrics replaces the deprecated
    # tf.contrib.estimator.add_metrics (contrib is removed in TensorFlow 2.0).
    estimator = tf.estimator.add_metrics(estimator, my_rmse)

    train_spec = tf.estimator.TrainSpec(
        input_fn=read_dataset(
            "train", tf.estimator.ModeKeys.TRAIN, BATCH_SIZE),
        max_steps=TRAIN_STEPS)

    # Exports a SavedModel under <output_dir>/export/exporter after each
    # evaluation; exports_to_keep=None disables clean-up of older exports.
    exporter = tf.estimator.LatestExporter(
        name="exporter",
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=None)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=read_dataset(
            "eval", tf.estimator.ModeKeys.EVAL, 2**15),
        steps=EVAL_STEPS,
        start_delay_secs=60,  # start evaluating after N seconds
        throttle_secs=EVAL_INTERVAL,  # evaluate every N seconds
        exporters=exporter)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

Writing babyweight/trainer/model.py


# Train locally

In [10]:
%%bash
# Smoke-test the trainer package locally: a single shard, a tiny number of
# training examples and one evaluation step, starting from a clean output dir.
echo "bucket=${BUCKET}"
rm -rf babyweight_trained
export PYTHONPATH="${PYTHONPATH}:${PWD}/babyweight"
python -m trainer.task \
    --bucket=${BUCKET} \
    --output_dir=babyweight_trained \
    --job-dir=./tmp \
    --pattern="00000-of-" \
    --train_examples=1 \
    --eval_steps=1

bucket=qwiklabs-gcp-ml-49b827b781ab
Will train for 1 steps using batch_size=512
Will use DNN size of [128, 32, 4]




INFO:tensorflow:Using config: {'_save_checkpoints_secs': 300, '_num_ps_replicas': 0, '_keep_checkpoint_max': 3, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f34ab22ccd0>, '_model_dir': 'babyweight_trained/', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_session_creation_timeout_secs': 7200, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_experimental_max_worker_delay_secs': None, '_evaluation_master': '', '_eval_distribute': None, '_train_distribute': None, '_master': ''}
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more informatio

# Making predictions

In [11]:
%%writefile inputs.json
{"is_male": "True", "mother_age": 26.0, "plurality": "Single(1)", "gestation_weeks": 39}
{"is_male": "False", "mother_age": 26.0, "plurality": "Single(1)", "gestation_weeks": 39}

Writing inputs.json


In [12]:
%%bash
# Pick the most recent export (directories are named by timestamp, so the last
# one in sorted order is the newest) and run a local prediction against it.
# Uses the `ai-platform` command group, consistent with the job-submission
# cells in this notebook, instead of the legacy `ml-engine` alias.
MODEL_LOCATION=$(ls -d "$(pwd)"/babyweight_trained/export/exporter/* | tail -1)
echo "$MODEL_LOCATION"
gcloud ai-platform local predict --model-dir="$MODEL_LOCATION" --json-instances=inputs.json

/home/jupyter/20191205/07_AI_platform_training_babyweight/babyweight_trained/export/exporter/1575637556
PREDICTIONS
[4.892280101776123]
[4.831698894500732]


If the signature defined in the model is not serving_default then you must specify it via --signature-name flag, otherwise the command may fail.






2019-12-06 13:06:30.070692: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2019-12-06 13:06:30.079043: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz
2019-12-06 13:06:30.079567: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55afd978a2c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2019-12-06 13:06:30.079597: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2019-12-06 13:06:30.080404: I tensorflow/core/common_runtime/process_util.cc:115] Creating 

# Training on the Cloud with CAIP

In [13]:
%%bash
# Submit a full training job to AI Platform.
# The job name carries a UTC timestamp, and the output directory is wiped
# first so results from an earlier run do not mix with this one.
JOBNAME="babyweight_$(date -u +%y%m%d_%H%M%S)"
OUTDIR="gs://${BUCKET}/babyweight/trained_model"
echo ${OUTDIR} ${REGION} ${JOBNAME}
gsutil -m rm -rf ${OUTDIR}

# Flags before the bare "--" configure gcloud; flags after it go to trainer.task.
gcloud ai-platform jobs submit training ${JOBNAME} \
    --region=${REGION} \
    --module-name=trainer.task \
    --package-path=$(pwd)/babyweight/trainer \
    --job-dir=${OUTDIR} \
    --staging-bucket=gs://${BUCKET} \
    --scale-tier=STANDARD_1 \
    --runtime-version=${TFVERSION} \
    -- \
    --bucket=${BUCKET} \
    --output_dir=${OUTDIR} \
    --train_examples=200000

gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model us-central1 babyweight_191206_130706
jobId: babyweight_191206_130706
state: QUEUED


Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/#1574923694060219...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/checkpoint#1574924504643159...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/eval/#1574924471749295...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/eval/events.out.tfevents.1574924471.cmle-training-master-ed4b0a6134-0-nthgn#1574924818084924...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/events.out.tfevents.1574923694.cmle-training-worker-ed4b0a6134-0-gp66z#1574924423288836...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/events.out.tfevents.1574923714.cmle-training-master-ed4b0a6134-0-nthgn#1574924506144736...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/export/#1574924474546511...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/trained_model/export/exporter/#1574924474914585...
Removing gs://qwiklabs-gcp-ml-49b82

# Hyperparameter tuning

In [14]:
%%writefile hyperparam.yaml
# AI Platform hyperparameter-tuning configuration for the babyweight trainer.
trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    # Minimize the "rmse" evaluation metric.
    hyperparameterMetricTag: rmse
    goal: MINIMIZE
    maxTrials: 20
    maxParallelTrials: 5
    enableTrialEarlyStopping: True
    # Each parameterName is passed to trainer.task as a --<parameterName> flag.
    params:
      - parameterName: batch_size
        type: INTEGER
        minValue: 8
        maxValue: 512
        scaleType: UNIT_LOG_SCALE
      - parameterName: nembeds
        type: INTEGER
        minValue: 3
        maxValue: 30
        scaleType: UNIT_LINEAR_SCALE
      - parameterName: nnsize
        type: INTEGER
        minValue: 64
        maxValue: 512
        scaleType: UNIT_LOG_SCALE

Writing hyperparam.yaml


In [15]:
%%bash
# Submit the hyperparameter-tuning job described by hyperparam.yaml.
# The job name carries a UTC timestamp, and the output directory is wiped
# first so trials from an earlier run do not mix with this one.
JOBNAME="babyweight_$(date -u +%y%m%d_%H%M%S)"
OUTDIR="gs://${BUCKET}/babyweight/hyperparam"
echo ${OUTDIR} ${REGION} ${JOBNAME}
gsutil -m rm -rf ${OUTDIR}

# Flags before the bare "--" configure gcloud; flags after it go to trainer.task.
gcloud ai-platform jobs submit training ${JOBNAME} \
    --region=${REGION} \
    --module-name=trainer.task \
    --package-path=$(pwd)/babyweight/trainer \
    --job-dir=${OUTDIR} \
    --staging-bucket=gs://${BUCKET} \
    --scale-tier=STANDARD_1 \
    --config=hyperparam.yaml \
    --runtime-version=${TFVERSION} \
    -- \
    --bucket=${BUCKET} \
    --output_dir=${OUTDIR} \
    --eval_steps=10 \
    --train_examples=20000

gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam us-central1 babyweight_191206_130758
jobId: babyweight_191206_130758
state: QUEUED


Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/#1574919254300894...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/#1574919263763836...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/checkpoint#1574920826422061...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/eval/#1574919996511080...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/eval/events.out.tfevents.1574919996.cmle-training-master-6963d62349-0-qk8hr#1574920865930950...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/events.out.tfevents.1574919263.cmle-training-worker-6963d62349-0-gk6kc#1574920820934772...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/events.out.tfevents.1574919370.cmle-training-master-6963d62349-0-qk8hr#1574920828998454...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperparam/1/export/#1574920000370750...
Removing gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/hyperpa

# Repeat training
### 위에서 하이퍼파라미터 튜닝을 수행한 후, 가장 좋은 결과를 낸 하이퍼파라미터 값으로 아래와 같이 다시 training을 한다.

In [None]:
%%bash
# Retrain with explicit hyperparameter values passed on the command line.
# The job name carries a UTC timestamp, and the output directory is wiped
# first so results from an earlier run do not mix with this one.
JOBNAME="babyweight_$(date -u +%y%m%d_%H%M%S)"
OUTDIR="gs://${BUCKET}/babyweight/trained_model_tuned"
echo ${OUTDIR} ${REGION} ${JOBNAME}
gsutil -m rm -rf ${OUTDIR}

# Flags before the bare "--" configure gcloud; flags after it go to trainer.task.
gcloud ai-platform jobs submit training ${JOBNAME} \
    --region=${REGION} \
    --module-name=trainer.task \
    --package-path=$(pwd)/babyweight/trainer \
    --job-dir=${OUTDIR} \
    --staging-bucket=gs://${BUCKET} \
    --scale-tier=STANDARD_1 \
    --runtime-version=${TFVERSION} \
    -- \
    --bucket=${BUCKET} \
    --output_dir=${OUTDIR} \
    --train_examples=20000 --batch_size=35 --nembeds=16 --nnsize=281