In [None]:
import os
import shutil
import numpy as np
import tensorflow as tf
from datetime import datetime
print(tf.__version__)

In [None]:
# Cloud settings; a later cell copies them into os.environ so the %%bash cells can read them.
PROJECT = "qwiklabs-gcp-ml-49b827b781ab"  # Replace with your PROJECT
REGION = "us-central1"            # Choose an available region for Cloud MLE
TFVERSION = "1.14"                # TF version for CMLE to use

In [None]:
# Settings that could later be exposed as hyperparameters.

BUCKET     = "qwiklabs-gcp-ml-49b827b781ab"  # Replace with your BUCKET
# Local run: train.csv / eval.csv are read from the working directory.
# For data on Cloud Storage use "gs://{}/babyweight/preproc".format(BUCKET) instead.
DATA_DIR   = "."
# Each run writes to its own timestamped directory
OUTPUT_DIR = "babyweight_trained_"+datetime.now().strftime('%Y%m%d_%H%M%S')
PATTERN    = ""                   # extra glob fragment for data files; "" means exact file name
TRAIN_STEPS = 1000
BATCH_SIZE = 128
NNSIZE = [128,64,32]              # hidden layer sizes; previously tried [64,32,1]
NEMBEDS = 6                       # embedding dimension for the crossed feature
SAVE_CHECKPOINTS_SECS = 30
KEEP_CHECKPOINT_MAX = 10
EVAL_SECS= 30

def parameter_display():
    """Print the current value of every run setting, one setting per line."""
    report = (
        ('>>>>> BUCKET                : {}', BUCKET),
        ('>>>>> DATA_DIR              : {}', DATA_DIR),
        ('>>>>> OUTPUT_DIR            : {}', OUTPUT_DIR),
        ('>>>>> PATTERN               : {}', PATTERN),
        ('>>>>> TRAIN_STEPS           : {}', TRAIN_STEPS),
        ('>>>>> BATCH_SIZE            : {}', BATCH_SIZE),
        ('>>>>> NNSIZE                : {}', NNSIZE),
        ('>>>>> NEMBEDS               : {}', NEMBEDS),
        ('>>>>> SAVE_CHECKPOINTS_SECS : {}', SAVE_CHECKPOINTS_SECS),
        ('>>>>> KEEP_CHECKPOINT_MAX   : {}', KEEP_CHECKPOINT_MAX),
        ('>>>>> EVAL_SECS             : {}', EVAL_SECS),
    )
    for template, value in report:
        print(template.format(value))

In [None]:
# Make the cloud settings visible to %%bash cells through the process environment
os.environ.update({
    "BUCKET": BUCKET,
    "PROJECT": PROJECT,
    "REGION": REGION,
    "TFVERSION": TFVERSION,
})

In [None]:
# Column names, in the order the fields appear in each CSV row
CSV_COLUMNS = ["weight_pounds", "is_male", "mother_age", "plurality", "gestation_weeks"]
# The label is the first CSV column
LABEL_COLUMN = CSV_COLUMNS[0]
# Default value for each CSV column, in the same order as CSV_COLUMNS
DEFAULTS = [[0.0], ["null"], [0.0], ["null"], [0.0]]

In [None]:
def add_engineered_features(features):
    """Add derived entries to ``features`` in place and return the same dict.

    The only derived entry is "dummy", which refers to the same value as
    "mother_age".
    """
    features.update(dummy = features["mother_age"])
    return features

In [None]:
def read_dataset(data_dir, filename_pattern, mode, batch_size = 512):
    """Build an input function that yields batched (features, label) pairs from CSV files.

    Args:
        data_dir: directory (or path prefix) that holds the CSV files.
        filename_pattern: file name; used as a prefix when PATTERN is non-empty.
        mode: a tf.estimator.ModeKeys value. TRAIN shuffles and repeats
            indefinitely; any other mode reads the data exactly once.
        batch_size: number of rows per batch.

    Returns:
        A zero-argument function that returns the dataset.
    """
    def _input_fn():
        def _parse_row(csv_line):
            # One CSV line -> dict of column tensors (plus engineered ones), label split off
            parsed = tf.decode_csv(records = csv_line, record_defaults = DEFAULTS)
            row = add_engineered_features(dict(zip(CSV_COLUMNS, parsed)))
            target = row.pop(LABEL_COLUMN)
            return row, target

        # PATTERN is looked up when the input function runs, not when it is built
        if PATTERN != "":
            glob_expr = "{}/{}*{}*".format(data_dir, filename_pattern, PATTERN)
        else:
            glob_expr = "{}/{}".format(data_dir, filename_pattern)
        print('>>>>> data filename : {}'.format(glob_expr))

        # Files matching the glob expression
        matching_files = tf.gfile.Glob(filename = glob_expr)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Read the files line by line and parse each line
        rows = tf.data.TextLineDataset(filenames = matching_files)
        rows = rows.map(map_func = _parse_row)
        if is_training:
            rows = rows.shuffle(buffer_size = 10 * batch_size)

        # None repeats indefinitely (training); 1 gives end-of-input after one pass
        epochs = None if is_training else 1
        rows = rows.repeat(count = epochs)
        return rows.batch(batch_size = batch_size)
    return _input_fn

In [None]:
def get_categorical_indicator(name, values):
    """Return an indicator column wrapping a vocabulary-list categorical column.

    Args:
        name: feature key.
        values: vocabulary list for that feature.
    """
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(key = name, vocabulary_list = values)
    return tf.feature_column.indicator_column(categorical_column = vocab_column)

def get_feature_cols():
    """Build the feature columns used by the model.

    Returns:
        A tuple (feature_columns, wide, deep):
          feature_columns - the raw categorical/numeric columns plus "dummy";
          wide - sparse columns (categorical and bucketized);
          deep - numeric columns plus the embedding of the crossed column.
    """
    fc = tf.feature_column

    # Vocabularies of the two categorical inputs
    is_male_vocab = ["True","False","Unknown"]
    plurality_vocab = ["Single(1)","Twins(2)","Triplets(3)","Quadruplets(4)","Quintuplets(5)","Multiple(2+)"]

    # Raw columns
    fc_is_male = fc.categorical_column_with_vocabulary_list(key="is_male", vocabulary_list=is_male_vocab)
    fc_plurality = fc.categorical_column_with_vocabulary_list(key="plurality", vocabulary_list=plurality_vocab)
    fc_mother_age = fc.numeric_column(key = "mother_age")
    fc_gestation_weeks = fc.numeric_column(key = "gestation_weeks")
    # For a DNNRegressor model the function below has to be used instead;
    # it seems the categorical inputs must be one-hot encoded.
    # fc_is_male   = get_categorical_indicator("is_male", is_male_vocab)
    # fc_plurality = get_categorical_indicator("plurality", plurality_vocab)

    # Engineered column
    fc_dummy = fc.numeric_column(key = "dummy")

    # Bucketized columns: one boundary per integer in [15, 45) and [17, 47)
    fc_buckets_mother_age = fc.bucketized_column(
        source_column = fc_mother_age,
        boundaries = list(range(15, 45)))
    fc_buckets_gestation_weeks = fc.bucketized_column(
        source_column = fc_gestation_weeks,
        boundaries = list(range(17, 47)))

    # Sparse wide columns
    wide = [fc_is_male, fc_plurality, fc_buckets_mother_age, fc_buckets_gestation_weeks]

    # Cross all wide columns, then embed the cross into NEMBEDS dimensions
    crossed = fc.crossed_column(keys = list(wide), hash_bucket_size = 20000)
    fc_embed = fc.embedding_column(categorical_column = crossed, dimension = NEMBEDS)

    # Deep columns
    deep = [fc_mother_age, fc_gestation_weeks, fc_embed]

    # Flat list of the raw feature columns
    feature_columns = [fc_is_male, fc_plurality, fc_mother_age, fc_gestation_weeks, fc_dummy]

    return feature_columns, wide, deep

In [None]:
def serving_input_fn():
    """Describe the inputs the exported model accepts at prediction time.

    Returns:
        A tf.estimator.export.ServingInputReceiver. Its receiver tensors are
        the four raw input placeholders; its features are those inputs plus
        the engineered features, each with a trailing dimension added.
    """
    feature_placeholders = {
        "is_male"        : tf.placeholder(dtype = tf.string,  shape = [None]),
        "mother_age"     : tf.placeholder(dtype = tf.float32, shape = [None]),
        "plurality"      : tf.placeholder(dtype = tf.string,  shape = [None]),
        "gestation_weeks": tf.placeholder(dtype = tf.float32, shape = [None])
    }

    # add_engineered_features mutates the dict it receives, so give it a copy:
    # the derived "dummy" entry must not show up among the receiver tensors,
    # which describe only what a prediction request has to send.
    features = add_engineered_features(dict(feature_placeholders))

    # The placeholders have shape (?,); add a trailing dimension to each feature.
    features = {
                key: tf.expand_dims(input = tensor, axis = -1)
                for key, tensor in features.items()
               }

    return tf.estimator.export.ServingInputReceiver(features = features, receiver_tensors = feature_placeholders)

In [None]:
# BestExporter를 사용하게 되면, 아래 Function이 있어야 하며, compare_fn으로 사용한다.
def _accuracy_bigger(best_eval_result, current_eval_result):
    metric = 'accuracy'
    return best_eval_result[metric] < current_eval_result[metric]

In [None]:
def train_and_evaluate(output_dir):
    """Train and evaluate a combined linear + DNN regressor.

    Args:
        output_dir: model directory passed to the estimator.
    """
    parameter_display()

    # Only the wide and deep lists feed the estimator; the flat list is not used here
    _, wide, deep = get_feature_cols()

    estimator = tf.estimator.DNNLinearCombinedRegressor(
        model_dir = output_dir,
        linear_feature_columns = wide,
        dnn_feature_columns = deep,
        dnn_hidden_units = NNSIZE,
        config = tf.estimator.RunConfig(
            save_checkpoints_secs = SAVE_CHECKPOINTS_SECS,
            keep_checkpoint_max = KEEP_CHECKPOINT_MAX))

    train_input = read_dataset(DATA_DIR, "train.csv", mode = tf.estimator.ModeKeys.TRAIN, batch_size=BATCH_SIZE)
    eval_input = read_dataset(DATA_DIR, "eval.csv", mode = tf.estimator.ModeKeys.EVAL, batch_size=BATCH_SIZE)

    train_spec = tf.estimator.TrainSpec(input_fn = train_input, max_steps = TRAIN_STEPS)

    # Export a serving model when training finishes
    final_export = tf.estimator.FinalExporter('./exporter', serving_input_receiver_fn=serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(
        input_fn = eval_input,
        steps = None,
        start_delay_secs = 60,      # start evaluating after N seconds
        throttle_secs = EVAL_SECS,  # evaluate every N seconds
        exporters = [final_export])

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

In [None]:
# Remove any existing output directory of that name, then run training and evaluation in-process.
!rm -rf {OUTPUT_DIR}
train_and_evaluate(OUTPUT_DIR)

In [None]:
# Same steps as the previous cell: clear the output directory and train again from scratch.
!rm -rf {OUTPUT_DIR}
train_and_evaluate(OUTPUT_DIR)

In [None]:
%%bash
# Create the package directory first (touch fails when it is missing),
# then add the __init__.py that marks it as a Python package.
mkdir -p babyweight/trainer_20191130
touch babyweight/trainer_20191130/__init__.py

In [None]:
%%writefile babyweight/trainer_20191130/task.py
import argparse
import json
import os

from . import model

import tensorflow as tf


if __name__ == "__main__":
    # Command-line entry point: parse the flags, copy them onto the settings
    # of the model module, then start training.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bucket",
        help="GCS path to data. We assume that data is in \
        gs://BUCKET/babyweight/preproc/",
        required=True
    )
    parser.add_argument(
        "--data_dir",
        help="train and eval data directory",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        required=True
    )
    parser.add_argument(
        "--pattern",
        help="data file pattern",
        required=True
    )
    parser.add_argument(
        "--train_steps",
        help="Number of Train Step.",
        type=int,
        default=1000
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=512
    )
    parser.add_argument(
        "--job-dir",
        help="this model ignores this field, but it is required by gcloud",
        default="junk"
    )
    parser.add_argument(
        "--nnsize",
        help="Hidden layer sizes to use for DNN feature columns -- provide \
        space-separated layers",
        nargs="+",
        type=int,
        default=[128, 32, 4]
    )
    parser.add_argument(
        "--nembeds",
        help="Embedding size of a cross of n key real-valued parameters",
        type=int,
        default=6
    )
    # Seconds between checkpoint saves (copied to model.SAVE_CHECKPOINTS_SECS)
    parser.add_argument(
        "--save_checkpoints_sec",
        help="",
        type=int,
        default=30
    )
    # Number of recent checkpoints to keep (copied to model.KEEP_CHECKPOINT_MAX)
    parser.add_argument(
        "--keep_checkpoints_max",
        help="",
        type=int,
        default=10
    )
    # Seconds between evaluations (copied to model.EVAL_SECS)
    parser.add_argument(
        "--eval_secs",
        help="",
        type=int,
        default=30
    )

    # Parse arguments
    args = parser.parse_args()
    arguments = vars(args)

    # Pop unnecessary args needed for gcloud.
    # argparse stores "--job-dir" under the key "job_dir" (dashes become underscores).
    arguments.pop("job_dir", None)

    # Assign the arguments to the model variables
    output_dir                  = arguments.pop("output_dir")
    model.OUTPUT_DIR            = output_dir
    model.BUCKET                = arguments.pop("bucket")
    model.DATA_DIR              = arguments.pop("data_dir")
    model.PATTERN               = arguments.pop("pattern")
    model.TRAIN_STEPS           = arguments.pop("train_steps")
    model.BATCH_SIZE            = arguments.pop("batch_size")
    model.NNSIZE                = arguments.pop("nnsize")
    model.NEMBEDS               = arguments.pop("nembeds")
    model.SAVE_CHECKPOINTS_SECS = arguments.pop("save_checkpoints_sec")
    model.KEEP_CHECKPOINT_MAX   = arguments.pop("keep_checkpoints_max")
    model.EVAL_SECS             = arguments.pop("eval_secs")
    
    # Append trial_id to path if we are doing hptuning
    # This code can be removed if you are not using hyperparameter tuning
    output_dir = os.path.join(
        output_dir,
        json.loads(
            os.environ.get("TF_CONFIG", "{}")
        ).get("task", {}).get("trial", "")
    )

    # Run the training job
    model.train_and_evaluate(output_dir)


In [None]:
%%writefile babyweight/trainer_20191130/model.py
# -*- coding: utf-8 -*- 
import os
import shutil
import numpy as np
import tensorflow as tf
from datetime import datetime
print(tf.__version__)

def parameter_display():
    """Print the current value of every run setting, one setting per line.

    The settings are module-level names that the caller assigns on this
    module before training starts.
    """
    report = (
        ('>>>>> BUCKET                : {}', BUCKET),
        ('>>>>> DATA_DIR              : {}', DATA_DIR),
        ('>>>>> OUTPUT_DIR            : {}', OUTPUT_DIR),
        ('>>>>> PATTERN               : {}', PATTERN),
        ('>>>>> TRAIN_STEPS           : {}', TRAIN_STEPS),
        ('>>>>> BATCH_SIZE            : {}', BATCH_SIZE),
        ('>>>>> NNSIZE                : {}', NNSIZE),
        ('>>>>> NEMBEDS               : {}', NEMBEDS),
        ('>>>>> SAVE_CHECKPOINTS_SECS : {}', SAVE_CHECKPOINTS_SECS),
        ('>>>>> KEEP_CHECKPOINT_MAX   : {}', KEEP_CHECKPOINT_MAX),
        ('>>>>> EVAL_SECS             : {}', EVAL_SECS),
    )
    for template, value in report:
        print(template.format(value))
    

######################################################################
# Column names, in the order the fields appear in each CSV row
CSV_COLUMNS = ["weight_pounds", "is_male", "mother_age", "plurality", "gestation_weeks"]
# The label is the first CSV column
LABEL_COLUMN = CSV_COLUMNS[0]
# Default value for each CSV column, in the same order as CSV_COLUMNS
DEFAULTS = [[0.0], ["null"], [0.0], ["null"], [0.0]]

######################################################################
def add_engineered_features(features):
    """Add derived entries to ``features`` in place and return the same dict.

    The only derived entry is "dummy", which refers to the same value as
    "mother_age".
    """
    features.update(dummy = features["mother_age"])
    return features

######################################################################
def get_categorical_indicator(name, values):
    """Return an indicator column wrapping a vocabulary-list categorical column.

    Args:
        name: feature key.
        values: vocabulary list for that feature.
    """
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(key = name, vocabulary_list = values)
    return tf.feature_column.indicator_column(categorical_column = vocab_column)

######################################################################
def read_dataset(data_dir, filename_pattern, mode, batch_size = 512):
    """Build an input function that yields batched (features, label) pairs from CSV files.

    Args:
        data_dir: directory (or path prefix) that holds the CSV files.
        filename_pattern: file name; used as a prefix when PATTERN is non-empty.
        mode: a tf.estimator.ModeKeys value. TRAIN shuffles and repeats
            indefinitely; any other mode reads the data exactly once.
        batch_size: number of rows per batch.

    Returns:
        A zero-argument function that returns the dataset.
    """
    def _input_fn():
        def _parse_row(csv_line):
            # One CSV line -> dict of column tensors (plus engineered ones), label split off
            parsed = tf.decode_csv(records = csv_line, record_defaults = DEFAULTS)
            row = add_engineered_features(dict(zip(CSV_COLUMNS, parsed)))
            target = row.pop(LABEL_COLUMN)
            return row, target

        # PATTERN is looked up when the input function runs, not when it is built
        if PATTERN != "":
            glob_expr = "{}/{}*{}*".format(data_dir, filename_pattern, PATTERN)
        else:
            glob_expr = "{}/{}".format(data_dir, filename_pattern)
        print('>>>>> data filename : {}'.format(glob_expr))

        # Files matching the glob expression
        matching_files = tf.gfile.Glob(filename = glob_expr)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Read the files line by line and parse each line
        rows = tf.data.TextLineDataset(filenames = matching_files)
        rows = rows.map(map_func = _parse_row)
        if is_training:
            rows = rows.shuffle(buffer_size = 10 * batch_size)

        # None repeats indefinitely (training); 1 gives end-of-input after one pass
        epochs = None if is_training else 1
        rows = rows.repeat(count = epochs)
        return rows.batch(batch_size = batch_size)
    return _input_fn

######################################################################
def get_feature_cols():
    """Build the feature columns used by the model.

    Returns:
        A tuple (feature_columns, wide, deep):
          feature_columns - the raw categorical/numeric columns plus "dummy";
          wide - sparse columns (categorical and bucketized);
          deep - numeric columns plus the embedding of the crossed column.
    """
    fc = tf.feature_column

    # Vocabularies of the two categorical inputs
    is_male_vocab = ["True","False","Unknown"]
    plurality_vocab = ["Single(1)","Twins(2)","Triplets(3)","Quadruplets(4)","Quintuplets(5)","Multiple(2+)"]

    # Raw columns
    fc_is_male = fc.categorical_column_with_vocabulary_list(key="is_male", vocabulary_list=is_male_vocab)
    fc_plurality = fc.categorical_column_with_vocabulary_list(key="plurality", vocabulary_list=plurality_vocab)
    fc_mother_age = fc.numeric_column(key = "mother_age")
    fc_gestation_weeks = fc.numeric_column(key = "gestation_weeks")
    # For a DNNRegressor model the function below has to be used instead;
    # it seems the categorical inputs must be one-hot encoded.
    # fc_is_male   = get_categorical_indicator("is_male", is_male_vocab)
    # fc_plurality = get_categorical_indicator("plurality", plurality_vocab)

    # Engineered column
    fc_dummy = fc.numeric_column(key = "dummy")

    # Bucketized columns: one boundary per integer in [15, 45) and [17, 47)
    fc_buckets_mother_age = fc.bucketized_column(
        source_column = fc_mother_age,
        boundaries = list(range(15, 45)))
    fc_buckets_gestation_weeks = fc.bucketized_column(
        source_column = fc_gestation_weeks,
        boundaries = list(range(17, 47)))

    # Sparse wide columns
    wide = [fc_is_male, fc_plurality, fc_buckets_mother_age, fc_buckets_gestation_weeks]

    # Cross all wide columns, then embed the cross into NEMBEDS dimensions
    crossed = fc.crossed_column(keys = list(wide), hash_bucket_size = 20000)
    fc_embed = fc.embedding_column(categorical_column = crossed, dimension = NEMBEDS)

    # Deep columns
    deep = [fc_mother_age, fc_gestation_weeks, fc_embed]

    # Flat list of the raw feature columns
    feature_columns = [fc_is_male, fc_plurality, fc_mother_age, fc_gestation_weeks, fc_dummy]

    return feature_columns, wide, deep

######################################################################
def serving_input_fn():
    """Describe the inputs the exported model accepts at prediction time.

    Returns:
        A tf.estimator.export.ServingInputReceiver. Its receiver tensors are
        the four raw input placeholders; its features are those inputs plus
        the engineered features, each with a trailing dimension added.
    """
    feature_placeholders = {
        "is_male"        : tf.placeholder(dtype = tf.string,  shape = [None]),
        "mother_age"     : tf.placeholder(dtype = tf.float32, shape = [None]),
        "plurality"      : tf.placeholder(dtype = tf.string,  shape = [None]),
        "gestation_weeks": tf.placeholder(dtype = tf.float32, shape = [None])
    }

    # add_engineered_features mutates the dict it receives, so give it a copy:
    # the derived "dummy" entry must not show up among the receiver tensors,
    # which describe only what a prediction request has to send.
    features = add_engineered_features(dict(feature_placeholders))

    # The placeholders have shape (?,); add a trailing dimension to each feature.
    features = {
                key: tf.expand_dims(input = tensor, axis = -1)
                for key, tensor in features.items()
               }
    return tf.estimator.export.ServingInputReceiver(features = features, receiver_tensors = feature_placeholders)


######################################################################
# BestExporter를 사용하게 되면, 아래 Function이 있어야 하며, compare_fn으로 사용한다.
def _accuracy_bigger(best_eval_result, current_eval_result):
    metric = 'accuracy'
    return best_eval_result[metric] < current_eval_result[metric]

######################################################################
def train_and_evaluate(output_dir):
    """Train and evaluate a combined linear + DNN regressor.

    Args:
        output_dir: model directory passed to the estimator.
    """
    parameter_display()

    # Only the wide and deep lists feed the estimator; the flat list is not used here
    _, wide, deep = get_feature_cols()

    estimator = tf.estimator.DNNLinearCombinedRegressor(
        model_dir = output_dir,
        linear_feature_columns = wide,
        dnn_feature_columns = deep,
        dnn_hidden_units = NNSIZE,
        config = tf.estimator.RunConfig(
            save_checkpoints_secs = SAVE_CHECKPOINTS_SECS,
            keep_checkpoint_max = KEEP_CHECKPOINT_MAX))

    train_input = read_dataset(DATA_DIR, "train.csv", mode = tf.estimator.ModeKeys.TRAIN, batch_size=BATCH_SIZE)
    eval_input = read_dataset(DATA_DIR, "eval.csv", mode = tf.estimator.ModeKeys.EVAL, batch_size=BATCH_SIZE)

    train_spec = tf.estimator.TrainSpec(input_fn = train_input, max_steps = TRAIN_STEPS)

    # Export a serving model when training finishes
    final_export = tf.estimator.FinalExporter('./exporter', serving_input_receiver_fn=serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(
        input_fn = eval_input,
        steps = None,
        start_delay_secs = 60,      # start evaluating after N seconds
        throttle_secs = EVAL_SECS,  # evaluate every N seconds
        exporters = [final_export])

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    

# Model Training을 해보자.

In [None]:
# Define the variables for the local run.
# Local data lives in the working directory; for data on Cloud Storage use
# "gs://{}/babyweight/preproc".format(BUCKET) instead.
DATA_DIR   = "."
# Fresh timestamped output directory for this run
OUTPUT_DIR = "babyweight_trained_"+datetime.now().strftime('%Y%m%d_%H%M%S')
print(OUTPUT_DIR)

In [None]:
# Publish the settings, including the new output directory, to the %%bash cells
os.environ.update({
    "BUCKET": BUCKET,
    "PROJECT": PROJECT,
    "REGION": REGION,
    "TFVERSION": TFVERSION,
    "OUTPUT_DIR": OUTPUT_DIR,
})

In [None]:
%%bash
# Run the packaged trainer as a module on the local machine.
echo "bucket=$BUCKET"
echo "output_dir=$OUTPUT_DIR"
# Expansions are quoted so a value containing spaces or glob characters
# cannot split into extra arguments (this matters most for rm -rf).
rm -rf "$OUTPUT_DIR"
export PYTHONPATH="${PYTHONPATH}:${PWD}/babyweight"
python -m trainer_20191130.task \
    --bucket="$BUCKET" \
    --data_dir=. \
    --output_dir="$OUTPUT_DIR" \
    --job-dir=./tmp \
    --pattern="" \
    --train_steps=1000 \
    --batch_size=128 \
    --nembeds=6 \
    --save_checkpoints_sec=30 \
    --keep_checkpoints_max=10 \
    --eval_secs=30

In [None]:
%%writefile inputs.json
{"is_male": "True", "mother_age": 26.0, "plurality": "Single(1)", "gestation_weeks": 39}
{"is_male": "False", "mother_age": 26.0, "plurality": "Single(1)", "gestation_weeks": 39}

In [None]:
%%bash
# Use the export written by the training run of this session. OUTPUT_DIR is
# taken from the notebook environment instead of a hard-coded timestamped
# directory, which only exists for the run that created it.
MODEL_LOCATION=$(ls -d "$(pwd)/${OUTPUT_DIR}"/export/exporter/* | tail -1)
echo $MODEL_LOCATION
gcloud ml-engine local predict --model-dir=$MODEL_LOCATION --json-instances=inputs.json

# Model을 CMLE에 올려보자.

In [None]:
# Define the variables for the cloud run: data is read from Cloud Storage
DATA_DIR   = "gs://{bucket}/babyweight/preproc".format(bucket = BUCKET)
# Fresh timestamped output directory name
OUTPUT_DIR = "babyweight_trained_{:%Y%m%d_%H%M%S}".format(datetime.now())
print(OUTPUT_DIR)

In [None]:
%%bash
OUTDIR=gs://${BUCKET}/babyweight/trained_model_20191130
JOBNAME=babyweight_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR
# --package-path has to be the directory of the package named in --module-name
# (trainer_20191130).
# Everything after the bare "--" goes to task.py. That script requires
# --data_dir and --pattern and defines no --train_examples flag; training
# length is controlled with --train_steps / --batch_size (defaults apply here).
# NOTE(review): with an empty pattern read_dataset looks for exactly
# train.csv / eval.csv under data_dir. If the files on Cloud Storage are
# sharded, pass a fragment of the shard suffix instead - confirm against the bucket.
gcloud ai-platform jobs submit training $JOBNAME \
    --region=$REGION \
    --module-name=trainer_20191130.task \
    --package-path=$(pwd)/babyweight/trainer_20191130 \
    --job-dir=$OUTDIR \
    --staging-bucket=gs://$BUCKET \
    --scale-tier=PREMIUM_1 \
    --runtime-version=$TFVERSION \
    -- \
    --bucket=${BUCKET} \
    --data_dir=gs://${BUCKET}/babyweight/preproc \
    --output_dir=${OUTDIR} \
    --pattern=""

In [None]:
# Publish the current settings, including OUTPUT_DIR, to the %%bash cells
os.environ.update({
    "BUCKET": BUCKET,
    "PROJECT": PROJECT,
    "REGION": REGION,
    "TFVERSION": TFVERSION,
    "OUTPUT_DIR": OUTPUT_DIR,
})