In [1]:
import os
import tensorflow as tf
import argparse
from tensorflow.python.keras.callbacks import Callback

In [2]:
class MyModel(object):
    """Small dense MNIST classifier whose hyperparameters come from the command line."""

    def train(self):
        """Build, compile and fit the MNIST model.

        Command-line arguments read from the process argv:
            --learning_rate (float, default 0.01): SGD learning rate.
            --dropout_rate  (float, default 0.2): rate passed to the Dropout layer.

        The model is fit for 5 epochs with the MNIST test split used as
        validation data. Keras' own progress output is disabled (verbose=0);
        metrics are emitted by the KatibMetricLog callback instead.

        Returns:
            None.
        """
        mnist = tf.keras.datasets.mnist

        # Accept the hyperparameters as command-line input values.
        parser = argparse.ArgumentParser()
        parser.add_argument('--learning_rate', required=False, type=float, default=0.01)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.2)
        # parse_known_args() ignores arguments this parser does not declare
        # (e.g. the "-f <kernel.json>" that a Jupyter kernel puts in argv),
        # where parse_args() would terminate the process with SystemExit.
        args, _unused_argv = parser.parse_known_args()

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        # Rescale the image arrays by 1/255 before feeding them to the network.
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(args.dropout_rate),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

        # `learning_rate` is the canonical keyword; `lr` is only a deprecated alias.
        sgd = tf.keras.optimizers.SGD(learning_rate=args.learning_rate,
                                      decay=1e-6,
                                      momentum=0.9,
                                      nesterov=True)

        # The metric is registered as 'acc' because KatibMetricLog looks up the
        # 'acc' / 'val_acc' keys in the logs dict.
        model.compile(optimizer=sgd,
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        model.fit(x_train, y_train,
                  verbose=0,
                  validation_data=(x_test, y_test),
                  epochs=5,
                  callbacks=[KatibMetricLog()])

In [3]:
class KatibMetricLog(Callback):
    """Keras callback that prints training and validation metrics to stdout.

    Metrics are written as ``name=value`` text. NOTE(review): the format is
    presumably what a Katib metrics collector parses -- confirm against the
    experiment configuration before changing any of the printed strings.
    """

    def on_batch_end(self, batch, logs=None):
        """Print the batch index with the 'acc' and 'loss' values from ``logs``.

        Args:
            batch: index of the batch that just finished.
            logs: metrics dict supplied by Keras; ``None`` is treated as empty,
                in which case the values print as ``None``.
        """
        # Avoid a shared mutable default and tolerate an explicit logs=None.
        logs = logs or {}
        print("batch", str(batch),
              "accuracy=" + str(logs.get('acc')),
              "loss=" + str(logs.get('loss')))

    def on_epoch_begin(self, epoch, logs=None):
        """Print an ``epoch <n>:`` header line; ``logs`` is not used."""
        print("epoch " + str(epoch) + ":")

    def on_epoch_end(self, epoch, logs=None):
        """Print the 'val_acc' and 'val_loss' values from ``logs``.

        Args:
            epoch: index of the epoch that just finished (not printed).
            logs: metrics dict supplied by Keras; ``None`` is treated as empty.
        """
        logs = logs or {}
        print("Validation-accuracy=" + str(logs.get('val_acc')),
              "Validation-loss=" + str(logs.get('val_loss')))

In [4]:
if __name__ == '__main__':
    # Two execution modes, selected by the FAIRING_RUNTIME environment variable:
    #   * set   -> run the training directly in this process;
    #   * unset -> build an image and submit the training as a job via fairing.
    # NOTE(review): FAIRING_RUNTIME is presumably set by fairing inside the
    # container it launches -- confirm.
    if os.getenv('FAIRING_RUNTIME') is not None:
        remote_train = MyModel()
        remote_train.train()
    else:
        # fairing is imported only on the submitting side.
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

        # 'append' builder: layers the code on top of base_image and pushes the
        # resulting 'katib-job' image to the registry.
        fairing.config.set_builder(
            'append',
            image_name='katib-job',
            base_image='kubeflow-registry.default.svc.cluster.local:30000/kubeflow-jupyterlab:tf2.3-cpu',
            registry=DOCKER_REGISTRY,
            push=True,
        )

        # Deploy as a job in the 'koock' namespace.
        # NOTE(review): an earlier comment here read "cpu 1, memory 1GiB", but no
        # resource settings are passed to the deployer -- confirm the intent.
        fairing.config.set_deployer('job', namespace='koock')

        # No preprocessor is configured explicitly. A previously disabled
        # alternative was:
        #   fairing.config.set_preprocessor('python', input_files=[__file__])
        fairing.config.run()

[I 200924 08:23:43 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7feead6f1518>
[I 200924 08:23:43 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7feef9fae0b8>
[I 200924 08:23:43 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7feeab105fd0>
[W 200924 08:23:43 append:50] Building image using Append builder...
[I 200924 08:23:43 base:107] Creating docker context: /tmp/fairing_context_i5lpuoq5
[I 200924 08:23:43 converted_notebook:127] Converting mnist-katib.ipynb to mnist-katib.py
[I 200924 08:23:43 docker_creds_:234] Loading Docker credentials for repository 'kubeflow-registry.default.svc.cluster.local:30000/kubeflow-jupyterlab:tf2.3-cpu'
[W 200924 08:23:44 append:54] Image successfully built in 1.0748542040000757s.
[W 200924 08:23:44 append:94] Pushing image kubeflow-registry.default.svc.cluster.local:30000/katib-job:54BEA6B4...
[I 2

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
2020-09-24 08:55:50.532058: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-09-24 08:55:50.714843: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2497105000 Hz
2020-09-24 08:55:50.715141: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5622e7d72330 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-24 08:55:50.715175: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-09-24 08:57:23.346270: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 188160000 exceeds 10% of free sys

[W 200924 08:57:46 job:173] Cleaning up job fairing-job-qjf5j...
