In [1]:
import os
import tensorflow as tf
import argparse
from tensorflow.python.keras.callbacks import Callback

In [5]:
class MyModel(object):
    """MNIST digit classifier whose hyperparameters come from the command line."""

    def train(self):
        """Train on MNIST, write checkpoints and TensorBoard logs, export the model.

        Command-line flags (all optional):
            --learning_rate    float, default 0.001
            --dropout_rate     float, default 0.3
            --opt              int, 0 = SGD, 1 = Adam (default 1)
            --checkpoint_dir   directory for per-epoch weight checkpoints
            --saved_model_dir  directory for the exported SavedModel
            --tensorboard_log  directory for TensorBoard event files

        Side effects: downloads MNIST if it is not cached, prints the model
        summary and per-batch/per-epoch metrics, and writes files under the
        three output directories.
        """
        mnist = tf.keras.datasets.mnist

        # Accept hyperparameters and output locations as command-line arguments.
        parser = argparse.ArgumentParser()
        parser.add_argument('--learning_rate', required=False, type=float, default=0.001)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.3)
        # Restrict to valid indices so a bad value fails at parse time with a
        # clear message instead of an IndexError after the model is built.
        parser.add_argument('--opt', required=False, type=int, default=1, choices=[0, 1],
                            help='optimizer: 0 = SGD, 1 = Adam')
        # All default outputs live under /result (the original default was
        # misspelled '/reuslt', which put checkpoints in a different tree).
        parser.add_argument('--checkpoint_dir', required=False, default='/result/training_checkpoints')
        parser.add_argument('--saved_model_dir', required=False, default='/result/saved_model_v2/001')
        parser.add_argument('--tensorboard_log', required=False, default='/result/log')
        args = parser.parse_args()

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        # Scale pixel values by 1/255.
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(args.dropout_rate),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.summary()

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer_classes = [tf.keras.optimizers.SGD, tf.keras.optimizers.Adam]
        optimizer = optimizer_classes[args.opt](learning_rate=args.learning_rate)
        # The metric name 'acc' determines the log keys ('acc', 'val_acc')
        # that KatibMetricLog reads.
        model.compile(optimizer=optimizer,
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # Directory in which checkpoints are stored.
        checkpoint_dir = args.checkpoint_dir
        # Checkpoint file name template; Keras substitutes {epoch}.
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

        model.fit(x_train, y_train,
                  verbose=0,  # progress is reported by KatibMetricLog instead
                  validation_data=(x_test, y_test),
                  epochs=5,
                  callbacks=[KatibMetricLog(),
                             tf.keras.callbacks.TensorBoard(log_dir=args.tensorboard_log),
                             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                                save_weights_only=True)
                            ])

        # Export the trained model in TensorFlow SavedModel format.
        path = args.saved_model_dir
        model.save(path, save_format='tf')

In [6]:
class KatibMetricLog(Callback):
    """Keras callback that prints training metrics to stdout as ``name=value``.

    Reads the 'acc'/'loss' and 'val_acc'/'val_loss' entries of the Keras logs
    dict; a missing entry is printed as ``None``.
    NOTE(review): the ``name=value`` format is presumably what a Katib metrics
    collector parses from the job's stdout -- confirm before changing it.
    """

    def on_batch_end(self, batch, logs=None):
        """Print the batch index with the training accuracy and loss."""
        # Keras may pass logs=None; avoid a shared mutable default as well.
        logs = logs or {}
        print("batch", str(batch),
              "accuracy=" + str(logs.get('acc')),
              "loss=" + str(logs.get('loss')))

    def on_epoch_begin(self, epoch, logs=None):
        """Print a header line carrying the epoch index."""
        print("epoch " + str(epoch) + ":")

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation accuracy and loss for the finished epoch."""
        logs = logs or {}
        print("Validation-accuracy=" + str(logs.get('val_acc')),
              "Validation-loss=" + str(logs.get('val_loss')))

In [8]:
if __name__ == '__main__':
    # The same file serves two roles, distinguished by whether the
    # FAIRING_RUNTIME environment variable is set.
    if os.getenv('FAIRING_RUNTIME') is not None:
        # Variable is set: run the training itself.
        remote_train = MyModel()
        remote_train.train()
    else:
        # Variable is not set: build an image and submit the training job.
        # Fairing is imported only here, so the training branch does not need it.
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

        # 'append' builder: add this code on top of the base image and push it.
        fairing.config.set_builder(
            'append',
            image_name='katib-job',
            base_image='kubeflow-registry.default.svc.cluster.local:30000/kubeflow-jupyterlab:tf2.3-cpu',
            registry=DOCKER_REGISTRY,
            push=True,
        )

        # Mount the workspace PVC at /result, where train() writes its outputs
        # by default.
        mount_result_volume = k8s_utils.mounting_pvc(
            pvc_name="workspace-lecture-tf",
            pvc_mount_path="/result",
        )
        # Resources: cpu=2, memory=5.
        # NOTE(review): memory is presumably in GB -- confirm against fairing docs.
        set_pod_resources = k8s_utils.get_resource_mutator(cpu=2, memory=5)

        fairing.config.set_deployer(
            'job',
            namespace='koock',
            pod_spec_mutators=[mount_result_volume, set_pod_resources],
        )

        # Optional explicit preprocessor, left disabled here:
        #fairing.config.set_preprocessor('python', input_files=[__file__])
        fairing.config.run()

[I 200926 19:38:57 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7fd2743a9828>
[I 200926 19:38:57 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7fd274421198>
[I 200926 19:38:57 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7fd274421208>
[W 200926 19:38:57 append:50] Building image using Append builder...
[I 200926 19:38:57 base:107] Creating docker context: /tmp/fairing_context_7c0g8ts2
[I 200926 19:38:58 converted_notebook:127] Converting mnist-saved-model.ipynb to mnist-saved-model.py
[I 200926 19:38:58 docker_creds_:234] Loading Docker credentials for repository 'kubeflow-registry.default.svc.cluster.local:30000/kubeflow-jupyterlab:tf2.3-cpu'
[W 200926 19:38:58 append:54] Image successfully built in 0.5800367689807899s.
[W 200926 19:38:58 append:94] Pushing image kubeflow-registry.default.svc.cluster.local:30000/katib-job:E15A

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


[I 200926 19:39:08 manager:302] Pod started running True


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
2020-09-26 19:42:19.313734: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-09-26 19:42:19.399308: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2497105000 Hz
2020-09-26 19:42:19.399690: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x556300b65580 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-26 19:42:19.399736: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-09-26 19:42:19.864180: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session started.
2020-09-26 19:42:26

[W 200926 19:42:51 job:173] Cleaning up job fairing-job-4c9q5...
