# Verify the environment

In [1]:
%%bash

uname -a
cat /etc/*-release

Linux gcp-ai-notebook 4.19.0-17-cloud-amd64 #1 SMP Debian 4.19.194-2 (2021-06-21) x86_64 GNU/Linux
PRETTY_NAME="Debian GNU/Linux 10 (buster)"
NAME="Debian GNU/Linux"
VERSION_ID="10"
VERSION="10 (buster)"
VERSION_CODENAME=buster
ID=debian
HOME_URL="https://www.debian.org/"
SUPPORT_URL="https://www.debian.org/support"
BUG_REPORT_URL="https://bugs.debian.org/"


In [2]:
%%bash

sudo apt-get install google-cloud-sdk-kpt -y

Reading package lists...
Building dependency tree...
Reading state information...
google-cloud-sdk-kpt is already the newest version (348.0.0-0).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [3]:
%%bash

gcloud config list project

[core]
project = zlc-test-2017



Your active configuration is: [default]


In [4]:
%%bash

gcloud config list account

[core]
account = 597279342139-compute@developer.gserviceaccount.com



Your active configuration is: [default]


# Create a cluster

In [5]:
CLUSTER_NAME = 'my-k8s-cluster'

In [6]:
%%bash -s "$CLUSTER_NAME"

gcloud config set compute/zone us-east4-a

gcloud container clusters create $1 \
  --project=$(gcloud config get-value project) \
  --cluster-version=latest \
  --machine-type=n1-standard-4 \
  --scopes compute-rw,gke-default,storage-rw \
  --num-nodes=3

NAME            LOCATION    MASTER_VERSION  MASTER_IP       MACHINE_TYPE   NODE_VERSION    NUM_NODES  STATUS
my-k8s-cluster  us-east4-a  1.20.8-gke.700  35.236.211.181  n1-standard-4  1.20.8-gke.700  3          RUNNING


Updated property [compute/zone].
Creating cluster my-k8s-cluster in us-east4-a...
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Verify the cluster has been created

In [7]:
%%bash

gcloud container clusters list

NAME            LOCATION    MASTER_VERSION  MASTER_IP       MACHINE_TYPE   NODE_VERSION    NUM_NODES  STATUS
my-k8s-cluster  us-east4-a  1.20.8-gke.700  35.236.211.181  n1-standard-4  1.20.8-gke.700  3          RUNNING


## After the cluster has started, configure access credentials so you can interact with the cluster using kubectl.

In [8]:
%%bash -s "$CLUSTER_NAME"

gcloud container clusters get-credentials $1

Fetching cluster endpoint and auth data.
kubeconfig entry generated for my-k8s-cluster.


# Deploying `TFJob` components

## Get the manifests for `TFJob` from v1.1.0 of Kubeflow

In [9]:
%%bash

SRC_REPO=https://github.com/kubeflow/manifests
kpt pkg get $SRC_REPO/tf-training@v1.1.0 tf-training

ls -l

total 24
drwxr-xr-x 4 jupyter jupyter  4096 Jul 18 03:16 lab-files
-rw-r--r-- 1 jupyter jupyter 14159 Jul 18 04:22 tf-distributed-training-kubeflow.ipynb
drwxr-xr-x 5 jupyter jupyter  4096 Jul 18 02:59 tf-training


error: destination directory "tf-training/tf-training" already exists


## Create a Kubernetes namespace to host the `TFJob` operator

In [10]:
%%bash

kubectl create namespace kubeflow

namespace/kubeflow created


## Install the `TFJob` custom resource

In [11]:
%%bash

kubectl apply --kustomize tf-training/tf-job-crds/base

customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created




## Install the `TFJob` operator

In [12]:
%%bash

kubectl apply --kustomize tf-training/tf-job-operator/base

serviceaccount/tf-job-dashboard created
serviceaccount/tf-job-operator created
clusterrole.rbac.authorization.k8s.io/kubeflow-tfjobs-admin created
clusterrole.rbac.authorization.k8s.io/kubeflow-tfjobs-edit created
clusterrole.rbac.authorization.k8s.io/kubeflow-tfjobs-view created
clusterrole.rbac.authorization.k8s.io/tf-job-operator created
clusterrolebinding.rbac.authorization.k8s.io/tf-job-operator created
service/tf-job-operator created
deployment.apps/tf-job-operator created




## Verify the installation

In [13]:
%%bash

kubectl get deployments -n kubeflow -o wide

NAME              READY   UP-TO-DATE   AVAILABLE   AGE   CONTAINERS        IMAGES                                                        SELECTOR
tf-job-operator   0/1     1            0           1s    tf-job-operator   gcr.io/kubeflow-images-public/tf_operator:vmaster-ga2ae7bff   kustomize.component=tf-job-operator


# Creating a Cloud Storage bucket

In [14]:
%%bash -s "$CLUSTER_NAME"

gsutil mb gs://$(gcloud config get-value project)-bucket

Creating gs://zlc-test-2017-bucket/...


## Verify the bucket has been created

In [15]:
%%bash

gsutil ls

gs://zlc-test-2017-bucket/


# Preparing `TFJob`

## Download the code package

In [16]:
%%bash

SRC_REPO=https://github.com/GoogleCloudPlatform/mlops-on-gcp
kpt pkg get $SRC_REPO/workshops/mlep-qwiklabs/distributed-training-gke lab-files
ls -la

total 36
drwxr-xr-x 5 jupyter jupyter  4096 Jul 18 04:22 .
drwxr-xr-x 7 jupyter jupyter  4096 Jul 17 04:02 ..
drwxr-xr-x 2 jupyter jupyter  4096 Jul 17 04:02 .ipynb_checkpoints
drwxr-xr-x 4 jupyter jupyter  4096 Jul 18 03:16 lab-files
-rw-r--r-- 1 jupyter jupyter 14159 Jul 18 04:22 tf-distributed-training-kubeflow.ipynb
drwxr-xr-x 5 jupyter jupyter  4096 Jul 18 02:59 tf-training


error: destination directory "lab-files/distributed-training-gke" already exists


## Verify the package

In [17]:
%%bash

ls -Rlah lab-files

lab-files:
total 32K
drwxr-xr-x 4 jupyter jupyter 4.0K Jul 18 03:16 .
drwxr-xr-x 5 jupyter jupyter 4.0K Jul 18 04:22 ..
-rw-r--r-- 1 jupyter jupyter  803 Jul 18 03:16 Dockerfile
-rw------- 1 jupyter jupyter  295 Jul 18 03:16 Kptfile
-rw-r--r-- 1 jupyter jupyter 2.7K Jul 18 03:16 README.md
drwxr-xr-x 3 jupyter jupyter 4.0K Jul 18 03:16 distributed-training-gke
drwxr-xr-x 2 jupyter jupyter 4.0K Jul 18 03:16 mnist
-rw-r--r-- 1 jupyter jupyter  568 Jul 18 04:05 tfjob.yaml

lab-files/distributed-training-gke:
total 28K
drwxr-xr-x 3 jupyter jupyter 4.0K Jul 18 03:16 .
drwxr-xr-x 4 jupyter jupyter 4.0K Jul 18 03:16 ..
-rw-r--r-- 1 jupyter jupyter  803 Jul 18 03:16 Dockerfile
-rw------- 1 jupyter jupyter  310 Jul 18 03:16 Kptfile
-rw-r--r-- 1 jupyter jupyter 2.7K Jul 18 03:16 README.md
drwxr-xr-x 2 jupyter jupyter 4.0K Jul 18 03:16 mnist
-rw-r--r-- 1 jupyter jupyter 1.2K Jul 18 03:16 tfjob.yaml

lab-files/distributed-training-gke/mnist:
total 20K
drwxr-xr-x 2 jupyter jupyter 4.0K Jul 18 03:16 

In [18]:
%%bash

pygmentize -l yaml lab-files/tfjob.yaml

[94mapiVersion[39;49;00m: kubeflow.org/v1
[94mkind[39;49;00m: TFJob
[94mmetadata[39;49;00m:
  [94mname[39;49;00m: multi-worker
[94mspec[39;49;00m:
  [94mcleanPodPolicy[39;49;00m: None
  [94mtfReplicaSpecs[39;49;00m:
    [94mWorker[39;49;00m:
      [94mreplicas[39;49;00m: 3
      [94mtemplate[39;49;00m:
        [94mspec[39;49;00m:
          [94mcontainers[39;49;00m:
            - [94mname[39;49;00m: tensorflow
              [94mimage[39;49;00m: gcr.io/zlc-test-2017/mnist-train
              [94margs[39;49;00m:
                - --epochs=5
                - --steps_per_epoch=100
                - --per_worker_batch=64
                - --saved_model_path=gs://zlc-test-2017-bucket/saved_model_dir
                - --checkpoint_path=gs://zlc-test-2017-bucket/checkpoints


In [19]:
%%bash

pygmentize lab-files/mnist/main.py

[37m# Copyright 2020 Google. All Rights Reserved.[39;49;00m
[37m#[39;49;00m
[37m# Licensed under the Apache License, Version 2.0 (the "License");[39;49;00m
[37m# you may not use this file except in compliance with the License.[39;49;00m
[37m# You may obtain a copy of the License at[39;49;00m
[37m#[39;49;00m
[37m#     http://www.apache.org/licenses/LICENSE-2.0[39;49;00m
[37m#[39;49;00m
[37m# Unless required by applicable law or agreed to in writing, software[39;49;00m
[37m# distributed under the License is distributed on an "AS IS" BASIS,[39;49;00m
[37m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.[39;49;00m
[37m# See the License for the specific language governing permissions and[39;49;00m
[37m# limitations under the License.[39;49;00m
[33m"""An example of multi-worker training with Keras model using Strategy API."""[39;49;00m

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[

In [20]:
%%bash

pygmentize lab-files/mnist/model.py

[37m# Copyright 2020 Google. All Rights Reserved.[39;49;00m
[37m#[39;49;00m
[37m# Licensed under the Apache License, Version 2.0 (the "License");[39;49;00m
[37m# you may not use this file except in compliance with the License.[39;49;00m
[37m# You may obtain a copy of the License at[39;49;00m
[37m#[39;49;00m
[37m#     http://www.apache.org/licenses/LICENSE-2.0[39;49;00m
[37m#[39;49;00m
[37m# Unless required by applicable law or agreed to in writing, software[39;49;00m
[37m# distributed under the License is distributed on an "AS IS" BASIS,[39;49;00m
[37m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.[39;49;00m
[37m# See the License for the specific language governing permissions and[39;49;00m
[37m# limitations under the License.[39;49;00m
[33m"""An example of multi-worker training with Keras model using Strategy API."""[39;49;00m

[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [

## Packaging training code in a docker image and push to Container Registry

In [21]:
%%bash

IMAGE_NAME=mnist-train

cd lab-files
docker build -t gcr.io/$(gcloud config get-value project)/${IMAGE_NAME} .
docker push gcr.io/$(gcloud config get-value project)/${IMAGE_NAME}

Sending build context to Docker daemon  33.79kB
Step 1/4 : FROM tensorflow/tensorflow:2.4.1
 ---> 45872ba1e662
Step 2/4 : RUN pip install tensorflow_datasets
 ---> Using cache
 ---> f86add306ef2
Step 3/4 : ADD mnist mnist
 ---> Using cache
 ---> 489857121248
Step 4/4 : ENTRYPOINT ["python", "-m", "mnist.main"]
 ---> Using cache
 ---> ba6a0e7c95f2
Successfully built ba6a0e7c95f2
Successfully tagged gcr.io/zlc-test-2017/mnist-train:latest
Using default tag: latest
The push refers to repository [gcr.io/zlc-test-2017/mnist-train]
a642f7b967ae: Preparing
7a3361502ecc: Preparing
097e070097db: Preparing
e95ae1c1e1a8: Preparing
e43210c84711: Preparing
74dfe3df0c94: Preparing
8e29486d090c: Preparing
76bfe8e7e45c: Preparing
3779360d2582: Preparing
9f10818f1f96: Preparing
27502392e386: Preparing
c95d2191d777: Preparing
74dfe3df0c94: Waiting
8e29486d090c: Waiting
76bfe8e7e45c: Waiting
3779360d2582: Waiting
9f10818f1f96: Waiting
27502392e386: Waiting
c95d2191d777: Waiting
e95ae1c1e1a8: Pushed
a642f

## Verify the docker image has been pushed

In [22]:
%%bash

gcloud container images list

NAME
gcr.io/zlc-test-2017/mnist-train


Only listing images in gcr.io/zlc-test-2017. Use --repository to list images in other repositories.


## Update `image` and `args` in `tfjob.yaml`

In [23]:
%%writefile lab-files/tfjob.yaml

apiVersion: kubeflow.org/v1
kind: TFJob
metadata:
  name: multi-worker
spec:
  cleanPodPolicy: None
  tfReplicaSpecs:
    Worker:
      replicas: 3
      template:
        spec:
          containers:
            - name: tensorflow
              image: gcr.io/zlc-test-2017/mnist-train
              args:
                - --epochs=5
                - --steps_per_epoch=100
                - --per_worker_batch=64
                - --saved_model_path=gs://zlc-test-2017-bucket/saved_model_dir
                - --checkpoint_path=gs://zlc-test-2017-bucket/checkpoints

Overwriting lab-files/tfjob.yaml


In [24]:
%%bash

pygmentize -l yaml lab-files/tfjob.yaml

[94mapiVersion[39;49;00m: kubeflow.org/v1
[94mkind[39;49;00m: TFJob
[94mmetadata[39;49;00m:
  [94mname[39;49;00m: multi-worker
[94mspec[39;49;00m:
  [94mcleanPodPolicy[39;49;00m: None
  [94mtfReplicaSpecs[39;49;00m:
    [94mWorker[39;49;00m:
      [94mreplicas[39;49;00m: 3
      [94mtemplate[39;49;00m:
        [94mspec[39;49;00m:
          [94mcontainers[39;49;00m:
            - [94mname[39;49;00m: tensorflow
              [94mimage[39;49;00m: gcr.io/zlc-test-2017/mnist-train
              [94margs[39;49;00m:
                - --epochs=5
                - --steps_per_epoch=100
                - --per_worker_batch=64
                - --saved_model_path=gs://zlc-test-2017-bucket/saved_model_dir
                - --checkpoint_path=gs://zlc-test-2017-bucket/checkpoints


# Submit `TFJob`

In [25]:
%%bash

cd lab-files
kubectl apply -f tfjob.yaml

tfjob.kubeflow.org/multi-worker created


# Monitor `TFJob`

In [26]:
%%bash

sleep 1m  # wait to let pods start running
kubectl get pods

NAME                    READY   STATUS    RESTARTS   AGE
multi-worker-worker-0   1/1     Running   0          60s
multi-worker-worker-1   1/1     Running   0          60s
multi-worker-worker-2   1/1     Running   0          60s


In [27]:
%%bash

kubectl describe tfjob multi-worker

Name:         multi-worker
Namespace:    default
Labels:       <none>
Annotations:  <none>
API Version:  kubeflow.org/v1
Kind:         TFJob
Metadata:
  Creation Timestamp:  2021-07-18T04:24:29Z
  Generation:          1
  Managed Fields:
    API Version:  kubeflow.org/v1
    Fields Type:  FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .:
          f:kubectl.kubernetes.io/last-applied-configuration:
      f:spec:
        .:
        f:cleanPodPolicy:
        f:tfReplicaSpecs:
          .:
          f:Worker:
            .:
            f:replicas:
            f:template:
              .:
              f:spec:
    Manager:      kubectl-client-side-apply
    Operation:    Update
    Time:         2021-07-18T04:24:29Z
    API Version:  kubeflow.org/v1
    Fields Type:  FieldsV1
    fieldsV1:
      f:spec:
        f:successPolicy:
        f:tfReplicaSpecs:
          f:Worker:
            f:restartPolicy:
            f:template:
              f:metadata:
            

In [28]:
%%bash

sleep 5m  # wait to let pods complete
kubectl get pods

NAME                    READY   STATUS      RESTARTS   AGE
multi-worker-worker-0   0/1     Completed   0          6m1s
multi-worker-worker-1   0/1     Completed   0          6m1s
multi-worker-worker-2   0/1     Completed   0          6m1s


In [29]:
%%bash

kubectl logs multi-worker-worker-0

2021-07-18 04:25:05.017695: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-18 04:25:05.017741: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
use distribute.MultiWorkerMirroredStrategy instead
2021-07-18 04:25:07.566738: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-18 04:25:07.567063: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-18 04:25:07.567086: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-18 04:25:07.567107: I tensorflow/stream_execu

In [30]:
%%bash

kubectl logs multi-worker-worker-1

2021-07-18 04:25:04.960336: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-18 04:25:04.960384: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
use distribute.MultiWorkerMirroredStrategy instead
2021-07-18 04:25:07.729757: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-18 04:25:07.729976: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-18 04:25:07.730000: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-18 04:25:07.730026: I tensorflow/stream_execu

In [31]:
%%bash

kubectl logs multi-worker-worker-2

2021-07-18 04:25:05.004436: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-18 04:25:05.004482: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
use distribute.MultiWorkerMirroredStrategy instead
2021-07-18 04:25:07.445321: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-18 04:25:07.445604: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-18 04:25:07.445630: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-18 04:25:07.445670: I tensorflow/stream_execu

# Clean up

## Delete the cluster

In [32]:
%%bash -s "$CLUSTER_NAME"

gcloud container clusters delete $1 --quiet

Deleting cluster my-k8s-cluster...
..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................done.
Deleted [https://container.googleapis.com/v1/projects/zlc-test-2017/zones/us-east4-a/clusters/my-k8s-cluster].


## Delete the Cloud Storage bucket

In [33]:
%%bash

gsutil -m rm -r gs://zlc-test-2017-bucket

Removing gs://zlc-test-2017-bucket/checkpoints/#1626582310915887...
Removing gs://zlc-test-2017-bucket/saved_model_dir/#1626582419007994...
Removing gs://zlc-test-2017-bucket/saved_model_dir/assets/#1626582423747345...
Removing gs://zlc-test-2017-bucket/saved_model_dir/variables/variables.data-00000-of-00001#1626582422119167...
Removing gs://zlc-test-2017-bucket/saved_model_dir/saved_model.pb#1626582424392744...
Removing gs://zlc-test-2017-bucket/saved_model_dir/variables/variables.index#1626582422661051...
Removing gs://zlc-test-2017-bucket/saved_model_dir/variables/#1626582419257911...
/ [7/7 objects] 100% Done                                                       
Operation completed over 7 objects.                                              
Removing gs://zlc-test-2017-bucket/...


## Delete the pushed docker image

In [34]:
%%bash

gcloud container images delete gcr.io/zlc-test-2017/mnist-train --force-delete-tags --quiet

Digests:
- gcr.io/zlc-test-2017/mnist-train@sha256:32bcf14192fdc5266dfaefabd3846404fd95f696fd3756904b0245195f0ba293
  Associated tags:
 - latest
Tags:
- gcr.io/zlc-test-2017/mnist-train:latest
Deleted [gcr.io/zlc-test-2017/mnist-train:latest].
Deleted [gcr.io/zlc-test-2017/mnist-train@sha256:32bcf14192fdc5266dfaefabd3846404fd95f696fd3756904b0245195f0ba293].
