# Kubernetes

In [17]:
# maybe CMD [ARGS...]
# Best-effort runner: executes the given command with stdout and stderr both
# redirected into ./.last_maybe (overwritten on each call) and always reports
# success, whatever the command's own exit status was.
maybe() {
    if ! "$@" > .last_maybe 2>&1; then
        return 0
    fi
}

# Kubernetes

- Docker provides individual containers on a local machine
- Kubernetes manages collections of running containers across a cluster/datacenter
- also provides networking, storage, monitoring, service discovery

# The Cluster

A cluster with 6 CPU nodes and 8 GPU nodes (running on Google Kubernetes Engine, GKE).

In [1]:
# make sure we have a running cluster: every node should be listed with STATUS Ready
kubectl get nodes

NAME                                         STATUS   ROLES    AGE   VERSION
gke-tmb-cluster-default-pool-bbd84174-14xz   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-default-pool-bbd84174-7dhh   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-default-pool-bbd84174-9rq8   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-default-pool-bbd84174-g34c   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-default-pool-bbd84174-ss8w   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-default-pool-bbd84174-x6vg   Ready    <none>   47m   v1.13.11-gke.14
gke-tmb-cluster-gpus-7ccf9a63-13q0           Ready    <none>   45m   v1.13.11-gke.14
gke-tmb-cluster-gpus-7ccf9a63-3nv8           Ready    <none>   44m   v1.13.11-gke.14
gke-tmb-cluster-gpus-7ccf9a63-69jq           Ready    <none>   45m   v1.13.11-gke.14
gke-tmb-cluster-gpus-7ccf9a63-9pk7           Ready    <none>   45m   v1.13.11-gke.14
gke-tmb-cluster-gpus-7ccf9a63-f3p4           Ready    <none>   44m   v1.1

In [2]:
# start from a clean slate: delete all existing jobs, then all existing pods
kubectl delete jobs --all
kubectl delete pods --all

No resources found
No resources found


# Pods

- Kubernetes groups containers into _pods_
- (Docker container = whale, Pod = group of whales)
- specifications are written in YAML or JSON

In [20]:
# Remove any pod named mypod first (best effort; `maybe` ignores failures),
# then create the pod from the inline YAML spec below.
# The spec runs a single container that executes nvidia-smi, sets a limit of
# one nvidia.com/gpu, and uses restartPolicy: Never.
# The quoted 'EOF' delimiter stops the shell from expanding anything in the YAML.
maybe kubectl delete pod/mypod
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: mypod
spec:
  containers:
  - name: mypod
    image: gcr.io/research-191823/bigdata19
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: "1"
  restartPolicy: Never
EOF

pod/mypod created


# Pod Status and Logs

The Kubernetes runtime keeps track of pod status and logs.

In [4]:
# list the pods together with their READY / STATUS / RESTARTS / AGE columns
kubectl get pods

NAME    READY   STATUS              RESTARTS   AGE
mypod   0/1     ContainerCreating   0          0s


In [5]:
# Wait for the pod to finish rather than sleeping for a fixed 15 s: a cold image
# pull can take minutes, and `kubectl logs` has nothing to show until the
# container has run.  Poll the pod phase every 5 s, for at most ~5 minutes,
# and stop as soon as the pod reaches a terminal phase.
phase=""
for attempt in $(seq 60); do
    phase=$(kubectl get pod/mypod -o jsonpath='{.status.phase}' 2>/dev/null)
    case "$phase" in
        Succeeded|Failed) break ;;
    esac
    sleep 5
done
# report where we ended up, so a timeout is visible in the notebook
echo "pod/mypod phase: ${phase:-unknown}"

In [7]:
# print the captured output of the pod's container (here: the nvidia-smi report)
kubectl logs pod/mypod

Thu Dec 12 17:52:22 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    12W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Debugging Pod Startup Problems

Sometimes pods don't get scheduled (never start running). Here are some tricks to debug this.

In [8]:
# when a pod does not get scheduled, its description holds a lot of detail:
# show the first 10 lines, then the total number of lines in the description
kubectl describe pod/mypod | head -n 10
echo ... $(kubectl describe pod/mypod | wc -l) ...

Name:               mypod
Namespace:          default
Priority:           0
PriorityClassName:  <none>
Node:               gke-tmb-cluster-gpus-7ccf9a63-69jq/10.138.0.57
Start Time:         Thu, 12 Dec 2019 09:50:00 -0800
Labels:             <none>
Annotations:        kubectl.kubernetes.io/last-applied-configuration:
                      {"apiVersion":"v1","kind":"Pod","metadata":{"annotations":{},"name":"mypod","namespace":"default"},"spec":{"containers":[{"command":["nvid...
                    kubernetes.io/limit-ranger: LimitRanger plugin set: cpu request for container mypod
... 60 ...


In [9]:
# the Events: section usually tells you why a pod didn't get assigned to a node;
# print that header line plus up to 100 lines following it

kubectl describe pod/mypod | grep -A100 Events:

Events:
  Type    Reason     Age    From                                         Message
  ----    ------     ----   ----                                         -------
  Normal  Scheduled  2m49s  default-scheduler                            Successfully assigned default/mypod to gke-tmb-cluster-gpus-7ccf9a63-69jq
  Normal  Pulling    2m48s  kubelet, gke-tmb-cluster-gpus-7ccf9a63-69jq  pulling image "gcr.io/research-191823/bigdata19"
  Normal  Pulled     49s    kubelet, gke-tmb-cluster-gpus-7ccf9a63-69jq  Successfully pulled image "gcr.io/research-191823/bigdata19"
  Normal  Created    27s    kubelet, gke-tmb-cluster-gpus-7ccf9a63-69jq  Created container
  Normal  Started    27s    kubelet, gke-tmb-cluster-gpus-7ccf9a63-69jq  Started container


In [10]:
# node descriptions are even longer than pod descriptions:
# pick the first node whose listing line mentions "gpus", show the first
# 10 lines of its description, then the description's total line count

node=$(kubectl get nodes | awk '/gpus/ { print $1; exit }')
kubectl describe "node/$node" | head -n 10
echo ... $(kubectl describe "node/$node" | wc -l) ...

Name:               gke-tmb-cluster-gpus-7ccf9a63-13q0
Roles:              <none>
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/fluentd-ds-ready=true
                    beta.kubernetes.io/instance-type=n1-standard-16
                    beta.kubernetes.io/os=linux
                    cloud.google.com/gke-accelerator=nvidia-tesla-t4
                    cloud.google.com/gke-nodepool=gpus
                    cloud.google.com/gke-os-distribution=cos
                    failure-domain.beta.kubernetes.io/region=us-west1
... 94 ...


In [11]:
# you want to make sure that nodes have the right allocatable resources
# (for a GPU node, check the nvidia.com/gpu entry); prints the Allocatable:
# line and the 10 lines after it. Uses $node set in an earlier cell.
kubectl describe node/$node | grep -A10 Allocatable:

Allocatable:
 attachable-volumes-gce-pd:  127
 cpu:                        15890m
 ephemeral-storage:          47093746742
 hugepages-2Mi:              0
 memory:                     56288600Ki
 nvidia.com/gpu:             1
 pods:                       110
System Info:
 Machine ID:                 2d2db73e820f61ce6b21fdf0cbe7d3f7
 System UUID:                2D2DB73E-820F-61CE-6B21-FDF0CBE7D3F7


In [12]:
# also make sure there are resources available: the "Allocated resources"
# table lists the Requests and Limits already claimed on the node.
# Uses $node set in an earlier cell.
kubectl describe node/$node | grep -A10 Allocated

Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource                   Requests    Limits
  --------                   --------    ------
  cpu                        400m (2%)   1050m (6%)
  memory                     210Mi (0%)  510Mi (0%)
  ephemeral-storage          0 (0%)      0 (0%)
  attachable-volumes-gce-pd  0           0
  nvidia.com/gpu             0           0
Events:
  Type    Reason                   Age                From                                            Message


In [13]:
# nodes can be prevented from scheduling pods by "taints";
# show the Taints: line and the two lines after it. Uses $node set in an earlier cell.
kubectl describe node/$node | grep -A2 Taints:

Taints:             nvidia.com/gpu=present:NoSchedule
Unschedulable:      false
Conditions:


In [14]:
# only pods that tolerate the taints are scheduled;
# show the pod's Tolerations: line and the five lines after it
kubectl describe pod/mypod | grep -A5 Tolerations:

Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s
                 node.kubernetes.io/unreachable:NoExecute for 300s
                 nvidia.com/gpu:NoSchedule
Events:
  Type    Reason     Age    From                                         Message
  ----    ------     ----   ----                                         -------


In [15]:
# clean up: delete all pods before moving on to jobs
kubectl delete pods --all

pod "mypod" deleted


# Jobs

Jobs work like entries in a batch queue: a Job runs its pod until it completes. A Job spec is a wrapper around a pod spec.

In [22]:
# Remove any job named myjob first (best effort; `maybe` ignores failures),
# then create the job from the inline YAML spec below.
# The pod template runs `nvidia-smi` through /bin/bash -c, with a limit of one
# nvidia.com/gpu; backoffLimit: 0 and restartPolicy: Never are set in the spec.
# The quoted 'EOF' delimiter stops the shell from expanding anything in the YAML.
maybe kubectl delete job/myjob
kubectl apply -f - <<'EOF'
apiVersion: batch/v1
kind: Job
metadata:
  name: myjob
  labels:
    app: bigdata19
spec:
  backoffLimit: 0
  template:
    # below is a regular Pod spec
    spec:
      containers:
        - name: myjob
          image: gcr.io/research-191823/bigdata19
          command:
            - "/bin/bash"
            - "-c"
            - |
              nvidia-smi
          stdin: true
          tty: true
          resources:
            limits:
              nvidia.com/gpu: "1"
      restartPolicy: Never
EOF

job.batch/myjob created


In [23]:
# Block until the job reports completion instead of sleeping for a fixed 15 s,
# so the following cells do not race a slow image pull or a busy scheduler.
# Gives up after 5 minutes (for example if the job failed and never completes).
kubectl wait --for=condition=complete --timeout=300s job/myjob

In [24]:
# print the output of the job's pod (here: the nvidia-smi report)
kubectl logs job.batch/myjob

Thu Dec 12 17:54:44 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W /  70W |      0MiB / 15079MiB |      5%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [25]:
# list jobs; the COMPLETIONS column shows how many runs have finished
kubectl get jobs

NAME    COMPLETIONS   DURATION   AGE
myjob   1/1           2s         22s


In [26]:
# clean up: delete all jobs before the next example
kubectl delete jobs --all

job.batch "myjob" deleted


# Configmaps

In [27]:
# configmaps are little mountable file systems, for config information and scripts
# we put our Python scripts there
# An existing configmap named "files" is deleted first (best effort), then it is
# recreated; each --from-file=KEY=PATH stores the local file PATH under key KEY.
maybe kubectl delete configmap files
kubectl create configmap files \
--from-file=training.py=training.py \
--from-file=helpers.py=helpers.py

configmap/files created


# Running a Training Job

In [28]:
# with the scripts transferred, let's run actual training
# note the use of multi-line quoting for the shell script
# The spec mounts the configmap "files" as a volume at /files; the container's
# shell script copies the .py files from there into its working directory and
# then runs training.py with python3. Any earlier job named myjob is deleted
# first (best effort).
maybe kubectl delete job/myjob
kubectl apply -f - <<'EOF'
apiVersion: batch/v1
kind: Job
metadata:
  name: myjob
  labels:
    app: bigdata19
spec:
  backoffLimit: 0
  template:
    spec:
      containers:
        - name: myjob
          image: gcr.io/research-191823/bigdata19
          command:
            - "/bin/bash"
            - "-c"
            - |
              cp /files/*.py .
              python3 training.py
          stdin: true
          tty: true
          resources:
            limits:
              nvidia.com/gpu: "1"
          volumeMounts:
            - mountPath: /files
              name: files
      restartPolicy: Never
      volumes:
        - configMap:
            name: files
          name: files
EOF

job.batch/myjob created


# Training Job

In [29]:
# check on the training job; COMPLETIONS stays at 0/1 while it is still running
kubectl get jobs

NAME    COMPLETIONS   DURATION   AGE
myjob   0/1           11s        11s


In [30]:
# give the training job some time to start and produce log output
# NOTE(review): fixed delay - may be too short if the image still has to be
# pulled on the node the job lands on; confirm or replace with an explicit wait
sleep 30

In [32]:
# show the training output produced so far
kubectl logs job/myjob

Thu Dec 12 17:55:27 UTC 2019; myjob-j2pfp; root; /workspace; GPU 0: Tesla T4 (UUID: GPU-e3b63d8c-056b-140d-43e1-de274722818d); 
creating resnet50
        0 bs   128 per sample loss 5.53e-02 loading 7.77e-03 training 1.45e-02
     1024 bs   128 per sample loss 5.53e-02 loading 4.61e-03 training 7.99e-03
     1920 bs   128 per sample loss 5.52e-02 loading 3.35e-03 training 5.45e-03
     2816 bs   128 per sample loss 5.50e-02 loading 2.74e-03 training 4.25e-03
     3712 bs   128 per sample loss 5.49e-02 loading 2.46e-03 training 3.70e-03
     4608 bs   128 per sample loss 5.48e-02 loading 2.38e-03 training 3.45e-03
     5504 bs   128 per sample loss 5.46e-02 loading 2.28e-03 training 3.37e-03
     6400 bs   128 per sample loss 5.45e-02 loading 2.28e-03 training 3.36e-03
     7296 bs   128 per sample loss 5.44e-02 loading 2.22e-03 training 3.39e-03


In [None]:
# final cleanup: delete all jobs, then all pods;
# `|| true` keeps each line's exit status zero even if the delete fails
kubectl delete jobs --all || true
kubectl delete pods --all || true

job.batch "myjob" deleted
pod "myjob-j2pfp" deleted
pod "mypod" deleted


# Kubernetes

- a way of running services and jobs on a cluster of machines
- configurations are given as JSON or YAML files (or via APIs)
- both CPUs and GPUs supported