## About this Jupyter Notebook

@author: Yingding Wang\
@updated: 28.07.2023

This notebook demonstrates how to write TensorBoard logs to a MinIO bucket from inside a Kubeflow Pipelines v1 component.

## Install KFP Python SDK to build a V1 pipeline

Build KF pipeline with python SDK: https://www.kubeflow.org/docs/components/pipelines/sdk/build-pipeline/
Current KFP python SDK version on pypi.org: https://pypi.org/project/kfp/

In [1]:
import sys
!{sys.executable} -m pip install --upgrade --user kfp==1.8.22



## Restart the Kernel
After the installation of KFP python SDK, the notebook kernel must be restarted.

In [2]:
from platform import python_version
print (f"current platform python version: {python_version()}")

current platform python version: 3.8.10


In [3]:
# run kubectl command line to see the quota in the name space
!kubectl describe quota

Name:                                                                 kf-resource-quota
Namespace:                                                            kubeflow-kindfor
Resource                                                              Used    Hard
--------                                                              ----    ----
cpu                                                                   2225m   36
csi-s3.storageclass.storage.k8s.io/persistentvolumeclaims             0       10
csi-s3.storageclass.storage.k8s.io/requests.storage                   0       2Ti
kubeflow-nfs-csi.storageclass.storage.k8s.io/persistentvolumeclaims   4       20
kubeflow-nfs-csi.storageclass.storage.k8s.io/requests.storage         45Gi    4Ti
memory                                                                6702Mi  520Gi
microk8s-hostpath.storageclass.storage.k8s.io/persistentvolumeclaims  0       5
microk8s-hostpath.storageclass.storage.k8s.io/requests.storage        0       20Gi
minio

## Getting familiar with Jupyter Notebook ENV 

In [4]:
# examine the kfp python sdk version inside KubeFlow v1.5.1
!{sys.executable} -m pip list | grep kfp

kfp                      1.8.22
kfp-pipeline-spec        0.1.16
kfp-server-api           1.8.5


## Define global variables

In [5]:
import kfp
# KFP client created with default settings; reused later to submit the run
client = kfp.Client()
# Namespace reported by the client for the current user
NAMESPACE = client.get_user_namespace()
EXPERIMENT_NAME = 'tensorboard' # Name of the experiment in the KF webapp UI
EXPERIMENT_DESC = 'write tensorboard to minio bucket'
# Prefix for generated file names (component YAML, pipeline YAML) and the run name
PREFIX = "minio_"

print(NAMESPACE)

kubeflow-kindfor


In [6]:
from dataclasses import dataclass
@dataclass
class Config:
    """Pinned container image and package versions used to build the training component."""
    # python 3.8
    # container image passed as `base_image` when the component is created
    base_image: str = "tensorflow/tensorflow:2.12.0"
    # version pin for the `tensorflow-io` package
    tfio_version: str = "0.32.0" # "0.32.0" # https://github.com/tensorflow/io/pull/1343
    # version pin for the `scikit-learn` package
    scikit_version: str = "1.3.0"
    # version pin for the `boto3` package
    boto3_version: str = "1.28.14"
    
config = Config()

## Creating KubeFlow component from python function

In [7]:
import kfp.dsl as dsl
from functools import partial
from kfp.dsl import (
    pipeline,
    ContainerOp
)
from kfp.components import (
    InputPath,
    OutputPath,
    create_component_from_func
)

## TensorBoard Write Component

In [8]:
@partial(
    create_component_from_func,
    output_component_file=f"{PREFIX}tensorboard_component.yaml",
    base_image=config.base_image,
    packages_to_install=[
        f"tensorflow-io=={config.tfio_version}",
        f"scikit-learn=={config.scikit_version}",
        f"boto3=={config.boto3_version}",
    ],  # additional libs installed on top of the base image
)
def tf_train():
    """Train a small digits classifier and upload its TensorBoard logs to an S3 bucket.

    The function is turned into a KFP v1 component, so every import it needs
    is done inside the function body.

    Steps:
      1. Load the scikit-learn digits dataset and split it 80/20 (no shuffle).
      2. Train a small dense Keras model with a TensorBoard callback that
         writes to ``/tmp/output/tensorboard/<dd_mm_YYYY>``.
      3. Upload the local TensorBoard directory to the ``kind-mlflow`` bucket
         under the ``tensorboard/`` key prefix.

    Environment variables read:
      * ``S3_VERIFY_SSL``, ``AWS_HTTPS_VERIFY_HOSTNAME`` - printed for
        diagnostics; a missing variable raises ``KeyError``.
      * ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, ``S3_ENDPOINT`` -
        used to build the boto3 session/resource for the upload.
    """
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    import tensorflow as tf
    import os, boto3
    from datetime import date

    # Date stamp used as the per-run sub-folder of the TensorBoard logs
    fdate = date.today().strftime('%d_%m_%Y')

    print(f'S3_VERIFY_SSL: {os.environ["S3_VERIFY_SSL"]}')
    print(f'AWS_HTTPS_VERIFY_HOSTNAME: {os.environ["AWS_HTTPS_VERIFY_HOSTNAME"]}')

    # Load the digits dataset and flatten each image into a 1-D feature vector
    digits = load_digits()
    data = digits.images.reshape((len(digits.images), -1))
    X_train, X_test, y_train, y_test = train_test_split(
        data, digits.target, test_size=0.2, shuffle=False
    )

    model = tf.keras.Sequential([
         tf.keras.layers.Flatten(input_shape=(64,)),
         tf.keras.layers.Dense(16, activation=tf.nn.relu),
         tf.keras.layers.Dropout(0.1),
         tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])

    batch_size = 1200
    epochs = 250

    # TensorBoard events are written to local disk first and uploaded after training
    log_root = "/tmp/output"
    tensorboard_root = f"{log_root}/tensorboard"
    log_dir = f"{tensorboard_root}/{fdate}"
    print(f"log_dir: {log_dir}")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir
    )

    # Integer class labels -> sparse categorical cross-entropy
    model.compile(optimizer=tf.optimizers.Adam(), loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])

    model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, verbose=1,
        validation_data=(X_test, y_test),
        callbacks=[tensorboard_callback]
    )
    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    def copy_folder_to_s3(local_dir, s3_bucket, s3_prefix, disable_ssl=True):
        """
        Copies a local directory recursively to an S3 bucket.
        https://www.developerfiles.com/upload-files-to-s3-with-python-keeping-the-original-folder-structure/

        Each file is stored under ``<s3_prefix>/<path relative to local_dir>``.

        Args:
            local_dir (str): The path to the local directory to copy.
            s3_bucket (str): The name of the S3 bucket to copy the directory to.
            s3_prefix (str): The key prefix for the S3 objects; empty/None stores
                the files at the bucket root.
            disable_ssl (bool): Whether to disable SSL certificate verification.
        """
        session = boto3.session.Session(
            aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
        )
        s3 = session.resource(
            's3',
            endpoint_url=os.environ.get('S3_ENDPOINT'),
            # False disables certificate verification; None keeps boto3's default (verify)
            verify=False if disable_ssl else None,
        )
        bucket = s3.Bucket(s3_bucket)
        prefix = s3_prefix.strip("/") if s3_prefix else ""

        for subdir, _dirs, files in os.walk(local_dir):
            for file in files:
                full_path = os.path.join(subdir, file)
                # S3 keys always use forward slashes, independent of the local OS separator
                rel_path = os.path.relpath(full_path, local_dir).replace(os.sep, "/")
                key = f"{prefix}/{rel_path}" if prefix else rel_path
                with open(full_path, 'rb') as body:
                    print(f"copy {key}")
                    bucket.put_object(Key=key, Body=body)

    print("copy over the tensorboard logs")
    s3_bucket = 'kind-mlflow'
    s3_prefix = 'tensorboard'
    # Upload the tensorboard folder itself; the prefix re-creates the
    # "tensorboard/<date>/..." key layout inside the bucket.
    copy_folder_to_s3(tensorboard_root, s3_bucket, s3_prefix)
    
    

## Define Helper Function
Difference between 2Gi and 2G
https://stackoverflow.com/questions/50804915/kubernetes-size-definitions-whats-the-difference-of-gi-and-g/50805048#50805048

In [9]:
def pod_resource_transformer(op: ContainerOp, mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m'):
    """
    Apply memory/CPU requests and limits to a container operator.

    Quantities are given as strings, e.g. '1000Mi' for memory
    and '1000m' (= 1 cpu core) for CPU.

    @param op: the container operator to configure
    @param mem_req: memory request
    @param cpu_req: cpu request
    @param mem_lim: memory limit
    @param cpu_lim: cpu limit
    @return: the object returned by the last setter call
    """
    # Setters are applied in this fixed order, each one on the result of the previous call
    resource_settings = (
        ("set_memory_request", mem_req),
        ("set_memory_limit", mem_lim),
        ("set_cpu_request", cpu_req),
        ("set_cpu_limit", cpu_lim),
    )
    configured = op
    for setter_name, quantity in resource_settings:
        configured = getattr(configured, setter_name)(quantity)
    return configured

## Define Pipeline
* Intro Kubeflow pipeline: https://v1-5-branch.kubeflow.org/docs/components/pipelines/introduction/
* Kubeflow pipeline SDK v1: https://v1-5-branch.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/

In [10]:
@pipeline(
    name = EXPERIMENT_NAME,
    description = EXPERIMENT_DESC
)
def custom_pipeline(epochs: int = 250):
    """Single-step pipeline that runs the ``tf_train`` component.

    Args:
        epochs: Pipeline-level parameter. It is not forwarded to ``tf_train``,
            which is called without inputs. The default value allows the
            pipeline to be submitted with an empty ``arguments`` dict.
    """
    # Cache staleness given as a duration string: "P0D" disables caching
    no_artifact_cache = "P0D"
    # artifact_cache_today = "P1D"
    cache_setting = no_artifact_cache

    train_task = tf_train()
    # request 1000Mi RAM and 1 cpu core; the limits keep the helper's defaults
    train_task = pod_resource_transformer(train_task, mem_req="1000Mi", cpu_req="1000m")
    # apply the cache setting chosen above (P0D = no caching, P1D = one day)
    train_task.execution_options.caching_strategy.max_cache_staleness = cache_setting
    train_task.set_display_name("tf train")
    # NOTE(review): presumably this label is matched by a PodDefault that injects
    # the MinIO credentials into the pod - confirm against the cluster setup
    train_task.add_pod_label('add-minio-mlflow-secret', "true")
    # train_task.add_pod_annotation('add-minio-mlflow-secret', "true")

### (optional) pipeline compile step
use the following command to compile the pipeline

In [11]:
# Base name (without extension) of the compiled pipeline definition file
PIPE_LINE_FILE_NAME=f"{PREFIX}kfp_tensorboard_demo_pipeline"
# Compile the pipeline function into "<PIPE_LINE_FILE_NAME>.yaml"
kfp.compiler.Compiler().compile(custom_pipeline, f"{PIPE_LINE_FILE_NAME}.yaml")

### Create Experiment Run

Create the run label with the current date and time:
```python
from datetime import datetime
from pytz import timezone as ptimezone
ts = datetime.strftime(datetime.now(ptimezone("Europe/Berlin")), "%Y-%m-%d %H-%M-%S")
print(ts)
```

Reference:
* https://stackoverflow.com/questions/25837452/python-get-current-time-in-right-timezone/25887393#25887393

In [12]:
from datetime import datetime
from pytz import timezone as ptimezone

def get_local_time_str(target_tz_str: str = "Europe/Berlin", format_str: str = "%Y-%m-%d %H-%M-%S") -> str:
    """
    Return the current time in the given timezone as a formatted string.

    This helper exists because the local timezone is misconfigured on the
    server, so the target timezone is named explicitly.

    @param target_tz_str: timezone name, default "Europe/Berlin"
    @param format_str: strftime pattern; "%Y-%m-%d %H-%M-%S" gives e.g. 2022-07-07 12-08-45
    @return: the formatted timestamp
    """
    # in python3.9 the standard lib ZoneInfo could replace the pytz timezone
    now_in_target_tz = datetime.now(tz=ptimezone(target_tz_str))
    return now_in_target_tz.strftime(format_str)

In [13]:
# from kubernetes import client as k8s_client
pipeline_config = dsl.PipelineConf()

# pipeline_config.set_image_pull_secrets([k8s_client.V1ObjectReference(name=K8_GIT_SECRET_NAME, namespace=NAME_SPACE)])
# pipeline_config.set_image_pull_policy("Always")
pipeline_config.set_image_pull_policy("IfNotPresent")

pipeline_args = {}

In [14]:
# Run name = file prefix + pipeline name + current timestamp from get_local_time_str()
RUN_NAME = f"{PREFIX}kfp_tensorboard_demo_pipeline {get_local_time_str()}"

# client = kfp.Client()
# Submit the pipeline function as a run in the given experiment and namespace
client.create_run_from_pipeline_func(
    pipeline_func=custom_pipeline,
    arguments = pipeline_args, #{}
    run_name = RUN_NAME,
    pipeline_conf=pipeline_config,
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
)

RunPipelineResult(run_id=c0e249cf-9ef2-4af5-9440-3c5bbd271f47)

## Create tensorboard

with the s3 path
`s3://<bucket_name>/tensorboard`

use the PodDefault `add-minio-<bucket_name>-secret` to 