### KFP를 활용해 Training PipeLine 만들기



In [12]:
#### New
from functools import partial
from kfp.components import create_component_from_func, InputPath, OutputArtifact, OutputPath
from kubernetes import client as k8s_client
from kfp.dsl import ContainerOp
from kubernetes.client.models import V1EnvVar, V1SecretKeySelector




@partial(
    create_component_from_func,
    base_image="python:3.9",
    packages_to_install=["pandas"],
)
def load_data(
    train_path: OutputPath("csv"),
    evaluation_path: OutputPath("csv"),
):

    import pandas as pd

    print("load_pandas")

    # load data from github
    df_train = pd.read_csv(
        "https://raw.github.com/yangoos57/Learning_kubeflow/main/mini_project/data/train.csv"
    )
    df_evaluation = pd.read_csv(
        "https://raw.github.com/yangoos57/Learning_kubeflow/main/mini_project/data/validation.csv"
    )
    print("Complete_loading_data_to_pandas")

    df_train.to_csv(train_path, index=False)
    df_evaluation.to_csv(evaluation_path, index=False)

    print("complete Loading Data")


from kfp.dsl import pipeline
import kfp.dsl as dsl

@pipeline(name="test_pipeline")
def test_pipeline():
    

    vop = dsl.VolumeOp(
        name="volume_creation",
        resource_name="mypvc",
        size="1Gi"
    )

    data = load_data()
    op = (
        ContainerOp(
            name="MARfile_Test",
            image="python:3.9",
            command=["/bin/sh", "-c"],
            arguments=[
                "pip install torch-model-archiver torchserve torch-workflow-archiver && ls-a"
            ],
            pvolumes={"/hello_world": vop.volume},
            artifact_argument_paths=[data.outputs['train']]
            
        )
        .add_env_variable(
            k8s_client.V1EnvVar(
                name="MINIO_URL", value="http://minio-service.kubeflow.svc.cluster.local:9000"
            )
        )
        .add_env_variable(k8s_client.V1EnvVar(name="MINIO_KEY", value="minio"))
        .add_env_variable(k8s_client.V1EnvVar(name="MINIO_SECRET", value="minio123"))
    )


import kfp

if __name__ == "__main__":
    kfp.compiler.Compiler().compile(test_pipeline, "test_pipeline_op_3.yaml")




In [11]:
import json
log_dir_uri=f"s3://{'torch'}",
minio_endpoint = "http://minio-service.kubeflow:9000"
pod_template_spec=json.dumps({
            "spec": {
                "containers": [{
                    "env": [
                        {
                            "name": "AWS_ACCESS_KEY_ID",
                            "value": "minio"
                        },
                        {
                            "name": "AWS_SECRET_ACCESS_KEY",
                            "value" : "minio123"
                        },
                        {
                            "name": "S3_ENDPOINT",
                            "value": f"{minio_endpoint}",
                        },
                        {
                            "name": "S3_USE_HTTPS",
                            "value": "0"
                        },
                        {
                            "name": "S3_VERIFY_SSL",
                            "value": "0"
                        },
                    ]
                }]
            }
        }),

In [12]:
pod_template_spec

('{"spec": {"containers": [{"env": [{"name": "AWS_ACCESS_KEY_ID", "key": "minio"}, {"name": "AWS_SECRET_ACCESS_KEY", "key": "minio123"}, {"name": "S3_ENDPOINT", "value": "http://minio-service.kubeflow:9000"}, {"name": "S3_USE_HTTPS", "value": "0"}, {"name": "S3_VERIFY_SSL", "value": "0"}]}]}}',)

In [4]:
@partial(
    create_component_from_func,
    base_image="679oose/basepython:1.0"
)
def train_model(
    train_path:InputPath("csv"),
    evaluation_path: InputPath("csv"),
    model_save_path: OutputPath("folder"),
):

    from transformers import (
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
        Trainer,
        TrainingArguments,
        TrainerCallback
    )
    from datasets import Dataset

    # loading data
    train_dataset = Dataset.from_csv(train_path)
    evaluation_dataset = Dataset.from_csv(evaluation_path)

    # tokenizing
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(item):
        return tokenizer(item["text"], padding="max_length", max_length=128, truncation=True)

    train = train_dataset.map(tokenize_function)
    evaluation = evaluation_dataset.map(tokenize_function)

    print('complete Tokenizing')

    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=len(set(train_dataset["label"]))
    )
    tra_arg = TrainingArguments(
        output_dir="test",
        num_train_epochs=1,
        logging_steps=5,
        evaluation_strategy="epoch",
        disable_tqdm=True,
        save_strategy = "no"
    )

    class myCallback(TrainerCallback):

        def on_log(self, args, state, control, logs=None, **kwargs):
            print(f'{state.global_step} Steps ')

    trainer = Trainer(
        model=model,
        args=tra_arg,
        train_dataset=train,
        eval_dataset=evaluation,
        callbacks=[myCallback]
    )

    trainer.train()
    trainer.save_model(model_save_path)



In [6]:
from kfp.dsl import pipeline

@pipeline(name="NLP_Pipeline")
def NLP_Pipeline():
    data = load_data()
    train_model(data.outputs['train'],data.outputs['evaluation'])


import kfp
if __name__ == "__main__":
    kfp.compiler.Compiler().compile(NLP_Pipeline, "NLP_Pipeline_1.2.yaml")