### KFP를 활용해 Training Pipeline 만들기

In [1]:
#### New
from functools import partial
from kfp.components import create_component_from_func, InputPath, OutputPath



@partial(
    create_component_from_func,
    packages_to_install=["pandas"],
)
def load_data(
    train_path: OutputPath("csv"),
    evaluation_path: OutputPath("csv"),
):
    """Download the training and validation CSVs and write them as outputs.

    The parameters are annotated with ``OutputPath`` (imported at the top of
    the file) so that the two CSV files become the component's outputs; the
    pipeline in this file reads them as ``outputs['train']`` and
    ``outputs['evaluation']``.

    Args:
        train_path: File path the training CSV is written to.
        evaluation_path: File path the validation CSV is written to.
    """
    # pandas is imported inside the body; the decorator above lists it under
    # packages_to_install for the component.
    import pandas as pd

    # load data from github
    df_train = pd.read_csv(
        "https://raw.github.com/yangoos57/Learning_kubeflow/main/mini_project/data/train.csv"
    )
    df_evaluation = pd.read_csv(
        "https://raw.github.com/yangoos57/Learning_kubeflow/main/mini_project/data/validation.csv"
    )

    # index=False keeps the DataFrame index out of the written files.
    df_train.to_csv(train_path, index=False)
    df_evaluation.to_csv(evaluation_path, index=False)

    print('complete Loading Data')


In [4]:
@partial(
    create_component_from_func,
    base_image="679oose/basepython:1.0"
)
def train_model(
    train_path:InputPath("csv"),
    evaluation_path: InputPath("csv"),
    model_save_path: OutputPath("folder"),
):
    """Fine-tune a DistilBERT sequence classifier and save it.

    Args:
        train_path: Path of the CSV file used for training.
        evaluation_path: Path of the CSV file used for evaluation.
        model_save_path: Location the trained model is saved to.
    """
    from transformers import (
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
        Trainer,
        TrainingArguments,
        TrainerCallback
    )
    from datasets import Dataset

    # Read both CSV files into datasets.
    train_dataset = Dataset.from_csv(train_path)
    evaluation_dataset = Dataset.from_csv(evaluation_path)

    # Tokenize the "text" column, padding/truncating every row to 128 tokens.
    bert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    def encode_text(row):
        return bert_tokenizer(
            row["text"],
            padding="max_length",
            max_length=128,
            truncation=True,
        )

    tokenized_train = train_dataset.map(encode_text)
    tokenized_evaluation = evaluation_dataset.map(encode_text)

    print('complete Tokenizing')

    # One output class per distinct value in the training "label" column.
    label_count = len(set(train_dataset["label"]))
    classifier = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=label_count
    )

    training_args = TrainingArguments(
        output_dir="test",
        save_strategy="no",
        evaluation_strategy="epoch",
        num_train_epochs=1,
        logging_steps=5,
        disable_tqdm=True,
    )

    class myCallback(TrainerCallback):
        """Print the global step count on every log event."""

        def on_log(self, args, state, control, logs=None, **kwargs):
            print(f'{state.global_step} Steps ')

    trainer = Trainer(
        args=training_args,
        model=classifier,
        callbacks=[myCallback],
        train_dataset=tokenized_train,
        eval_dataset=tokenized_evaluation,
    )

    trainer.train()
    trainer.save_model(model_save_path)



In [None]:
from kfp.dsl import ContainerOp

# Scratch example that builds a ContainerOp with an init container, a sidecar
# and extra container kwargs.
# NOTE(review): `tag`, `dsl` and `V1EnvVar` are not defined or imported anywhere
# in the visible file, so this statement cannot run as written — confirm the
# intended imports (presumably `from kfp import dsl` and the kubernetes client
# `V1EnvVar`; verify) before executing it.
op = ContainerOp(name='foo',
                      # NOTE(review): the format string is empty and has no
                      # placeholder for `tag`; the image name looks like it was
                      # removed — TODO confirm the intended image.
                      image='' % tag,
                      # pass in init_container list
                      init_containers=[dsl.UserContainer('print', 'busybox:latest', command='echo "hello"')],
                      # pass in sidecars list
                      sidecars=[dsl.Sidecar('print', 'busybox:latest', command='echo "hello"')],
                      # pass in k8s container kwargs
                      container_kwargs={'env': [V1EnvVar('foo', 'bar')]},
  )

In [6]:
from kfp.dsl import pipeline

@pipeline(name="NLP_Pipeline")
def NLP_Pipeline():
    # Step 1: run the data-loading component.
    load_step = load_data()

    # Step 2: feed its two outputs, in order, to the training component.
    train_csv = load_step.outputs['train']
    evaluation_csv = load_step.outputs['evaluation']
    train_model(train_csv, evaluation_csv)


import kfp
if __name__ == "__main__":
    # Compile the pipeline function into a YAML package file.
    kfp.compiler.Compiler().compile(
        pipeline_func=NLP_Pipeline,
        package_path="NLP_Pipeline_1.2.yaml",
    )