In [99]:
pip install -U kubeflow-training[huggingface]==1.9.0 transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/7b/9f/92d3091c44cb19add044064af1bf1345cd35fbb84d32a3690f912800a295/transformers-4.48.1-py3-none-any.whl.metadata
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
def train_func():
    """Fine-tune bigscience/bloom-560m with LoRA on the vicgalle/alpaca-gpt4 dataset.

    This function is passed as train_func to TrainingClient.create_job and is
    run by the PyTorchJob workers rather than by the notebook kernel, so every
    import it needs is done inside the function body.

    Steps:
      1. Load the pretrained causal language model and its tokenizer.
      2. Tokenize the "output" column (truncated to 128 tokens).
      3. Split the "train" split 90/10 into train/eval sets with a fixed seed.
      4. Shard both sets using the RANK and WORLD_SIZE environment variables.
      5. Wrap the model with LoRA adapters and train it for 3 epochs with the
         Hugging Face Trainer, evaluating once per epoch.
      6. Save the model to the Trainer output directory (/tmp) and log a few
         distributed-training diagnostics.

    Raises:
        KeyError: if RANK or WORLD_SIZE is missing from the environment.
        ValueError: if RANK or WORLD_SIZE is not an integer.
    """
    import os
    import logging
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TrainingArguments,
        DataCollatorForLanguageModeling,
        Trainer,
    )
    from datasets import load_dataset
    from datasets.distributed import split_dataset_by_node
    from peft import LoraConfig, get_peft_model

    # Every process must use the same seed for the train/eval split; see below.
    SPLIT_SEED = 42

    log_formatter = logging.Formatter(
        "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ"
    )
    logger = logging.getLogger(__file__)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_formatter)
    logger.addHandler(console_handler)
    logger.setLevel(logging.INFO)

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path="bigscience/bloom-560m",
    )
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path="bigscience/bloom-560m",
    )

    # Freeze the base model parameters; only the LoRA adapters added further
    # down are meant to be trained.
    for param in model.parameters():
        param.requires_grad = False

    dataset = load_dataset("vicgalle/alpaca-gpt4")

    # Tokenize the "output" text of every example, in batches.
    dataset = dataset.map(
        lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
        batched=True,
        keep_in_memory=True
    )

    # Split the train data into 90% train and 10% eval (validation).
    # The seed is fixed because this code runs independently in every worker
    # process: without it each process would draw a different random split,
    # and after the per-rank sharding below the eval shard of one rank could
    # contain examples that another rank trains on.
    train_valid_split = dataset["train"].train_test_split(
        test_size=0.1,
        seed=SPLIT_SEED,
    )
    train_data = train_valid_split["train"]
    eval_data = train_valid_split["test"]

    # Give every process its own shard of the train and eval sets.
    # NOTE(review): the Hugging Face Trainer also distributes batches across
    # processes in distributed runs - confirm that this manual per-rank split
    # is intended and does not reduce the amount of data seen per epoch.
    RANK = int(os.environ["RANK"])
    WORLD_SIZE = int(os.environ["WORLD_SIZE"])
    train_data = split_dataset_by_node(
        train_data,
        rank=RANK,
        world_size=WORLD_SIZE,
    )
    eval_data = split_dataset_by_node(
        eval_data,
        rank=RANK,
        world_size=WORLD_SIZE,
    )

    # Attach the LoRA adapters to the frozen base model.
    lora_config = LoraConfig(r=4, lora_alpha=16, lora_dropout=0.1, bias="none")
    model.enable_input_require_grads()
    model = get_peft_model(model, lora_config)

    # Causal-LM collator (mlm=False), padding each batch to a multiple of 8.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer,
        pad_to_multiple_of=8,
        mlm=False,
    )

    training_args = TrainingArguments(
        output_dir="/tmp",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        logging_dir="/logs",
        eval_strategy="epoch",
        save_strategy="no",
    )

    # The collator is passed at construction time instead of being assigned
    # to the trainer afterwards.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
    )

    # Train and save the model.
    trainer.train()
    trainer.save_model()
    logger.info("parallel_mode: '{0}'".format(trainer.args.parallel_mode))
    logger.info("is_model_parallel: '{0}'".format(trainer.is_model_parallel))
    logger.info("model_wrapped: '{0}'".format(trainer.model_wrapped))

In [2]:
from kubeflow.training import TrainingClient

In [3]:
# Submit train_func as a distributed PyTorchJob named "pytorch-ddp".
TrainingClient().create_job(
    # Job identity
    name="pytorch-ddp",
    job_kind="PyTorchJob",
    # Workload: the training function defined above, run in the given image
    train_func=train_func,
    base_image="quay.io/modh/training:py311-cuda121-torch241",
    # Topology: 2 workers, each requesting 2 GPUs; the number of processes
    # per worker is left to "auto"
    num_workers=2,
    resources_per_worker={"gpu": 2},
    num_procs_per_worker="auto",
)
