# T5 LLM Fine-Tuning with DeepSpeed and Kubeflow Trainer


This notebook fine-tunes the Text-to-Text Transfer Transformer (T5) on the Wikihow dataset for text summarization, using a Kubeflow TrainJob and DeepSpeed.

Pretrained T5 model: https://huggingface.co/google-t5/t5-base

Wikihow dataset: https://huggingface.co/datasets/sentence-transformers/wikihow

This notebook uses **8 x V100 NVIDIA GPUs** to fine-tune the T5 model on 2 nodes (every node has 4 GPUs).

**TODO (andreyvelich)**: Currently, to run this notebook you have to manually update the container resources in the ClusterTrainingRuntime, since we don't propagate the TrainJob's `resources_per_node` to the JobSet.

## Install the Kubeflow SDK

You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:

In [None]:
# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python

## Create Script to Fine-Tune T5 with DeepSpeed

We need to wrap our fine-tuning script into a function to create Kubeflow TrainJob.

In [None]:
def deepspeed_train_t5(args):
    """Fine-tune T5 on the Wikihow dataset with DeepSpeed and upload the weights to S3.

    All imports live inside the function so that it is self-contained when it
    is handed to the Kubeflow `CustomTrainer`.

    Args:
        args: Dict with the following keys:
            "DATASET_URL": location of the Wikihow CSV (columns "text" and "headline").
            "MODEL_NAME": HuggingFace name of the pretrained T5 model.
            "BUCKET": name of the S3 bucket that receives the trained weights.
    """
    import os
    import time
    import boto3
    import torch
    import torch.distributed as dist
    from torch.utils.data.distributed import DistributedSampler
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    from datasets import Dataset
    import deepspeed
    import numpy as np

    # Initialize distributed environment.
    deepspeed.init_distributed(dist_backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    is_main_rank = dist.get_rank() == 0

    # Define the Wikihow dataset class
    class WikihowDataset(torch.utils.data.Dataset):
        """Tokenized (article, headline) pairs taken from the Wikihow CSV."""

        def __init__(
            self,
            tokenizer,
            num_samples,
            input_length,
            output_length,
        ):
            self.dataset = Dataset.from_csv(args["DATASET_URL"])
            # Keep only the first `num_samples` rows.
            self.dataset = self.dataset.select(range(num_samples))
            self.input_length = input_length
            self.tokenizer = tokenizer
            self.output_length = output_length

        def __len__(self):
            return len(self.dataset)

        def clean_text(self, text):
            # Dataset contains empty values.
            if text is None:
                return ""
            text = text.replace("Example of text:", "")
            text = text.replace("Example of Summary:", "")
            text = text.replace("\n", "")
            text = text.replace("``", "")
            text = text.replace('"', "")

            return text

        def convert_to_features(self, example_batch):
            input_ = self.clean_text(example_batch["text"])
            target_ = self.clean_text(example_batch["headline"])

            source = self.tokenizer(
                input_,
                max_length=self.input_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            targets = self.tokenizer(
                target_,
                max_length=self.output_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

            return source, targets

        def __getitem__(self, index):
            source, targets = self.convert_to_features(self.dataset[index])
            # The tokenizer returns tensors with a leading batch dimension of 1;
            # drop only that dimension so the DataLoader can stack the samples.
            return {
                "source_ids": source["input_ids"].squeeze(0),
                "source_mask": source["attention_mask"].squeeze(0),
                "target_ids": targets["input_ids"].squeeze(0),
                "target_mask": targets["attention_mask"].squeeze(0),
            }

    # Download model and tokenizer
    if is_main_rank:
        print("-" * 100)
        print("Downloading T5 Model")
        print("-" * 100)

    model = T5ForConditionalGeneration.from_pretrained(args["MODEL_NAME"])
    tokenizer = T5Tokenizer.from_pretrained(args["MODEL_NAME"])

    # Single source of truth for the per-GPU batch size: it is used both by the
    # DataLoader and by the DeepSpeed configuration, so the two cannot drift apart.
    micro_batch_size = 4

    # Download dataset.
    dataset = WikihowDataset(
        tokenizer, num_samples=1500, input_length=512, output_length=150
    )
    sampler = DistributedSampler(dataset)
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=micro_batch_size, sampler=sampler
    )

    # Define DeepSpeed configuration.
    # Train batch size = micro batch size * gradient steps * GPUs (e.g. 4 x 1 x 8 = 32).
    # NOTE(review): the optimizer lr (0.002) differs from the scheduler's
    # warmup_max_lr (0.001) - confirm which value is intended.
    ds_config = {
        "train_micro_batch_size_per_gpu": micro_batch_size,
        "gradient_accumulation_steps": 1,
        "fp16": {"enabled": True},  # Enable mixed precision
        "optimizer": {
            "type": "AdamW",
            "params": {"lr": 0.002},
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 0.001,
                "warmup_num_steps": 1000,
            },
        },
    }

    # Initialize model with DeepSpeed.
    model, _, _, _ = deepspeed.initialize(
        config=ds_config,
        model=model,
        model_parameters=model.parameters(),
    )

    # Start training process.
    if is_main_rank:
        print("-" * 100)
        print("Starting DeepSpeed distributed training...")
        print("-" * 100)

    t0 = time.time()
    for epoch in range(1, 3):
        # Without set_epoch() the sampler reuses the same shuffling order in every epoch.
        sampler.set_epoch(epoch)
        losses = []
        for batch_idx, batch in enumerate(train_loader):
            batch = {key: value.to(local_rank) for key, value in batch.items()}

            # Labels equal to -100 are ignored by the loss, so padding positions
            # (attention mask == 0) do not contribute to it.
            labels = batch["target_ids"].clone()
            labels[batch["target_mask"] == 0] = -100

            # Forward pass.
            output = model(
                input_ids=batch["source_ids"],
                attention_mask=batch["source_mask"],
                labels=labels,
            )
            loss = output.loss

            # Run backpropagation.
            model.backward(loss)
            # Weight updates.
            model.step()
            loss_value = loss.item()
            losses.append(loss_value)
            if batch_idx % 10 == 0 and is_main_rank:
                # Progress is reported in samples of this rank's shard: `batch` is a
                # dict, so its len() is the number of keys, not the batch size.
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * micro_batch_size,
                        len(sampler),
                        100.0 * batch_idx / len(train_loader),
                        loss_value,
                    )
                )

        if is_main_rank:
            print("-" * 100)
            print("Average Train Loss: {0:.4f}".format(np.mean(losses)))
            print("-" * 100)

    # Export model to S3.
    HOME_PATH = "/home/mpiuser"
    # Name the checkpoint after the number of optimizer steps actually taken instead
    # of hard-coding it. With 1500 samples, 8 GPUs, batch size 4 and 2 epochs this is
    # "global_step94"; it changes if any of those settings change.
    checkpoint_tag = f"global_step{model.global_steps}"
    # save_checkpoint() is called on every rank, not only on rank 0.
    model.save_checkpoint(save_dir=HOME_PATH, tag=checkpoint_tag)

    if is_main_rank:
        print("-" * 100)
        print(f"DeepSpeed training time: {int(time.time() - t0)} seconds")
        print("-" * 100)

        print("Upload T5 model to S3")
        # Build the S3 key from the path relative to HOME_PATH; using the absolute
        # local path would produce a key like "deepspeed//home/mpiuser/...".
        relative_path = f"{checkpoint_tag}/mp_rank_00_model_states.pt"
        file_path = os.path.join(HOME_PATH, relative_path)
        s3_key = f"deepspeed/{relative_path}"
        bucket = boto3.resource("s3").Bucket(args["BUCKET"])
        bucket.upload_file(file_path, s3_key)
        print(f"Uploaded checkpoint to S3 key: {s3_key}")

## List Available Kubeflow Trainer Runtimes


Get available Kubeflow Trainer Runtimes with the `list_runtimes()` API.

You can inspect Runtime details, including the name, framework, entry point, and number of accelerators.

- Runtimes with **CustomTrainer**: You must write the training script within the function.

- Runtimes with **BuiltinTrainer**: You can configure settings (e.g., LoRA Config) for LLM fine-tuning Job.


In [2]:
from kubeflow.trainer import TrainerClient, CustomTrainer

# Print every available runtime and remember the DeepSpeed one, which is
# passed to the `train()` call later in this notebook.
deepspeed_runtime = None
for r in TrainerClient().list_runtimes():
    print(f"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\n")
    print(f"Entrypoint: {r.trainer.entrypoint[:3]}\n")
    print(f"Runtime Accelerators: {r.trainer.accelerator_count} x {r.trainer.accelerator}")

    if r.name == "deepspeed-distributed":
        deepspeed_runtime = r

# Fail here with a clear message instead of hitting a NameError (or silently
# reusing a stale variable) when the TrainJob is created.
if deepspeed_runtime is None:
    raise ValueError("Runtime 'deepspeed-distributed' was not found in the list of available runtimes")

Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer

Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']

Runtime Accelerators: 4 x gpu-tesla-v100-16gb


## Create TrainJob for Distributed Training

Use the `train()` API to scale the training code across 2 Nodes and 8 GPUs.

Don't forget to update the S3 bucket name.

In [None]:
MODEL_NAME = "t5-base"
BUCKET_NAME = "TODO: add your bucket here"

# Arguments handed to `deepspeed_train_t5` through `func_args`.
args = dict(
    DATASET_URL="https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowAll.csv",
    MODEL_NAME=MODEL_NAME,
    BUCKET=BUCKET_NAME,
)

# Describe the training function, its arguments, and the number of nodes.
custom_trainer = CustomTrainer(
    func=deepspeed_train_t5,
    func_args=args,
    # Custom packages to install at runtime.
    packages_to_install=["boto3"],
    num_nodes=2,
)

# Submit the TrainJob on the DeepSpeed runtime and keep its id for later cells.
job_id = TrainerClient().train(trainer=custom_trainer, runtime=deepspeed_runtime)

In [4]:
# The train() API generates a random TrainJob id; it was stored in `job_id`
# when the TrainJob was created, and evaluating the name displays it.
job_id

'f30e8ee53855'

## Check the TrainJob Info

Use the `list_jobs()` and `get_job()` APIs to get information about created TrainJob and its steps.

In [5]:
# Print the name, status, and creation time of every TrainJob returned by list_jobs().
jobs = TrainerClient().list_jobs()
for job in jobs:
    print(f"TrainJob: {job.name}, Status: {job.status}, Created at: {job.creation_timestamp}")

TrainJob: f30e8ee53855, Status: Created, Created at: 2025-03-22 04:19:34+00:00


In [6]:
# The mpirun command is executed on node-0, which acts as the MPI Launcher node.
train_job = TrainerClient().get_job(name=job_id)
for c in train_job.steps:
    print(f"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\n")

Step: node-0, Status: Running, Devices: gpu x 4

Step: node-1, Status: Running, Devices: gpu x 4



## Get the TrainJob Logs

Use the `get_job_logs()` API to retrieve the TrainJob logs.

Since we distribute the dataset across 8 GPUs (2 nodes x 4 GPUs), each rank processes `ceil(1500 / 8) = 188` samples.

In [12]:
# Retrieve the TrainJob logs; the result is bound to `_` because it is not used afterwards.
logs_client = TrainerClient()
_ = logs_client.get_job_logs(name=job_id, follow=True)

[node-0]: --------------------------------------------------------------------------
[node-0]: PMIx was unable to find a usable compression library
[node-0]: on the system. We will therefore be unable to compress
[node-0]: large data streams. This may result in longer-than-normal
[node-0]: startup times and larger memory footprints. We will
[node-0]: continue, but strongly recommend installing zlib or
[node-0]: a comparable compression library for better user experience.
[node-0]: to your PMIx MCA default parameter file, or by adding
[node-0]: --------------------------------------------------------------------------
[node-0]:  1 more process has sent help message help-pcompress.txt / unavailable
[node-0]:  1 more process has sent help message help-pcompress.txt / unavailable
[node-0]: [2025-03-22 04:20:03,344] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[node-0]: [2025-03-22 04:20:03,344] [INFO] [real_accelerator.py:222:get_accelerator

## Download the Trained Model

Finally, download fine-tuned model from S3 for evaluations.

In [13]:
import boto3

# Fetch the checkpoint written by the TrainJob and store it next to the notebook.
s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)
bucket.download_file(
    "deepspeed/global_step94/mp_rank_00_model_states.pt",
    "deepspeed_model.pt",
)

## Evaluate Fine-Tuned T5 Model

After the model is downloaded, you can extract its `state_dict` from the checkpoint and load it into a HuggingFace pipeline.

The T5 model performs well for NLP tasks such as summarization, translation, and text classification.

In the example below, we'll demonstrate how to use a fine-tuned version of the T5 model to summarize documentation related to the Kubeflow Trainer project.

In [None]:
import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load the DeepSpeed checkpoint to state dict (DeepSpeed stores it inside 'module').
# A DeepSpeed checkpoint may contain Python objects besides tensors, which the
# `weights_only=True` default of recent PyTorch releases can refuse to unpickle.
# Full unpickling executes arbitrary code, so only load checkpoints you trust
# (here: the file produced by your own TrainJob).
ds_checkpoint = torch.load("deepspeed_model.pt", map_location="cpu", weights_only=False)
ds_state_dict = ds_checkpoint["module"]

# Load state dict into HuggingFace model.
config = AutoConfig.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_config(config)
model.load_state_dict(ds_state_dict)
# Switch to evaluation mode so that dropout is disabled during generation.
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="pt")

text = """
summarize: In Kubeflow Trainer you can integrate other ML libraries such as HuggingFace,
DeepSpeed, or Megatron-LM with Kubeflow Trainer to orchestrate their ML training on Kubernetes.
Kubeflow Trainer allows you to effortlessly develop your LLMs with the Kubeflow Python SDK
and build Kubernetes-native Training Runtimes with Kubernetes Custom Resources APIs.
Kubeflow Trainer is a Kubernetes-native project designed for large language models (LLMs)
fine-tuning and enabling scalable, distributed training of machine learning (ML)
models across various frameworks, including PyTorch, JAX, TensorFlow, and XGBoost.
"""

summarizer(text, min_length=5, max_length=100)

Device set to use mps:0


[{'summary_text': 'Kubeflow Trainer is a Kubernetes-native project designed for large language models (LLMs) .'}]