In [67]:
%pip install -qqU s3fs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [64]:
# JSON document holding the fine-tuning configuration. It is base64-encoded
# further down and handed to the training job through the
# SFT_TRAINER_CONFIG_JSON_ENV_VAR environment variable.
# Because this is a regular (non-raw) Python string, each "\\n" below ends up as
# the two characters "\n" in the JSON text, i.e. a JSON newline escape.
# "output_dir" and "save_model_dir" both point below /mnt/output, the path at
# which the "trained-model" volume is mounted when the job is created.
sft_trainer_config = '''{
    "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
    "training_data_path": "epfl-dlab/gsm8k",
    "validation_data_path": "epfl-dlab/gsm8k",
    "output_dir": "/mnt/output/model",
    "save_model_dir": "/mnt/output/model",
    "num_train_epochs": 7.0,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": true,
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "data_formatter_template": "### Question:{{question}} \\n\\n### Answer: {{answer}}",
    "response_template": "\\n### Answer:",
    "use_flash_attn": true,
    "fast_kernels": [true, true, true],
    "peft_method": "lora",
    "r": 8,
    "lora_dropout": 0.05,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    "lora_post_process_for_vllm": true
}'''

In [65]:
# Convert the training configuration to its base64 representation so it can be passed via an environment variable
import base64


def encode_config(config):
    """Return ``config`` as base64 text suitable for an environment variable value.

    Args:
        config: Configuration text to encode (in this notebook, a JSON document).

    Returns:
        The base64 encoding of the UTF-8 bytes of ``config``, as a ``str`` made
        up of ASCII characters only.
    """
    # UTF-8 rather than ASCII: the bytes are identical for pure-ASCII input, but
    # a config containing non-ASCII characters (e.g. inside a prompt template)
    # no longer raises UnicodeEncodeError.
    raw_bytes = config.encode("utf-8")
    # The base64 alphabet is ASCII-only, so decoding as ASCII cannot fail.
    return base64.b64encode(raw_bytes).decode("ascii")

# Base64 form of the JSON training configuration; used as the value of the
# SFT_TRAINER_CONFIG_JSON_ENV_VAR environment variable when the job is created.
encoded_config = encode_config(sft_trainer_config)

In [66]:
from kubeflow.training import TrainingClient
from kubernetes.client import (
    V1EnvVar,
    V1EnvVarSource,
    V1SecretKeySelector,
    V1Volume,
    V1VolumeMount,
    V1PersistentVolumeClaimVolumeSource
)

# Name under which the training job is created (and later looked up for logs).
job_name = "llama4"

tc = TrainingClient()

# Submit a single-worker PyTorchJob running the fms-hf-tuning image with one GPU.
tc.create_job(
    job_kind="PyTorchJob",
    name=job_name,
    num_workers=1,
    num_procs_per_worker="auto",
    resources_per_worker={"gpu": 1},
    base_image="quay.io/modh/fms-hf-tuning:v2.8.2",
    env_vars=[
        # HF_TOKEN is taken from key "HF_TOKEN" of the "hf-token" secret, so the
        # token value never appears in the notebook.
        V1EnvVar(
            name="HF_TOKEN",
            value_from=V1EnvVarSource(
                secret_key_ref=V1SecretKeySelector(key="HF_TOKEN", name="hf-token"),
            ),
        ),
        # Base64-encoded training configuration.
        V1EnvVar(
            name="SFT_TRAINER_CONFIG_JSON_ENV_VAR",
            value=encoded_config,
        ),
    ],
    volumes=[
        # Volume backed by the "trained-model" persistent volume claim.
        V1Volume(
            name="trained-model",
            persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                claim_name="trained-model",
            ),
        ),
    ],
    volume_mounts=[
        # Mounted at /mnt/output; the config's output_dir lives below this path.
        V1VolumeMount(
            name="trained-model",
            mount_path="/mnt/output",
        ),
    ],
)

In [68]:
# Retrieve the logs of the job created above. The call returns a pair; only the
# first element is kept. NOTE(review): follow=True presumably keeps streaming
# pod output until the job finishes - confirm against the SDK documentation.
logs, _ = tc.get_job_logs(job_name, follow=True)

[Pod llama4-master-0]: 	`--num_machines` was set to a value of `1`
[Pod llama4-master-0]: 	`--mixed_precision` was set to a value of `'no'`
[Pod llama4-master-0]: 	`--dynamo_backend` was set to a value of `'no'`
[Pod llama4-master-0]:   _reg = re.compile(f"(.*)\.({_name})\.weight")
[Pod llama4-master-0]:   _reg = re.compile(f"(.*)\.({_name})\.weight")
[Pod llama4-master-0]:   m = re.match(f"({router_name}|{expert_name})\.?(\d+)?\.?(\w+)?\.weight", rel_k)
Fetching 4 files: 100%|██████████| 4/4 [06:02<00:00, 90.69s/it] 
[Pod llama4-master-0]: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]
[Pod llama4-master-0]: The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~john

In [69]:
# Upload the trained LoRA adapter to the S3 bucket containing the base model
import s3fs
import os


# S3 client configured entirely from environment variables (credentials and endpoint).
s3 = s3fs.S3FileSystem(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    endpoint_url=os.environ["AWS_S3_ENDPOINT"],
)

# Destination: <bucket>/meta-llama/Llama-3.1-8B-Instruct/adapter
s3_path = f'{os.environ["AWS_S3_BUCKET"]}/meta-llama/Llama-3.1-8B-Instruct/adapter'

# Recursively copy the local model directory to the bucket; the result is
# assigned to `_` so the cell shows no output.
_ = s3.put("/opt/app-root/src/model", s3_path, recursive=True)