In [1]:
pip install s3fs

Collecting s3fs
  Downloading s3fs-2025.3.2-py3-none-any.whl.metadata (1.9 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.22.0-py3-none-any.whl.metadata (24 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.37.4,>=1.37.2 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.37.3-py3-none-any.whl.metadata (5.7 kB)
Downloading s3fs-2025.3.2-py3-none-any.whl (30 kB)
Downloading aiobotocore-2.22.0-py3-none-any.whl (78 kB)
Downloading aioitertools-0.12.0-py3-none-any.whl (24 kB)
Downloading botocore-1.37.3-py3-none-any.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m257.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aioitertools, botocore, aiobotocore, s3fs
  Attempting uninstall: botocore
    Found existing installation: botocore 1.37.34
    Uninstalling b

In [29]:
# JSON document (kept as a Python string) holding the tuning settings for the
# training job. A later cell base64-encodes it with encode_config() and passes it
# to the job through the SFT_TRAINER_CONFIG_JSON_ENV_VAR environment variable.
#
# The doubled backslashes ("\\n") are deliberate: Python collapses each pair to a
# single backslash, so the JSON text carries the escape sequence \n instead of a
# raw line break inside a JSON string.
#
# "output_dir" and "save_model_dir" both point under /mnt/output, which is the
# path where the job-submission cell mounts the "trained-model" volume.
sft_trainer_config = '''{
    "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
    "training_data_path": "thesven/gsm8k-reasoning",
    "output_dir": "/mnt/output/model",
    "save_model_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": true,
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "data_formatter_template": "### Question:{{question}} \\n\\n### Answer: {{answer}}",
    "response_template": "\\n### Answer:",
    "use_flash_attn": true,
    "fast_kernels": [true, true, true],
    "peft_method": "lora",
    "lora_post_process_for_vllm": true
}'''

In [30]:
import base64

def encode_config(config: str) -> str:
    """Return ``config`` encoded as a base64 text string.

    The text is converted to bytes as UTF-8 before base64 encoding, so a
    config containing non-ASCII characters (for example in a prompt template)
    is accepted instead of raising ``UnicodeEncodeError``. For pure-ASCII
    input the result is byte-for-byte what an ASCII encoding would produce.

    Args:
        config: The configuration text to encode (here, a JSON document).

    Returns:
        The base64 representation of ``config`` as an ASCII-only ``str``.
    """
    raw_bytes = config.encode("utf-8")
    # The base64 alphabet is pure ASCII, so decoding the result as ASCII is always safe.
    return base64.b64encode(raw_bytes).decode("ascii")

# Base64 form of the JSON config; the job-submission cell passes it to the
# training container as the SFT_TRAINER_CONFIG_JSON_ENV_VAR environment variable.
encoded_config = encode_config(sft_trainer_config)

In [32]:
from kubeflow.training import TrainingClient
from kubernetes import client
from kubernetes.client import (
    V1EnvVar,
    V1EnvVarSource,
    V1SecretKeySelector,
    V1Volume,
    V1VolumeMount,
    V1PersistentVolumeClaimVolumeSource
)

# Name of the PyTorchJob; the log-streaming cell looks the job up by this name.
job_name = "llama4"

tc = TrainingClient()

# Submit a one-worker, one-GPU PyTorchJob running the fms-hf-tuning image.
tc.create_job(
    name=job_name,
    job_kind="PyTorchJob",
    base_image="quay.io/modh/fms-hf-tuning:v2.8.2",
    num_workers=1,
    num_procs_per_worker="auto",
    resources_per_worker={"gpu": 1},
    env_vars=[
        # HF_TOKEN is read from key "HF_TOKEN" of the "hf-token" Secret, so the
        # token value itself never appears in the notebook.
        V1EnvVar(
            name="HF_TOKEN",
            value_from=V1EnvVarSource(
                secret_key_ref=V1SecretKeySelector(name="hf-token", key="HF_TOKEN"),
            ),
        ),
        # Base64-encoded JSON tuning configuration built in the cells above.
        V1EnvVar(name="SFT_TRAINER_CONFIG_JSON_ENV_VAR", value=encoded_config),
    ],
    # Each volume is backed by a PersistentVolumeClaim of the same name.
    volumes=[
        V1Volume(
            name=claim,
            persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(claim_name=claim),
        )
        for claim in ("llama70", "trained-model")
    ],
    volume_mounts=[
        V1VolumeMount(name=volume, mount_path=path)
        for volume, path in (
            ("llama70", "/mnt/model"),
            ("trained-model", "/mnt/output"),
        )
    ],
)

In [4]:
# Fetch the logs of the job submitted above. The call returns a pair: the first
# element is kept as `logs`, the second is discarded. With follow=True the pod
# log lines are printed into the cell output (see below) - presumably the call
# blocks until the stream ends; confirm against the SDK documentation.
logs, _ = tc.get_job_logs(job_name, follow=True)

[Pod llama4-master-0]: 	`--num_machines` was set to a value of `1`
[Pod llama4-master-0]: 	`--mixed_precision` was set to a value of `'no'`
[Pod llama4-master-0]: 	`--dynamo_backend` was set to a value of `'no'`
[Pod llama4-master-0]:   _reg = re.compile(f"(.*)\.({_name})\.weight")
[Pod llama4-master-0]:   _reg = re.compile(f"(.*)\.({_name})\.weight")
[Pod llama4-master-0]:   m = re.match(f"({router_name}|{expert_name})\.?(\d+)?\.?(\w+)?\.weight", rel_k)
Fetching 4 files: 100%|██████████| 4/4 [00:16<00:00,  4.17s/it]
[Pod llama4-master-0]: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]
[Pod llama4-master-0]: The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnh

In [3]:
import s3fs
import os


# Filesystem handle for the S3-compatible object store; the credentials and the
# endpoint URL are read from the environment, never hardcoded here.
s3 = s3fs.S3FileSystem(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    endpoint_url=os.environ["AWS_S3_ENDPOINT"],
)

# Destination prefix for the adapter files inside the configured bucket.
s3_path = f"{os.environ['AWS_S3_BUCKET']}/meta-llama/Llama-3.1-8B-Instruct/adapter"

# Recursively copy the local model directory to the bucket; the result is
# bound to `_` so it is not echoed as cell output.
_ = s3.put(
    "/opt/app-root/src/trained-model-1/model",
    s3_path,
    recursive=True,
)

In [13]:
pip install openai

Collecting openai
  Downloading openai-1.78.0-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.78.0-py3-none-any.whl (680 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m680.4/680.4 kB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (351 kB)
Installing collected packages: jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.9.0 openai-1.78.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --u

In [27]:
import os
import httpx
from openai import OpenAI


# NOTE(review): verify=False turns off TLS certificate verification for every
# request made through this client. Acceptable only against a trusted endpoint
# with a self-signed certificate; prefer supplying the CA bundle instead.
# No api_key/base_url is passed, so the OpenAI client presumably takes them from
# its environment-variable defaults - confirm they are set for the served model.
# NOTE(review): this rebinds `client`, which an earlier cell imported from
# `kubernetes`.
client = OpenAI(http_client=httpx.Client(verify=False))

# Send one user question to the served model named "llama4".
completion = client.chat.completions.create(
    model="llama4",
    messages=[
        dict(
            role="user",
            content=(
                "Natalia sold clips to 48 of her friends in April, and then she sold "
                "half as many clips in May. How many clips did Natalia sell altogether "
                "in April and May?"
            ),
        ),
    ],
)

# print() renders the multi-line answer as text instead of an escaped repr.
print(completion.choices[0].message.content)

To find out how many clips Natalia sold in May, we need to calculate half of the clips she sold in April. 

In April, Natalia sold 48 clips. 
Half of 48 is 48 / 2 = 24 clips.

So, in May, Natalia sold 24 clips. 

To find the total number of clips sold in April and May, we need to add the clips sold in both months. 

48 (April) + 24 (May) = 72 clips.

Natalia sold 72 clips altogether in April and May.
