# Multi-Node Training with a SageMaker Training Job

In [3]:
# ## Update sagemaker python sdk version
!pip install -U sagemaker huggingface_hub

Collecting sagemaker
  Using cached sagemaker-2.232.1-py3-none-any.whl.metadata (16 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)
Using cached sagemaker-2.232.1-py3-none-any.whl (1.6 MB)
Using cached huggingface_hub-0.25.1-py3-none-any.whl (436 kB)
Installing collected packages: huggingface_hub, sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.231.0
    Uninstalling sagemaker-2.231.0:
      Successfully uninstalled sagemaker-2.231.0
Successfully installed huggingface_hub-0.25.1 sagemaker-2.232.1


## Set model, code, and data

In [1]:
import sagemaker
from sagemaker import get_execution_role

# Execution role that is later handed to the training estimator.
role = get_execution_role()

# One shared session; the default bucket and region used by later cells come from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

# Echo the resolved values so the reader can see where artifacts will go.
print("sagemaker_default_bucket:", sagemaker_default_bucket)
print("sagemaker_region:", region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker_default_bucket: sagemaker-us-east-1-596899493901
sagemaker_region: us-east-1


## Upload the pretrained model to S3

In [5]:
import os
from pathlib import Path

from huggingface_hub import snapshot_download

# Local cache directory for the download; created if it does not exist yet.
local_cache_path = Path("./Qwen2.5_7B")
local_cache_path.mkdir(exist_ok=True)

model_name = "Qwen/Qwen2.5-7B-Instruct"

# Never hard-code access tokens in a notebook: read the token from the
# HF_TOKEN environment variable instead. When it is unset, None is passed and
# huggingface_hub falls back to its own default credential lookup.
# NOTE(review): the token that used to be embedded in this cell must be
# treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

# Download every file in the repository (weights, tokenizer and config files).
allow_patterns = ["*"]

model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_cache_path,
    allow_patterns=allow_patterns,
    token=hf_token,
)

# snapshot_download returns the folder of the snapshot it just fetched, so use
# it directly instead of taking the first glob match under the cache
# directory, which is ambiguous when more than one snapshot is cached.
model_snapshot_path = Path(model_download_path)

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.00k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

In [6]:
# Recursively copy the downloaded snapshot folder to the default SageMaker bucket.
# The training-job cell passes this same S3 prefix to the job as MODEL_S3_PATH.
!aws s3 cp {model_snapshot_path} s3://{sagemaker_default_bucket}/Foundation-Models/Qwen2.5_7B_it --recursive

upload: Qwen2.5_7B/models--Qwen--Qwen2.5-7B-Instruct/snapshots/bb46c15ee4bb56c5b63245ef50fd7637234d6f75/generation_config.json to s3://sagemaker-us-east-1-596899493901/Foundation-Models/Qwen2.5_7B_it/generation_config.json
upload: Qwen2.5_7B/models--Qwen--Qwen2.5-7B-Instruct/snapshots/bb46c15ee4bb56c5b63245ef50fd7637234d6f75/config.json to s3://sagemaker-us-east-1-596899493901/Foundation-Models/Qwen2.5_7B_it/config.json
upload: Qwen2.5_7B/models--Qwen--Qwen2.5-7B-Instruct/snapshots/bb46c15ee4bb56c5b63245ef50fd7637234d6f75/.gitattributes to s3://sagemaker-us-east-1-596899493901/Foundation-Models/Qwen2.5_7B_it/.gitattributes
upload: Qwen2.5_7B/models--Qwen--Qwen2.5-7B-Instruct/snapshots/bb46c15ee4bb56c5b63245ef50fd7637234d6f75/README.md to s3://sagemaker-us-east-1-596899493901/Foundation-Models/Qwen2.5_7B_it/README.md
upload: Qwen2.5_7B/models--Qwen--Qwen2.5-7B-Instruct/snapshots/bb46c15ee4bb56c5b63245ef50fd7637234d6f75/LICENSE to s3://sagemaker-us-east-1-596899493901/Foundation-Models/Q

In [7]:
# Delete the local download directory (same path as local_cache_path in the download cell).
!rm -rf ./Qwen2.5_7B

## Setup for wandb

In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.18.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Using cached GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.15.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Using cached setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Using cached gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Using cached smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.3-py3-none-manylinux_2_17_x86_64.manylinux2014

In [3]:
import wandb
# Log this notebook session in to Weights & Biases; the call's return value is
# the cell output (True in the recorded run).
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m407383787[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Submit Training job

In [9]:
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime


# --- Training cluster ---------------------------------------------------------
instance_type = 'ml.p4d.24xlarge'  # 8 x 40G
instance_count = 1
max_time = 200000  # forwarded to the estimator as max_run

# Let wandb prepare its credentials inside the training source directory
# (the same directory that is passed to the estimator as source_dir).
# NOTE(review): presumably this writes the API key to a file under that path - confirm.
wandb.sagemaker_auth(path="llama_factory/")

# Timestamp of this submission, printed for reference.
current_time = datetime.now()
formatted_time = current_time.strftime("%Y%m%d%H%M%S")
print(formatted_time)

base_job_name = 'Qwen25-7B-it-finetune'

# Environment variables handed to the training container.
environment = dict(
    NODE_NUMBER=str(instance_count),
    MODEL_S3_PATH=f's3://{sagemaker_default_bucket}/Foundation-Models/Qwen2.5_7B_it',  # source model files
    MODEL_LOCAL_PATH='/tmp/pretrain_model',
    OUTPUT_MODEL_S3_PATH=f's3://{sagemaker_default_bucket}/Qwen2.5_7B_it/QA_fake_sft/',  # destination
)

estimator = PyTorch(
    entry_point='entry.py',
    source_dir='llama_factory/',
    role=role,
    base_job_name=base_job_name,
    environment=environment,
    instance_type=instance_type,
    instance_count=instance_count,
    max_run=max_time,
    framework_version='2.1.0',
    py_version='py310',
    script_mode=True,
)

# No input channels are passed to fit(). Original note, kept for reference:
# data in a channel would be copied automatically to each node - /opt/ml/input/data/train1
# input_channel = {'train': f's3://{sagemaker_default_bucket}/datasets/qiandao/{version}/train.json'}
estimator.fit()

20241008090843


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: Qwen25-7B-it-finetune-2024-10-08-09-08-43-528


2024-10-08 09:08:50 Starting - Starting the training job
2024-10-08 09:08:50 Pending - Training job waiting for capacity......
2024-10-08 09:09:37 Pending - Preparing the instances for training..............................
2024-10-08 09:14:37 Downloading - Downloading input data...
2024-10-08 09:15:24 Downloading - Downloading the training image..................
2024-10-08 09:18:16 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-10-08 09:19:04,447 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-10-08 09:19:04,560 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-10-08 09:19:04,570 sagemaker_pytorch_container.training INFO     Block until al

In [17]:
./llama_factory/s5cmd sync s3://sagemaker-us-east-1-596899493901/Foundation-Models/meta_llama31_8_it/ ./

SyntaxError: invalid syntax (1685352357.py, line 1)

In [None]:
!