# Multi-Node Training with SageMaker Training Jobs

In [1]:
# ## Update sagemaker python sdk version
!pip install -U sagemaker

Collecting sagemaker
  Downloading sagemaker-2.242.0-py3-none-any.whl.metadata (16 kB)
Downloading sagemaker-2.242.0-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.241.0
    Uninstalling sagemaker-2.241.0:
      Successfully uninstalled sagemaker-2.241.0
Successfully installed sagemaker-2.242.0


## Set up model, code, and data

In [2]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session for this notebook, plus the execution role that is
# later handed to the estimator.
sess = sagemaker.Session()
role = get_execution_role()

# Default bucket and region are both read from the session; later cells use
# the bucket for the model upload and the training-job S3 paths.
sagemaker_default_bucket = sess.default_bucket()
region = sess.boto_session.region_name

print(f"sagemaker_default_bucket: {sagemaker_default_bucket}")
print(f"sagemaker_region: {region}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


sagemaker_default_bucket: sagemaker-us-east-1-596899493901
sagemaker_region: us-east-1


## Upload the pretrained model to S3

In [4]:
!pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
Installing collected packages: huggingface_hub
Successfully installed huggingface_hub-0.29.3


In [5]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Local Hugging Face cache directory that receives the base model files.
local_cache_path = Path("./deepseek_coder")
local_cache_path.mkdir(exist_ok=True)

model_name = "deepseek-ai/deepseek-coder-6.7b-base"

# "*" matches every file in the repository, so BOTH the *.safetensors and the
# pytorch_model-*.bin copies of the weights are downloaded (and uploaded to S3
# by the next cell). NOTE(review): if the training code needs only one weight
# format, narrow this list to cut the download roughly in half - confirm which
# format the training script loads before changing it.
allow_patterns = ["*"]

# snapshot_download returns the local folder of the snapshot it resolved.
model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_cache_path,
    allow_patterns=allow_patterns,
)

# Use the path returned for this exact download instead of globbing the cache
# and taking the first match, which could pick an arbitrary (possibly stale)
# snapshot when the cache holds more than one revision or model.
model_snapshot_path = Path(model_download_path)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.86k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

In [6]:
!aws s3 cp {model_snapshot_path} s3://{sagemaker_default_bucket}/Foundation-Models/deepseek_coder --recursive

upload: deepseek_coder/models--deepseek-ai--deepseek-coder-6.7b-base/snapshots/ce2207a8bfef3ee92bd7dd4cc31c52cfa0046912/README.md to s3://sagemaker-us-east-1-596899493901/Foundation-Models/deepseek_coder/README.md
upload: deepseek_coder/models--deepseek-ai--deepseek-coder-6.7b-base/snapshots/ce2207a8bfef3ee92bd7dd4cc31c52cfa0046912/generation_config.json to s3://sagemaker-us-east-1-596899493901/Foundation-Models/deepseek_coder/generation_config.json
upload: deepseek_coder/models--deepseek-ai--deepseek-coder-6.7b-base/snapshots/ce2207a8bfef3ee92bd7dd4cc31c52cfa0046912/.gitattributes to s3://sagemaker-us-east-1-596899493901/Foundation-Models/deepseek_coder/.gitattributes
upload: deepseek_coder/models--deepseek-ai--deepseek-coder-6.7b-base/snapshots/ce2207a8bfef3ee92bd7dd4cc31c52cfa0046912/LICENSE to s3://sagemaker-us-east-1-596899493901/Foundation-Models/deepseek_coder/LICENSE
upload: deepseek_coder/models--deepseek-ai--deepseek-coder-6.7b-base/snapshots/ce2207a8bfef3ee92bd7dd4cc31c52cfa

## Set up Weights & Biases (wandb)

In [7]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Using cached GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.23.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Using cached gitdb-4.0.12-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Using cached smmap-5.0.2-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [8]:
import wandb
# Log this notebook session in to Weights & Biases. As the recorded output
# shows, this prompts for an API key when no stored credentials are found.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m407383787[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Submit the training job

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime

# --- Cluster size, instance type and run-time limit -------------------------
instance_count = 1
instance_type = "ml.p4d.24xlarge"
# instance_type = "ml.g5.48xlarge"  # alternative: 8 * 24G
max_time = 200000  # handed to the estimator as max_run

# --- Submission timestamp (printed for reference only) ----------------------
current_time = datetime.now()
formatted_time = current_time.strftime("%Y%m%d%H%M%S")

# NOTE(review): presumably stores the W&B credentials under llama_factory/
# (the estimator's source_dir) so the training container can authenticate -
# confirm against the wandb SageMaker integration docs.
wandb.sagemaker_auth(path="llama_factory/")
print(formatted_time)

# --- Job name prefix and environment variables for the training container ---
base_job_name = "deepseek6-7B-finetune"
environment = dict(
    NODE_NUMBER=str(instance_count),
    # Source model files uploaded earlier in this notebook.
    MODEL_S3_PATH=f"s3://{sagemaker_default_bucket}/Foundation-Models/deepseek_coder",
    MODEL_LOCAL_PATH="/tmp/pretrain_model",
    # Destination for the fine-tuned model.
    OUTPUT_MODEL_S3_PATH=f"s3://{sagemaker_default_bucket}/deepseek-coder-6.7b-base/finetuned_model/",
)

# --- Estimator: runs entry.py from llama_factory/ in the PyTorch container --
estimator = PyTorch(
    entry_point="entry.py",
    source_dir="llama_factory/",
    role=role,
    base_job_name=base_job_name,
    environment=environment,
    framework_version="2.1.0",
    py_version="py310",
    script_mode=True,
    instance_count=instance_count,
    instance_type=instance_type,
    max_run=max_time,
)

# Data passed through an input channel would be copied to each node
# automatically (original note: /opt/ml/input/data/train1). No channel is
# used here, so fit() is called without inputs.
# input_channel = {'train': f's3://{sagemaker_default_bucket}/datasets/qiandao/{version}/train.json'}
estimator.fit()

20250318102900


2025-03-18 10:29:08 Starting - Starting the training job
2025-03-18 10:29:08 Pending - Training job waiting for capacity............
2025-03-18 10:30:42 Pending - Preparing the instances for training...........................
2025-03-18 10:35:20 Downloading - Downloading input data...
2025-03-18 10:35:45 Downloading - Downloading the training image...............
2025-03-18 10:38:16 Training - Training image download completed. Training in progress.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2025-03-18 10:39:14,862 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-03-18 10:39:14,976 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-03-18 10:39:14,985 sagemaker_pytorch_container.training INFO     Block until all