# Running RWKV on Amazon SageMaker

This is sample code to fine-tune RWKV on Amazon SageMaker, for demo or research use only!

In [None]:
## Update sagemaker python sdk version
!pip install -U sagemaker

In [13]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# SageMaker session and the execution role used by the rest of the notebook.
sess = sagemaker.Session()
role = get_execution_role()

# Default bucket of the session; printed so it can be checked at a glance.
sagemaker_default_bucket = sess.default_bucket()
print(sagemaker_default_bucket)

# Account id (looked up through STS) and region, both read from the
# boto3 session that backs the SageMaker session.
boto_sess = sess.boto_session
caller_identity = boto_sess.client("sts").get_caller_identity()
account = caller_identity["Account"]
region = boto_sess.region_name

sagemaker-us-east-1-427169985960


In [4]:
## check data
!head -n 20 ./src/train_dataset_en.jsonl

{"text":"Chocola: G-Good morning... to you...\n\nVanilla: Good morning...\n\nShigure: Good morning, Chocola, Vanilla.\nDid you get any sleep last night?\n\nChocola: Oh, y-yes... I got... sleep.\n\nVanilla: Mm-hmm, more than before...\n\nAzuki: Hey, ChocoVani. Your seats are over here, over here.\nC'mere and sit down.\n\nChocola: Oh, r-right...!\n\nVanilla: M-Mm-hmm, okay...\n\nShigure: Chocola and Vanilla, are you in the mood for milk this morning?\n\nChocola: Y-Yes... I would love some milk...\n\nVanilla: Me, too. I'm fine with milk...\n\nShigure: Hehe. Chocola, Vanilla, you two are already one of ours, you know.\nYou don't have to be so nervous around us.\n\nChocola: Y-Yes, um... erm, okay...\n\nVanilla: Uh-huh... Erm, uh-huh...\n\nCoconut: Y-You guys just got here, after all, right? You'll get used to it in no time, so don't worry!\nI'm new here at the Minaduki household myself, but everyone's been so kind!\n\nAzuki: You've never been shy from the start, Nuts. Any different would be

**Generate a sample dataset for debugging.** Replace the `xxxxx` placeholder in the cell below with your own records before running it.

In [None]:
%%writefile src/sample_dataset.json
[
    xxxxx
]

## Download pretrained model from HuggingFace Hub

To avoid failures when downloading the model from the Hugging Face Hub, we download the model files here first and then push them to an S3 bucket.

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Local directory handed to snapshot_download as its cache directory.
local_cache_path = Path("./model")
local_cache_path.mkdir(exist_ok=True)

# The 1B5 v12 checkpoint is also published as a raw .pth file:
# https://huggingface.co/BlinkDL/rwkv-4-raven/blob/main/RWKV-4-Raven-1B5-v12-Eng98%25-Other2%25-20230520-ctx4096.pth
model_name = "RWKV/rwkv-raven-1b5"

# Restrict the download to files matching these patterns.
allow_patterns = ["*.json", "*.pt", "*.bin", "*.model"]
# allow_patterns = ["RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth"]

# Collect the call arguments first, then run the download in one step.
snapshot_options = dict(
    repo_id=model_name,
    cache_dir=local_cache_path,
    allow_patterns=allow_patterns,
)
model_download_path = snapshot_download(**snapshot_options)

**Upload model files to S3**

In [None]:
# Get the model files path
import os
from glob import glob


def find_model_dir(search_root, marker="config.json"):
    """Return the first directory under ``search_root`` that contains ``marker``.

    The returned path ends with a path separator (e.g. ``"./model/a/b/"``),
    which is the same shape the previous string-slicing code produced.

    Args:
        search_root: Directory to walk recursively.
        marker: File name whose presence identifies the model directory.

    Returns:
        The directory path with a trailing separator, or ``None`` when no
        file named ``marker`` exists below ``search_root``.
    """
    for root, _dirs, files in os.walk(search_root):
        if marker in files:
            # Joining with "" appends the trailing separator without relying
            # on the length of the marker file name.
            return os.path.join(root, "")
    return None


local_model_path = find_model_dir(r'./model')

if local_model_path is None:
    print("Model download may failed, please check prior step!")
else:
    print(local_model_path)

**Rewrite upload module**

In [6]:
!wget https://huggingface.co/BlinkDL/rwkv-4-raven/resolve/main/RWKV-4-Raven-1B5-v12-Eng98%25-Other2%25-20230520-ctx4096.pth

--2023-06-14 15:53:13--  https://huggingface.co/BlinkDL/rwkv-4-raven/resolve/main/RWKV-4-Raven-1B5-v12-Eng98%25-Other2%25-20230520-ctx4096.pth
Resolving huggingface.co (huggingface.co)... 108.138.64.87, 108.138.64.67, 108.138.64.89, ...
Connecting to huggingface.co (huggingface.co)|108.138.64.87|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/41/55/4155c7aaff64e0f4b926df1a8fff201f8ee3653c39ba67b31e4973ae97828633/6bbbffb3ee2372dfa9ef49c599e9a2bc0a01b94b6a264ba9bf5bd524fc38f723?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27RWKV-4-Raven-1B5-v12-Eng98%2525-Other2%2525-20230520-ctx4096.pth%3B+filename%3D%22RWKV-4-Raven-1B5-v12-Eng98%25-Other2%25-20230520-ctx4096.pth%22%3B&Expires=1687017194&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzQxLzU1LzQxNTVjN2FhZmY2NGUwZjRiOTI2ZGYxYThmZmYyMDFmOGVlMzY1M2MzOWJhNjdiMzFlNDk3M2FlOTc4Mjg2MzMvNmJiYmZmYjNlZTIzNzJkZmE5ZWY0OW

In [15]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket local_model_path=$local_model_path bash
# Upload the RWKV .pth checkpoint from the current directory to S3.
pwd
chmod +x s5cmd
# Use the bucket passed in through the environment instead of a hard-coded,
# account-specific bucket name, so the cell works in any account/region.
./s5cmd sync "RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth" "s3://${sagemaker_default_bucket}/RWKV/1B5/"

#rm -rf model

/home/ec2-user/SageMaker/RWKV-on-amazon-sagemaker
cp RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth s3://sagemaker-us-east-1-427169985960/RWKV/1B5/RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth


In [None]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket local_model_path=$local_model_path bash

# Upload the downloaded model directory to S3.
# Refuse to run when no model directory was found: an empty value, or the
# literal string "None" that a Python None is passed through as.
if [ -z "${local_model_path}" ] || [ "${local_model_path}" = "None" ]; then
    echo "local_model_path is not set; run the model download cells first." >&2
    exit 1
fi

chmod +x s5cmd
# Quoted so that paths containing spaces or glob characters are passed intact.
./s5cmd sync "${local_model_path}" "s3://${sagemaker_default_bucket}/RWKV/1B5/"

#rm -rf model

## Prepare a docker image

In [26]:
%%writefile Dockerfile
## Training image for the RWKV demo.
##
## The active base image is NVIDIA's PyTorch container (nvcr.io). The commented-out
## FROM lines below are earlier alternatives hosted in ECR; if you switch back to
## one of them, change the region code in its URI to the region you use.
#From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04
#From 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04
#From 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04
FROM nvcr.io/nvidia/pytorch:23.02-py3
## NOTE(review): presumably the toolkit that lets SageMaker run the entry point
## inside this custom image - confirm.
RUN pip3 install sagemaker-training

## Disabled experiment: remove CUDA 11.8 from the image.
#Remove Cuda 11.8
#RUN apt-get -y purge cuda*
#RUN apt-get -y autoremove
#RUN apt-get -y autoclean
#RUN rm -rf /usr/local/cuda*

## Disabled experiment: install CUDA 12.1 from NVIDIA's local repo package.
#install Cuda 12
#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
#RUN mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
#RUN wget https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_amd64.deb
#RUN dpkg -i cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_amd64.deb
#RUN cp /var/cuda-repo-ubuntu2004-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
#RUN apt-get update
#RUN apt-get -y install cuda

## UTF-8 locale; unbuffered Python output; do not write .pyc files.
ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

## Print the current "cuda" alternative to the build log, then switch it to automatic mode.
## NOTE(review): presumably related to the CUDA path problem that train.sh links to
## (BlinkDL/RWKV-LM issue 129) - confirm.
RUN update-alternatives --display cuda
RUN update-alternatives --auto cuda

## Remove any DeepSpeed already present and install the current release from PyPI.
## NOTE(review): the version is not pinned, so rebuilds may pick up a different release.
RUN python3 -m pip uninstall -y deepspeed 
#RUN python3 -m pip install deepspeed==0.7.0
RUN python3 -m pip install deepspeed 
## pytorch-lightning is pinned to 1.9.0.
RUN python3 -m pip install pytorch-lightning==1.9.0
## Disabled: install a transformers revision that supports LLaMaTokenizer.
#RUN python3 -m pip install git+https://github.com/huggingface/transformers.git@68d640f7c368bcaaaecfc678f11908ebbd3d6176

## Make all local GPUs visible
ENV NVIDIA_VISIBLE_DEVICES="all"

Overwriting Dockerfile


In [27]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


**Build image and push to ECR.**

In [28]:
## Name of the ECR repository to build and push to; it should contain *sagemaker*.
repo_name = 'sagemaker-rwkv-demo'

In [29]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# Build the Docker image from ./Dockerfile and push it to ECR so that it is
# ready for use by SageMaker.
#
# Input: the `repo_name` environment variable (set on the %%script line above).
# It is used as the local image name and, combined with the account and region,
# forms the repository name in ECR.

# Stop at the first failing command so a failed build is never tagged or pushed.
set -euo pipefail

# The name of our algorithm
algorithm_name="${repo_name:?repo_name must be set}"

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-east-1 if
# none defined). `aws configure get` exits non-zero when the value is unset,
# hence the `|| true` under `set -e`.
region=$(aws configure get region || true)
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log in to the registry host (docker login takes a server, not an image reference).
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

Login Succeeded
Sending build context to Docker daemon  3.085GB
Step 1/11 : FROM nvcr.io/nvidia/pytorch:23.02-py3
 ---> 7c3375e220ea
Step 2/11 : RUN pip3 install sagemaker-training
 ---> Using cache
 ---> a85f091728b8
Step 3/11 : ENV LANG=C.UTF-8
 ---> Using cache
 ---> 4e51217d1990
Step 4/11 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> be65ae8c46d8
Step 5/11 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 7490dc715d8b
Step 6/11 : RUN update-alternatives --display cuda
 ---> Using cache
 ---> 51d1c3f8d7cc
Step 7/11 : RUN update-alternatives --auto cuda
 ---> Using cache
 ---> 43274913e560
Step 8/11 : RUN python3 -m pip uninstall -y deepspeed
 ---> Using cache
 ---> 43c3abb8c48d
Step 9/11 : RUN python3 -m pip install deepspeed
 ---> Using cache
 ---> a75121332bf6
Step 10/11 : RUN python3 -m pip install pytorch-lightning==1.9.0
 ---> Using cache
 ---> ec883311c30d
Step 11/11 : ENV NVIDIA_VISIBLE_DEVICES="all"
 ---> Using cache
 ---> 2e5063853c12
Successfully built 2e506

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [30]:
!service docker stop
!service docker start

Redirecting to /bin/systemctl stop docker.service
Failed to stop docker.service: The name org.freedesktop.PolicyKit1 was not provided by any .service files
See system logs and 'systemctl status docker.service' for details.
  docker.socket
Redirecting to /bin/systemctl start docker.service
Failed to start docker.service: The name org.freedesktop.PolicyKit1 was not provided by any .service files
See system logs and 'systemctl status docker.service' for details.


### Modify the DeepSpeed config to save the model properly.

We will set ```stage3_gather_16bit_weights_on_model_save``` to ```True```.

In [None]:
import json

ds_config_file = './src/stanford_alpaca/configs/default_offload_opt_param.json'

# Load the current DeepSpeed configuration; the `with` block closes the file.
with open(ds_config_file, 'rb') as config_in:
    ds_config = json.loads(config_in.read())

# Switch the flag on inside the existing "zero_optimization" section.
ds_config['zero_optimization'].update(stage3_gather_16bit_weights_on_model_save=True)

# Write the updated configuration back to the same file, indented by 2 spaces.
with open(ds_config_file, 'w') as config_out:
    config_out.write(json.dumps(ds_config, indent=2))

### Generate training entrypoint script

**Note: DO NOT CHANGE THE VALUES OF "output_dir" AND "cache_dir" BELOW; keep them as "/tmp/llama_out" and "/tmp".**

The script below is only a test that fine-tunes on a sample dataset (just 8 samples); you can change ```data_path``` to point at your own dataset for further fine-tuning.

To download the dataset, you can follow the same approach used to download the pretrained model:
```
./s5cmd sync s3://$MODEL_S3_BUCKET/llama/pretrain/7B/* /tmp/llama_pretrain/
```

It is recommended to use the folder ```/tmp/dataset/```.

In [31]:
!cd src/ && git clone https://github.com/Blealtan/RWKV-LM-LoRA

Cloning into 'RWKV-LM-LoRA'...
remote: Enumerating objects: 1969, done.[K
remote: Counting objects: 100% (710/710), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 1969 (delta 639), reused 570 (delta 540), pack-reused 1259[K
Receiving objects: 100% (1969/1969), 11.20 MiB | 18.15 MiB/s, done.
Resolving deltas: 100% (1258/1258), done.


In [67]:
%%writefile src/train.sh
#!/bin/bash
#
# Training entry point:
#   1. download the pretrained RWKV checkpoint from S3 to /tmp/rwkv/
#   2. run train.py (LoRA fine-tuning) for at most a fixed wall-clock budget
#   3. upload the resulting *.pth files to S3 under a timestamped prefix
#
# Environment:
#   MODEL_S3_BUCKET           bucket holding RWKV/1B5/ and receiving RWKV/output/ (required)
#   TRAIN_TIME_LIMIT_SECONDS  wall-clock budget for train.py in seconds (default: 900)

: "${MODEL_S3_BUCKET:?MODEL_S3_BUCKET must be set}"
TRAIN_TIME_LIMIT_SECONDS="${TRAIN_TIME_LIMIT_SECONDS:-900}"

chmod +x ./s5cmd
# Quoted so the shell leaves the wildcard for s5cmd to expand against S3.
./s5cmd sync "s3://${MODEL_S3_BUCKET}/RWKV/1B5/*" /tmp/rwkv/ || {
    echo "Failed to download the pretrained model from S3" >&2
    exit 1
}

cd RWKV-LM-LoRA/RWKV-v4neo/ || {
    echo "RWKV-LM-LoRA/RWKV-v4neo/ not found in the source directory" >&2
    exit 1
}

#update-alternatives --display cuda
#update-alternatives --auto cuda
#to resolve issue https://github.com/BlinkDL/RWKV-LM/issues/129
#export CUDA_HOME=/usr/local/cuda
#export PATH=/usr/local/cuda/bin:$PATH
#export CPATH=/usr/local/cuda/include:$CPATH
#export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
#export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH

# Run training in the foreground under `timeout` instead of backgrounding it and
# killing it after a sleep: timeout sends SIGTERM once the budget is used up
# (SIGKILL 60 s later if still alive) and only returns after train.py has
# exited, so the exit status below really describes the training run and the
# upload never starts while training is still running.
RWKV_CUDA_ON=1 timeout --kill-after=60 "${TRAIN_TIME_LIMIT_SECONDS}" python3 train.py  \
    --load_model "/tmp/rwkv/RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth" \
    --proj_dir "./" \
    --data_file "./test_text_document" \
    --data_type "binidx" \
    --vocab_size 50277  \
    --ctx_len 4096 \
    --epoch_save 100 \
    --epoch_count 10 \
    --n_layer 24 \
    --n_embd 2048 \
    --epoch_steps 1000 \
    --epoch_begin 0 \
    --micro_bsz 1  \
    --pre_ffn 0 \
    --head_qk 0 \
    --lr_init 1e-4 \
    --lr_final 1e-5 \
    --warmup_steps 0 \
    --beta1 0.9 \
    --beta2 0.999 \
    --adam_eps 1e-8 \
    --accelerator gpu \
    --devices 8 \
    --precision bf16 \
    --strategy deepspeed_stage_2 \
    --grad_cp 1   \
    --lora_parts=att,ffn,time,ln \
    --lora_r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.01
train_status=$?

# timeout exit codes: 124 = time limit reached, 137 = SIGKILL was needed.
case "${train_status}" in
    0)
        echo "Training finished within the time limit"
        ;;
    124)
        echo "Training stopped after the ${TRAIN_TIME_LIMIT_SECONDS}s time limit"
        ;;
    137)
        echo "Training had to be killed (SIGKILL); uploaded checkpoints may be incomplete" >&2
        ;;
    *)
        echo "Training script error, please check CloudWatch logs"
        exit 1
        ;;
esac

# Upload whatever checkpoints exist; fail clearly when there are none instead of
# handing an unmatched glob to s5cmd.
shopt -s nullglob
checkpoints=(./*.pth)
if [ "${#checkpoints[@]}" -eq 0 ]; then
    echo "No .pth checkpoint was produced, nothing to upload" >&2
    exit 1
fi

../../s5cmd sync "${checkpoints[@]}" "s3://${MODEL_S3_BUCKET}/RWKV/output/$(date +%Y-%m-%d-%H-%M-%S)/"

Overwriting src/train.sh


In [33]:
## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account, region, repo_name)
image_uri

'427169985960.dkr.ecr.us-east-1.amazonaws.com/sagemaker-rwkv-demo:latest'

**The modified training script**

Everything is ready, let's launch the training job.

## Create SageMaker Training Job

In [None]:
## Give every user read/write/execute permission on ./lost+found.
## NOTE(review): presumably needed so that tools scanning the working directory
## (e.g. `docker build .`) do not fail on it - TODO confirm.
!sudo chmod 777 lost+found

In [35]:
## Recursively give every user read/write/execute permission on ./src, the
## directory passed to the Estimator as `source_dir`.
## NOTE(review): 777 is very permissive; presumably a narrower mode would do - verify.
!sudo chmod -R 777 ./src

In [68]:
import time
from sagemaker.estimator import Estimator

# Environment variables passed to the training job.
environment = {
    # The bucket to store pretrained model and fine-tune model
    'MODEL_S3_BUCKET': sagemaker_default_bucket,
}

base_job_name = 'rwkv-demo'

# Instance type for the job; the alternatives are kept commented out.
#instance_type = 'ml.p3dn.24xlarge'
instance_type = 'ml.p4d.24xlarge'
#instance_type = 'ml.g5.12xlarge'

# Collect the Estimator arguments in one place, then build the estimator.
estimator_kwargs = dict(
    role=role,
    entry_point='train.sh',
    source_dir='./src',
    base_job_name=base_job_name,
    instance_count=1,
    instance_type=instance_type,
    image_uri=image_uri,
    environment=environment,
    disable_profiler=True,
    debugger_hook_config=False,
    keep_alive_period_in_seconds=3600,
)
estimator = Estimator(**estimator_kwargs)

# Launch the training job (no input channels are passed).
estimator.fit()
# estimator.fit(inputs)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: rwkv-demo-2023-06-16-09-48-25-435


2023-06-16 09:48:31 Starting - Starting the training job...
2023-06-16 09:48:40 Downloading - Downloading input data
[34m== PyTorch ==[0m
[34mNVIDIA Release 23.02 (build 53420872)[0m
[34mPyTorch Version 1.14.0a0+44dac51[0m
[34mContainer image Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.[0m
[34mCopyright (c) 2014-2023 Facebook Inc.[0m
[34mCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)[0m
[34mCopyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)[0m
[34mCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)[0m
[34mCopyright (c) 2011-2013 NYU                      (Clement Farabet)[0m
[34mCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)[0m
[34mCopyright (c) 2006      Idiap Research Institute (Samy Bengio)[0m
[34mCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)[0m
[34mCopyright (c) 201