## SageMaker managed training 테스트

- 먼저 모델과 데이터를 S3로 업로드합니다.
- HF transformers의 Trainer는 HF model hub에서 `model_id`로 모델을 직접 내려받아 학습할 수도 있지만, 모델을 S3에 올려 두고 이를 활용하는 편이 더 효율적입니다.


In [None]:
# Restore the variables that were saved earlier with IPython's `%store` magic.
# NOTE(review): `model_download_path` is used below without being assigned in
# the visible cells, so it presumably comes from this restore — confirm.
%store -r

In [None]:
# model_download_path = "/home/ec2-user/SageMaker/models/LDCC-SOLAR-10-7B/models--LDCC--LDCC-SOLAR-10.7B/snapshots/1055563879363d9ee2fba1d9fd1628eca6bcbb4e"
# Echo the local model directory that a later cell uploads to S3.
print(model_download_path)

In [None]:
import time
import sagemaker
import boto3
from sagemaker.utils import name_from_base
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from datasets import load_dataset, load_from_disk

# AWS context shared by the rest of the notebook.
role = sagemaker.get_execution_role()   # IAM role later handed to the estimator
region = boto3.Session().region_name    # region name used in the image URI and console links

# SageMaker session used for the S3 uploads, and its default bucket.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
# %%bash
# aws configure set default.s3.max_concurrent_requests 100
# aws configure set default.s3.max_queue_size 10000
# aws configure set default.s3.multipart_threshold 1GB
# aws configure set default.s3.multipart_chunksize 64MB

In [None]:
# Key prefix under which this project's artifacts are stored.
base_s3_path = "llm/med-finetune"
model_s3_path = f"{base_s3_path}/models"

# Upload the local model directory; the returned value is printed in the next cell.
s3_model_artifact = sess.upload_data(
    path=model_download_path,
    key_prefix=model_s3_path,
)

In [None]:
# s3_model_artifact = "s3://sagemaker-us-west-2-723597067299/llm/med-fientune/models"
# NOTE(review): the commented-out override above spells the prefix `med-fientune`,
# which differs from `base_s3_path` ("llm/med-finetune") — confirm before using it.
# Show where the model upload landed.
print(s3_model_artifact)

In [None]:
# Upload the local ./dataset directory under "<base_s3_path>/datasets".
data_s3_path = f"{base_s3_path}/datasets"
s3_data_artifact = sess.upload_data(
    path="./dataset",
    key_prefix=data_s3_path,
)

In [None]:
# s3_data_artifact = "s3://sagemaker-us-west-2-723597067299/llm/med-fientune/datasets"
# NOTE(review): the commented-out override above spells the prefix `med-fientune`,
# which differs from `base_s3_path` ("llm/med-finetune") — confirm before using it.
# Show where the dataset upload landed.
print(s3_data_artifact)

In [None]:
# Steps such as model merging need a generous amount of RAM, so size the instance accordingly.
instance_type = "ml.g5.4xlarge"

# Pick the session that matches the instance type: a regular SageMaker session
# for managed instances, a LocalSession for 'local' / 'local_gpu'.
if instance_type not in ('local', 'local_gpu'):
    sm_session = sagemaker.session.Session()
else:
    from sagemaker.local import LocalSession
    sm_session = LocalSession()
    sm_session.config = {'local': {'local_code': True}}

print(f"instance type : {instance_type}")

In [None]:

# Base name of the training job; it is passed to the estimator as `base_job_name`.
job_name = "ko-medllm-finetune"
# checkpoint_s3_uri = f"s3://{bucket}/llm/med-finetune/{job_name}/checkpoints"

# Training container image. Available images are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
image_uri = (
    f'763104351884.dkr.ecr.{region}.amazonaws.com/'
    'pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-sagemaker'
)

# No hyperparameters are passed to the training script.
hparams = {}

# Upper bound on the job's runtime, in seconds (12 hours).
max_run = 12 * 60 * 60

# max_wait only applies to spot training: spot waiting time + max runtime (12 hours).
use_spot_instances = False
max_wait = 12 * 60 * 60 if use_spot_instances else None

print(f"image uri : {image_uri}")

In [None]:

# Create the Estimator that describes the fine-tuning job.
# Fix: the keyword that turns the profiler off is `disable_profiler`; the
# previous spelling `disable_profile` is not an Estimator parameter, so the
# profiler was never actually disabled.
estimator = PyTorch(
    image_uri=image_uri,
    entry_point='run.sh',           # training entry point inside `source_dir`
    source_dir='src',               # directory which includes all the files needed for training
    instance_type=instance_type,    # instance type used for the training job
    instance_count=1,               # the number of instances used for training
    base_job_name=job_name,         # prefix of the generated training job name
    role=role,                      # IAM role used by the training job to access AWS resources, e.g. S3
    sagemaker_session=sm_session,   # SageMaker (or local) session
    volume_size=300,                # the size of the EBS volume in GB
    hyperparameters=hparams,
    debugger_hook_config=False,     # no Debugger hook
    disable_profiler=True,          # no profiler
    use_spot_instances=use_spot_instances,
    max_run=max_run,                # runtime limit in seconds
    max_wait=max_wait if use_spot_instances else None,  # only meaningful for spot training
    # checkpoint_s3_uri=checkpoint_s3_uri if instance_type not in ['local', 'local_gpu'] else None,
    # checkpoint_local_path='/opt/ml/checkpoints' if instance_type not in ['local', 'local_gpu'] else None,
)

In [None]:
# Toggle between local-mode inputs (file:// paths) and S3 inputs.
# NOTE(review): the estimator's session was chosen from `instance_type`, not
# from this flag — keep the two in sync when switching to local mode.
LOCAL_MODE = False

from IPython.display import display, HTML


def make_console_link(region, train_job_name, train_task='[Training]'):
    """Build HTML snippets that link to a training job in the AWS console.

    Args:
        region: AWS region name inserted into both console URLs.
        train_job_name: Training job name; used as the job page id and as the
            CloudWatch log-stream prefix.
        train_task: Label shown in front of each link.

    Returns:
        Tuple ``(train_job_link, cloudwatch_link)`` of HTML strings.
    """
    # target="_blank" is the HTML keyword for "open in a new tab"; the former
    # value "blank" only named a browsing context that every link reused.
    train_job_link = f'<b> {train_task} Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{train_job_name}">Training Job</a></b>'
    cloudwatch_link = f'<b> {train_task} Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={region}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={train_job_name};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a></b>'
    return train_job_link, cloudwatch_link


if LOCAL_MODE:
    # NOTE(review): `model_name` is not assigned in the visible cells —
    # presumably restored by `%store -r`; confirm before enabling local mode.
    estimator.fit(
        {
            "pretrained": f'file://../../models/{model_name}',  # adjust as appropriate when using local mode
            "training": 'file://./dataset',
        },
        wait=False,
    )
else:
    def fast_file(s3_uri):
        """Wrap an S3 URI in a TrainingInput that uses FastFile input mode."""
        return TrainingInput(s3_uri, input_mode="FastFile")

    estimator.fit(
        {
            "pretrained": fast_file(s3_model_artifact),
            "training": fast_file(s3_data_artifact),
        },
        wait=False,
    )

# Bound after either branch so the following cells can use the job name in
# local mode as well (it used to be assigned only in the non-local branch).
train_job_name = estimator.latest_training_job.job_name

if not LOCAL_MODE:
    # Console links only make sense for jobs that run in the SageMaker service.
    train_job_link, cloudwatch_link = make_console_link(region, train_job_name, '[Fine-tuning]')

    display(HTML(train_job_link))
    display(HTML(cloudwatch_link))

In [None]:
# Print the name of the training job captured in the previous cell.
print(f"Job name: {train_job_name}")


In [None]:
# Show the logs of the estimator's training job; `fit()` above was launched with wait=False.
# NOTE(review): presumably this call blocks until the job finishes — confirm against the SDK docs.
estimator.logs()

In [None]:
# S3 location of the model artifact produced by the training job; the next
# cell downloads it as a single model.tar.gz.
finetuned_model_s3_uri = estimator.model_data

In [None]:
import json, os

local_model_dir = 'model_from_sagemaker'

if not os.path.exists(local_model_dir):
    os.makedirs(local_model_dir)

!aws s3 cp {finetuned_model_s3_uri} {local_model_dir}/model.tar.gz
!tar -xzf {local_model_dir}/model.tar.gz -C {local_model_dir}
!rm {local_model_dir}/model.tar.gz

In [None]:
# Upload the extracted model directory under "<base_s3_path>/finetuned-model".
finetuned_model_s3_path = f"{base_s3_path}/finetuned-model"
finetuned_model_uri = sess.upload_data(
    path=local_model_dir,
    key_prefix=finetuned_model_s3_path,
)

In [None]:
# Show where the fine-tuned model upload landed.
print(finetuned_model_uri)

In [None]:
# Save `finetuned_model_uri` with IPython's `%store` magic so that another
# notebook can restore it via `%store -r`.
%store finetuned_model_uri