# Fine-tuning with LLaMA-Factory on SageMaker
# 1. Single GPU QLoRA - 本地notebook实例训练

## 安装依赖包

In [None]:
# Install or upgrade (-U) the SageMaker SDK, boto3 and datasets quietly (-q).
# NOTE(review): versions are not pinned, so results may vary between runs.
%pip install -Uq sagemaker boto3 datasets

In [None]:
# Pin torch to 2.2.0 — presumably to match framework_version='2.2.0' given to
# the PyTorch estimator later in this notebook; TODO confirm.
%pip install torch==2.2.0

In [None]:
import os
import glob
import boto3
import pprint
from tqdm import tqdm
import sagemaker
from sagemaker.collection import Collection
from sagemaker.utils import name_from_base

In [None]:
# Build the SageMaker session first; the region and default bucket come from it.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# Execution role of this notebook; handed to the estimator further down.
role = sagemaker.get_execution_role()

# Low-level SageMaker API client bound to the session's region.
sm_client = boto3.client("sagemaker", region_name=region)

## 准备数据集

### 数据集1. 从huggingface上下载ruozhiba数据集
- 该数据集有近5k条数据，本次实验我们只用其中一部分做训练（下文训练配置中通过 max_samples 设置为前200条）

In [6]:
from datasets import load_dataset
from random import randrange

# Fetch the training split from the hub, pinned to a fixed revision.
dataset_name = "hfl/ruozhiba_gpt4"
train_dataset = load_dataset(
    dataset_name,
    split="train",
    revision='41d2c61beb86c8d4c61916cc656c39d018c40ce5',
)

# Report the dataset size, then show one randomly picked record.
print(f"Training size: {len(train_dataset)}")
print("\nTraining sample:\n")
sample_index = randrange(len(train_dataset))
print(train_dataset[sample_index])

Training size: 4898

Training sample:

{'input': '', 'instruction': '最近怎么了，莫名其妙的想说脏话 可是我他妈是一个哑巴啊！', 'output': '情绪的表达并不总是需要通过语言，即使是哑巴，也有可能感受到想要表达强烈情绪或压力的冲动。这种想说脏话的冲动可能是由于内部情绪压力或者对某些情况的强烈反应。即便不能通过说话来表达，可能会通过手势、面部表情或其他非语言方式来表达自己的情绪。建议寻找适合自己的方式来管理和表达这些情绪，比如使用写作、艺术创作或与理解你的朋友通过其他方式沟通。'}


### 数据集2. 身份数据集
```json
[{'instruction': 'hi',
  'input': '',
  'output': 'Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?'},
 {'instruction': 'hello',
  'input': '',
  'output': 'Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?'},
 {'instruction': 'Who are you?',
  'input': '',
  'output': 'I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?'}]
```
把其中的name和author替换成您自己想替换的值，这样微调完成之后，问模型“你是谁，谁创造的你？”这类的身份问题，模型就会按这个新的值来回答

In [7]:
def format_identity(origin_obj, name, author):
    """Fill the ``{{name}}`` / ``{{author}}`` placeholders in identity records.

    Args:
        origin_obj: Iterable of dict records, each holding a string under
            the ``'output'`` key.
        name: Text substituted for every ``{{name}}`` placeholder.
        author: Text substituted for every ``{{author}}`` placeholder.

    Returns:
        A new list of new dicts. The records in ``origin_obj`` are left
        untouched, so the same template can be formatted again with other
        values.

    Raises:
        KeyError: If a record has no ``'output'`` key.
    """
    return [
        # Copy each record instead of assigning into it: writing to
        # ele['output'] would erase the placeholders in the caller's data.
        {**ele, 'output': ele['output'].replace("{{name}}", name).replace("{{author}}", author)}
        for ele in origin_obj
    ]

- 替换成您自己的设定

In [8]:
# Values substituted for the {{name}} and {{author}} placeholders of the identity dataset.
NAME, AUTHOR = "NANE", "CK"

In [10]:
# Show the current working directory (bare `pwd` relies on IPython automagic).
pwd

'/home/ec2-user/SageMaker/finetuning-on-aws/finetuning-llamafactory'

In [13]:
# Print the working directory, then switch to the project folder — presumably so
# the relative ./LLaMA-Factory and ./train paths used below resolve; confirm.
# NOTE(review): the path is specific to this notebook instance's layout.
!pwd
%cd ~/SageMaker/finetuning-on-aws/finetuning-llamafactory

/home/ec2-user/SageMaker/finetuning-on-aws/finetuning-llamafactory
/home/ec2-user/SageMaker/finetuning-on-aws/finetuning-llamafactory


In [14]:
# Confirm the working directory after the %cd in the previous cell.
!pwd

/home/ec2-user/SageMaker/finetuning-on-aws/finetuning-llamafactory


In [15]:
import json

# Read the identity template from LLaMA-Factory, then fill in NAME / AUTHOR.
file_name = './LLaMA-Factory/data/identity.json'
with open(file_name) as f:
    identity = json.loads(f.read())

identity_2 = format_identity(identity, name=NAME, author=AUTHOR)

# Preview the first two formatted records.
identity_2[:2]

[{'instruction': 'hi',
  'input': '',
  'output': 'Hello! I am NANE, an AI assistant developed by CK. How can I assist you today?'},
 {'instruction': 'hello',
  'input': '',
  'output': 'Hello! I am NANE, an AI assistant developed by CK. How can I assist you today?'}]

In [20]:
# Make sure the local staging folder exists, then write the formatted identity records as JSON.
os.makedirs('./train', exist_ok=True)
with open('./train/identity_2.json', 'w') as f:
    f.write(json.dumps(identity_2))

### 把数据copy至S3

In [21]:
# S3 prefix for the training data inside the session's default bucket.
s3_data_uri = "s3://" + default_bucket + "/dataset-for-training"
training_input_path = s3_data_uri + "/train"

In [22]:
# Show the S3 prefix the training files will be uploaded to.
print(training_input_path)

s3://sagemaker-us-west-2-342367142984/dataset-for-training/train


In [23]:
# save train_dataset to s3
train_dataset.to_json('./train/ruozhiba.json')
sagemaker.s3.S3Uploader.upload(local_path="./train/ruozhiba.json", desired_s3_uri=training_input_path, sagemaker_session=sagemaker_session)
sagemaker.s3.S3Uploader.upload(local_path="./train/identity_2.json", desired_s3_uri=training_input_path, sagemaker_session=sagemaker_session)

print(f"saving training dataset to: {training_input_path}")


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

saving training dataset to: s3://sagemaker-us-west-2-342367142984/dataset-for-training/train


## 准备LLaMA-Factory 的 dataset info

In [24]:
import json

In [25]:
# Read LLaMA-Factory's dataset registry so new entries can be added to it.
file_name = './LLaMA-Factory/data/dataset_info.json'
with open(file_name) as f:
    datainfo = json.loads(f.read())

In [26]:
# Point the 'identity' entry at the formatted identity file written earlier.
datainfo['identity'] = dict(file_name='identity_2.json')

In [27]:
# Register the ruozhiba file and map its fields to the prompt/query/response columns.
ruozhiba_columns = {"prompt": "instruction", "query": "input", "response": "output"}
datainfo['ruozhiba'] = {'file_name': 'ruozhiba.json', "columns": ruozhiba_columns}

In [28]:
# Write the updated registry back over LLaMA-Factory's dataset_info.json.
with open('./LLaMA-Factory/data/dataset_info.json', 'w') as f:
    json.dump(datainfo, f)

## 准备LLaMA-Factory 的 训练配置yaml文件
###  从LLaMA-Factory/examples/train_qlora/目录中复制出llama3_lora_sft_awq.yaml，并修改

In [29]:
#load template
import yaml
file_name = './LLaMA-Factory/examples/train_qlora/llama3_lora_sft_awq.yaml'
with open(file_name) as f:
    doc = yaml.safe_load(f)
doc

{'model_name_or_path': 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ',
 'stage': 'sft',
 'do_train': True,
 'finetuning_type': 'lora',
 'lora_target': 'all',
 'dataset': 'identity,alpaca_en_demo',
 'template': 'llama3',
 'cutoff_len': 1024,
 'max_samples': 1000,
 'overwrite_cache': True,
 'preprocessing_num_workers': 16,
 'output_dir': 'saves/llama3-8b/lora/sft',
 'logging_steps': 10,
 'save_steps': 500,
 'plot_loss': True,
 'overwrite_output_dir': True,
 'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 8,
 'learning_rate': 0.0001,
 'num_train_epochs': 3.0,
 'lr_scheduler_type': 'cosine',
 'warmup_ratio': 0.1,
 'bf16': True,
 'ddp_timeout': 180000000,
 'val_size': 0.1,
 'per_device_eval_batch_size': 1,
 'eval_strategy': 'steps',
 'eval_steps': 500}

In [30]:
#设置模型的保存目录在本notebook实例本地
save_dir = '/home/ec2-user/SageMaker/Easy_Fintune_LLM_using_SageMaker_with_LLama_Factory/finetuned_model'
# doc['output_dir'] = save_dir

# 如果是用SageMaker则使用以下模型文件路径
doc['output_dir'] ='/tmp/finetuned_model'
doc['per_device_train_batch_size'] =1
doc['gradient_accumulation_steps'] =8
# doc['lora_target'] = 'all'
doc['cutoff_len'] = 2048
doc['num_train_epochs'] = 5.0
doc['warmup_steps'] = 10

#实验时间，只选取前200条数据做训练
doc['max_samples'] = 200 
#数据集
doc['dataset'] = 'identity,ruozhiba'

### 保存为训练配置文件

In [31]:
# File name of the training config; the same name is passed to the training job via `environment`.
sg_config = 'sg_config_qlora.yaml'

# Serialize the config to YAML and write it into the LLaMA-Factory source directory.
with open(f'./LLaMA-Factory/{sg_config}', 'w') as f:
    f.write(yaml.safe_dump(doc))

# Display the final configuration.
doc

{'model_name_or_path': 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ',
 'stage': 'sft',
 'do_train': True,
 'finetuning_type': 'lora',
 'lora_target': 'all',
 'dataset': 'identity,ruozhiba',
 'template': 'llama3',
 'cutoff_len': 2048,
 'max_samples': 200,
 'overwrite_cache': True,
 'preprocessing_num_workers': 16,
 'output_dir': '/tmp/finetuned_model',
 'logging_steps': 10,
 'save_steps': 500,
 'plot_loss': True,
 'overwrite_output_dir': True,
 'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 8,
 'learning_rate': 0.0001,
 'num_train_epochs': 5.0,
 'lr_scheduler_type': 'cosine',
 'warmup_ratio': 0.1,
 'bf16': True,
 'ddp_timeout': 180000000,
 'val_size': 0.1,
 'per_device_eval_batch_size': 1,
 'eval_strategy': 'steps',
 'eval_steps': 500,
 'warmup_steps': 10}

## 本地GPU测试提交 Training job

### 由于我们的实验环境限制，无法提交Training Job，所以在本次实验是在notebook实例中进行训练
### 如果您在自己的AWS环境中，且有SageMaker Training Job 所需GPU实例的quota，则可以用如下代码提交，instance_type改成'ml.g5.2xlarge' 

```python
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime

instance_count = 1
instance_type = 'local_gpu' 
max_time = 3600*24

# Get the current time
current_time = datetime.now()

# wandb.sagemaker_auth(path="./")
# Format the current time as a string
formatted_time = current_time.strftime("%Y%m%d%H%M%S")
print(formatted_time)

base_job_name = 'llama3-8b-qlora-finetune'
environment = {
    'NODE_NUMBER':str(instance_count),
    "s3_data_paths":f"{training_input_path}",
    "sg_config":sg_config,
    'OUTPUT_MODEL_S3_PATH': f's3://{default_bucket}/llama3-8b-qlora/', # destination
}

estimator = PyTorch(entry_point='entry_single_lora.py',
                            source_dir='./LLaMA-Factory/',
                            role=role,
                            base_job_name=base_job_name,
                            environment=environment,
                            framework_version='2.2.0',
                            py_version='py310',
                            script_mode=True,
                            instance_count=instance_count,
                            instance_type=instance_type,
                            enable_remote_debug=True,
                            # keep_alive_period_in_seconds=600,
                            max_run=max_time)

estimator.fit()

```


In [32]:
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime

# Job sizing: a single instance with a 24-hour runtime limit.
instance_count = 1
# 'local_gpu' runs on this machine; another instance type such as ml.g5.2xlarge can be given instead.
instance_type = 'local_gpu'
max_time = 24 * 3600

# Timestamp of the current time as YYYYmmddHHMMSS, printed for reference.
# wandb.sagemaker_auth(path="./")
current_time = datetime.now()
formatted_time = current_time.strftime("%Y%m%d%H%M%S")
print(formatted_time)

base_job_name = 'llama3-8b-qlora-finetune'

# Settings handed to the training job through the estimator's `environment` argument.
environment = {
    'NODE_NUMBER': str(instance_count),
    "s3_data_paths": str(training_input_path),
    "sg_config": sg_config,
    # Destination for the trained model.
    'OUTPUT_MODEL_S3_PATH': f's3://{default_bucket}/llama3-8b-qlora/',
}

# PyTorch estimator that runs entry_single_lora.py from the LLaMA-Factory source directory.
estimator = PyTorch(
    entry_point='entry_single_lora.py',
    source_dir='./LLaMA-Factory/',
    framework_version='2.2.0',
    py_version='py310',
    script_mode=True,
    role=role,
    base_job_name=base_job_name,
    environment=environment,
    instance_count=instance_count,
    instance_type=instance_type,
    max_run=max_time,
    enable_remote_debug=True,
    # keep_alive_period_in_seconds=600,
)

20240829190926


- 开始训练

In [33]:
# Launch the training job. No input channels are passed to fit(); the S3 data
# prefix is supplied through the s3_data_paths entry of `environment` instead
# (presumably read by the entry script — confirm).
estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llama3-8b-qlora-finetune-2024-08-29-19-09-43-243
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.local.image:'Docker Compose' is not installed. Proceeding to check for 'docker-compose' CLI.
INFO:sagemaker.local.image:'Docker Compose' found using Docker Compose CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.imag

Login Succeeded


INFO:sagemaker.local.image:image pulled: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.2.0-gpu-py310
INFO:sagemaker.local.image:docker command: docker-compose -f /tmp/tmpq2u9bdu0/docker-compose.yaml up --build --abort-on-container-exit


 Container kqh8jyxm6x-algo-1-ci1oq  Creating
 Container kqh8jyxm6x-algo-1-ci1oq  Created
Attaching to kqh8jyxm6x-algo-1-ci1oq
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:39,776 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:39,797 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:39,806 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:39,809 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:39,812 sagemaker_pytorch_container.training INFO     Invoking user training script.
kqh8jyxm6x-algo-1-ci1oq  | 2024-08-29 19:13:40,934 botocore.credentials INFO     Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
kqh8jyxm6x-algo-1-ci1oq  | 2024-08

INFO:root:creating /tmp/tmpq2u9bdu0/artifacts/output/data
INFO:root:copying /tmp/tmpq2u9bdu0/algo-1-ci1oq/output/success -> /tmp/tmpq2u9bdu0/artifacts/output
INFO:sagemaker.local.image:===== Job Complete =====


kqh8jyxm6x-algo-1-ci1oq exited with code 0
Aborting on container exit...
 Container kqh8jyxm6x-algo-1-ci1oq  Stopping
 Container kqh8jyxm6x-algo-1-ci1oq  Stopped


In [36]:
# Display the S3 destination configured for the trained model.
environment['OUTPUT_MODEL_S3_PATH']

's3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/'

## 至此，本章节结束
- 模型已经在本地训练完成，并上传至S3，位置在: s3://{default_bucket}/llama3-8b-qlora/