From cc7d18eeed18f10a6475617aad47e51202dc75fd Mon Sep 17 00:00:00 2001
From: iMountTai <2506700016@qq.com>
Date: Sun, 25 Jun 2023 23:24:16 +0800
Subject: [PATCH 1/2] fix training step

---
 scripts/training/run_pt.sh  | 7 +++----
 scripts/training/run_sft.sh | 9 ++++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/scripts/training/run_pt.sh b/scripts/training/run_pt.sh
index bfd986e..a2fef7e 100644
--- a/scripts/training/run_pt.sh
+++ b/scripts/training/run_pt.sh
@@ -11,8 +11,7 @@ dataset_dir=path/to/pt/data/dir
 data_cache=temp_data_cache_dir
 per_device_train_batch_size=1
 per_device_eval_batch_size=1
-training_steps=100
-gradient_accumulation_steps=1
+gradient_accumulation_steps=8
 output_dir=output_dir
 
 deepspeed_config_file=ds_zero2_no_offload.json
@@ -29,7 +28,7 @@ torchrun --nnodes 1 --nproc_per_node 1 run_clm_pt_with_peft.py \
     --do_train \
     --seed $RANDOM \
     --fp16 \
-    --max_steps ${training_steps} \
+    --num_train_epochs 1 \
     --lr_scheduler_type cosine \
     --learning_rate ${lr} \
     --warmup_ratio 0.05 \
@@ -38,7 +37,7 @@ torchrun --nnodes 1 --nproc_per_node 1 run_clm_pt_with_peft.py \
     --logging_steps 10 \
     --save_strategy steps \
     --save_total_limit 3 \
-    --save_steps 500 \
+    --save_steps 200 \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --preprocessing_num_workers 8 \
     --block_size 512 \
diff --git a/scripts/training/run_sft.sh b/scripts/training/run_sft.sh
index 13925b4..df27aef 100644
--- a/scripts/training/run_sft.sh
+++ b/scripts/training/run_sft.sh
@@ -10,8 +10,7 @@ chinese_tokenizer_path=path/to/chinese/llama/tokenizer/dir
 dataset_dir=path/to/sft/data/dir
 per_device_train_batch_size=1
 per_device_eval_batch_size=1
-training_steps=100
-gradient_accumulation_steps=1
+gradient_accumulation_steps=8
 output_dir=output_dir
 peft_model=path/to/peft/model/dir
 validation_file=validation_file_name
@@ -30,7 +29,7 @@ torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
     --do_eval \
     --seed $RANDOM \
     --fp16 \
-    --max_steps ${training_steps} \
+    --num_train_epochs 3 \
     --lr_scheduler_type cosine \
     --learning_rate ${lr} \
     --warmup_ratio 0.03 \
@@ -40,8 +39,8 @@ torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
     --save_strategy steps \
     --save_total_limit 3 \
     --evaluation_strategy steps \
-    --eval_steps 250 \
-    --save_steps 500 \
+    --eval_steps 100 \
+    --save_steps 200 \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --preprocessing_num_workers 8 \
     --max_seq_length 512 \

From c031387ffc1cf9ca23a0b3be4e91a593c7df7898 Mon Sep 17 00:00:00 2001
From: iMountTai <2506700016@qq.com>
Date: Mon, 26 Jun 2023 08:39:12 +0800
Subject: [PATCH 2/2] fix training steps

---
 scripts/training/run_sft.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/training/run_sft.sh b/scripts/training/run_sft.sh
index df27aef..db31a6c 100644
--- a/scripts/training/run_sft.sh
+++ b/scripts/training/run_sft.sh
@@ -29,7 +29,7 @@ torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
     --do_eval \
     --seed $RANDOM \
     --fp16 \
-    --num_train_epochs 3 \
+    --num_train_epochs 1 \
     --lr_scheduler_type cosine \
     --learning_rate ${lr} \
     --warmup_ratio 0.03 \
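
Editorial note (not part of the patches above): a minimal sketch, assuming the single-node, single-GPU torchrun invocation shown in the scripts, of the effective-batch-size arithmetic that the gradient_accumulation_steps=8 setting implies. With the Hugging Face Trainer, the number of samples consumed per optimizer step is per_device_train_batch_size * gradient_accumulation_steps * number-of-processes; the variable names below mirror the scripts, and nproc_per_node is taken from the --nproc_per_node flag.

    #!/usr/bin/env bash
    # Illustrative only: compute the effective global batch size per optimizer step.
    per_device_train_batch_size=1   # as set in run_pt.sh / run_sft.sh
    gradient_accumulation_steps=8   # new value introduced by the patch
    nproc_per_node=1                # matches "torchrun --nproc_per_node 1"
    echo "effective batch size: $(( per_device_train_batch_size * gradient_accumulation_steps * nproc_per_node ))"
    # prints: effective batch size: 8

With these defaults each optimizer step sees 8 samples, so --num_train_epochs controls the total training length instead of the removed fixed --max_steps budget.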