Update axolotl for newer architectures
* Update requirements, cleanup README

* Add unsloth options to config

* wip - try chat template

* Update requirements and try unsloth

* disable unsloth, it does not play well with deepspeed

* Cleanup unnecessary code

* Remove import

* Add chat template selection logic

* more fixes

* Revert truefoundry lib to stable version

* Update config, readme, sample run
chiragjn committed Jun 7, 2024
1 parent 028db8e commit 91f7fbe
Showing 11 changed files with 130 additions and 387 deletions.
Dockerfile (6 changes: 3 additions & 3 deletions)
@@ -1,5 +1,5 @@
-# https://hub.docker.com/layers/winglian/axolotl/main-20240423-py3.11-cu121-2.2.1/images/sha256-fc2b9d2b1e46d6b7c47c28a65d2c1d2c3ae4f032fafef27ffaf6ec63bf442f44?context=explore
-FROM --platform=linux/amd64 winglian/axolotl@sha256:e0b5b8a94934aaf183932c66ab3ce3ad822e91e19341ade8dbf9eccd9339d799
+# https://hub.docker.com/layers/winglian/axolotl/main-20240603-py3.11-cu121-2.3.0/images/sha256-e4b898a0f700eb86f9e802bb85c1ec6c509b2dec65d941ad43405fe323865017?context=explore
+FROM --platform=linux/amd64 winglian/axolotl@sha256:a66d1469cdad472779f6419ea67d0fbb2cce984244aa86f40c99abaa4a21b3db
USER root
COPY requirements.txt /tmp/
RUN pip install -U pip wheel setuptools && \
@@ -9,7 +9,7 @@ RUN mkdir -p /packages && \
cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
-git checkout 4e8264e937571c53b9dc75345a14d4b9b9d68c4f
+git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/requirements.txt && \
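The image can be built locally with a standard Docker invocation (the tag here is just an example):

```
docker build --platform linux/amd64 -t llm-finetune:local .
```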
Dockerfile-notebook (4 changes: 2 additions & 2 deletions)
@@ -1,4 +1,4 @@
-FROM truefoundrycloud/jupyter:0.2.17-sudo
+FROM truefoundrycloud/jupyter:0.2.19-sudo
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV DEBIAN_FRONTEND=noninteractive
USER root
@@ -21,7 +21,7 @@ USER jovyan
RUN cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
-git checkout 4e8264e937571c53b9dc75345a14d4b9b9d68c4f
+git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
README.md (297 changes: 9 additions & 288 deletions)
@@ -1,301 +1,22 @@
Axolotl config options

<details>
<summary>Click to expand all axolotl options</summary>

Dumping these here because some of them are not documented upstream

```
cfg.adam_beta1
cfg.adam_beta2
cfg.adam_epsilon
cfg.adapter
cfg.auto_resume_from_checkpoints
cfg.axolotl_config_path
cfg.base_model
cfg.base_model_config
cfg.batch_size
cfg.bench_dataset
cfg.bf16
cfg.bfloat16
cfg.bnb_config_kwargs
cfg.chat_template
cfg.conversation
cfg.cosine_min_lr_ratio
cfg.dataloader_drop_last
cfg.dataloader_num_workers
cfg.dataloader_pin_memory
cfg.dataloader_prefetch_factor
cfg.dataset_keep_in_memory
cfg.dataset_prepared_path
cfg.dataset_processes
cfg.dataset_shard_idx
cfg.dataset_shard_num
cfg.datasets
cfg.ddp
cfg.ddp_broadcast_buffers
cfg.ddp_bucket_cap_mb
cfg.ddp_timeout
cfg.debug
cfg.deepspeed
cfg.default_system_message
cfg.device
cfg.device_map
cfg.do_bench_eval
cfg.dpo_beta
cfg.dpo_label_smoothing
cfg.eager_attention
cfg.early_stopping_patience
cfg.eval_batch_size
cfg.eval_sample_packing
cfg.eval_steps
cfg.eval_table_max_new_tokens
cfg.eval_table_size
cfg.evals_per_epoch
cfg.evaluation_strategy
cfg.field_input
cfg.field_instruction
cfg.field_output
cfg.field_system
cfg.flash_attention
cfg.flash_attn_cross_entropy
cfg.flash_attn_fuse_mlp
cfg.flash_attn_fuse_qkv
cfg.flash_attn_rms_norm
cfg.flash_optimum
cfg.float16
cfg.format
cfg.fp16
cfg.fsdp
cfg.fsdp_config
cfg.gptq
cfg.gptq_disable_exllama
cfg.gpu_memory_limit
cfg.gradient_accumulation_steps
cfg.gradient_checkpointing
cfg.gradient_checkpointing_kwargs
cfg.greater_is_better
cfg.group_by_length
cfg.hf_use_auth_token
cfg.hub_model_id
cfg.hub_strategy
cfg.is_falcon_derived_model
cfg.is_file
cfg.is_llama_derived_model
cfg.is_mistral_derived_model
cfg.is_preprocess
cfg.is_qwen_derived_model
cfg.learning_rate
cfg.load_best_model_at_end
cfg.load_in_4bit
cfg.load_in_8bit
cfg.local_rank
cfg.logging_steps
cfg.lora_alpha
cfg.lora_dropout
cfg.lora_fan_in_fan_out
cfg.lora_model_dir
cfg.lora_modules_to_save
cfg.lora_on_cpu
cfg.lora_r
cfg.lora_target_linear
cfg.lora_target_modules
cfg.loss_watchdog_patience
cfg.loss_watchdog_threshold
cfg.lr_quadratic_warmup
cfg.lr_scheduler
cfg.lr_scheduler_kwargs
cfg.max_grad_norm
cfg.max_memory
cfg.max_packed_sequence_len
cfg.max_steps
cfg.merge_lora
cfg.metric_for_best_model
cfg.micro_batch_size
cfg.mlflow_experiment_name
cfg.model_config
cfg.model_config_type
cfg.model_kwargs
cfg.model_revision
cfg.model_type
cfg.neftune_noise_alpha
cfg.no_input_format
cfg.noisy_embedding_alpha
cfg.num_epochs
cfg.optimizer
cfg.output_dir
cfg.pad_to_sequence_len
cfg.path
cfg.peft
cfg.peft_adapter
cfg.peft_layers_to_transform
cfg.precompute_ref_log_probs
cfg.pretraining_dataset
cfg.push_dataset_to_hub
cfg.push_to_hub_model_id
cfg.read_text
cfg.relora_cpu_offload
cfg.relora_steps
cfg.relora_warmup_steps
cfg.remove_unused_columns
cfg.resize_token_embeddings_to_32x
cfg.resume_from_checkpoint
cfg.rl
cfg.rl_adapter_ref_model
cfg.rope_scaling
cfg.s2_attention
cfg.sample_packing
cfg.sample_packing_eff_est
cfg.save_safetensors
cfg.save_steps
cfg.save_strategy
cfg.save_total_limit
cfg.saves_per_epoch
cfg.sdp_attention
cfg.seed
cfg.sequence_len
cfg.special_tokens
cfg.strict
cfg.system_format
cfg.system_prompt
cfg.test_datasets
cfg.tf32
cfg.tokenizer_config
cfg.tokenizer_legacy
cfg.tokenizer_type
cfg.tokenizer_use_fast
cfg.tokens
cfg.torch_compile
cfg.torch_compile_backend
cfg.torch_dtype
cfg.torchdistx_path
cfg.total_num_tokens
cfg.total_supervised_tokens
cfg.train_on_inputs
cfg.trust_remote_code
cfg.type
cfg.unfrozen_parameters
cfg.use_mlflow
cfg.use_wandb
cfg.val_set_size
cfg.wandb_name
cfg.wandb_project
cfg.wandb_run_id
cfg.warmup_ratio
cfg.warmup_steps
cfg.weight_decay
cfg.world_size
cfg.xformers_attention
cfg.zero_optimization
```
</details>
> [!IMPORTANT]
> Please prefer using commits from [release tags](https://github.com/truefoundry/llm-finetune/releases). The `main` branch is a work in progress and may contain partially working commits.
## LLM Finetuning with TrueFoundry

Test QLoRA with DeepSpeed ZeRO Stage 2

```
#!/bin/bash
# export CUDA_LAUNCH_BLOCKING=1
# export NCCL_DEBUG=INFO
# export TORCH_PER_PROCESS_MEMORY_LIMIT=22000
# export CUDA_VISIBLE_DEVICES=0  # single-GPU variant; superseded by the exports below
export DISABLE_MLFLOW_INTEGRATION=True
TRAIN_BATCH_SIZE=1
GRADIENT_ACCUMULATION_STEPS=4
LORA_R=32
LORA_ALPHA=64
export TORCH_PER_PROCESS_MEMORY_LIMIT=0.95
export CUDA_VISIBLE_DEVICES=0,1
TRAIN_DATA="./data/standford_alpaca_train_49k.jsonl"
# TRAIN_DATA="./data/lima_llama2_1k.jsonl"
MAX_STEPS=10
# MODEL_ID=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
# MODEL_ID=cognitivecomputations/Wizard-Vicuna-30B-Uncensored
# MODEL_ID=EleutherAI/pythia-70m
MODEL_ID=NousResearch/Llama-2-7b-chat-hf
# MODEL_ID=NousResearch/Llama-2-13b-chat-hf
# MODEL_ID=mistralai/Mistral-7B-Instruct-v0.2
# MODEL_ID=NousResearch/Llama-2-70b-chat-hf
# MODEL_ID=mistralai/Mixtral-8x7B-Instruct-v0.1
# MODEL_ID=stas/tiny-random-llama-2
# MODEL_ID=microsoft/phi-1_5
# MODEL_ID=microsoft/phi-2
# MODEL_ID=Deci/DeciLM-7B
USE_FLASH_ATTENTION=True
GRADIENT_CHECKPOINTING=True
NUM_TRAIN_EPOCHS=3
# --deepspeed ./deepspeed_configs/3_ds_z2_config.json \
# --deepspeed ./deepspeed_configs/4_ds_z2_offload_optimizer_config.json \
# --deepspeed ./deepspeed_configs/5_ds_z3_config.json \
# --deepspeed ./deepspeed_configs/6_ds_z3_offload_param_config.json \
# --deepspeed ./deepspeed_configs/7_ds_z3_offload_optimizer_config.json \
# --deepspeed ./deepspeed_configs/8_ds_z3_offload_param_offload_optimizer_config.json \
accelerate launch \
--mixed_precision bf16 \
--use_deepspeed \
train.py \
config-base.yaml \
--deepspeed ./deepspeed_configs/3_ds_z2_config.json \
--flash_attention $USE_FLASH_ATTENTION \
--base_model $MODEL_ID \
--train_data_uri $TRAIN_DATA \
--max_steps $MAX_STEPS \
--val_data_uri None \
--val_set_size 0.1 \
--micro_batch_size $TRAIN_BATCH_SIZE \
--num_epochs $NUM_TRAIN_EPOCHS \
--gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
--gradient_checkpointing $GRADIENT_CHECKPOINTING \
--learning_rate 0.00001 \
--output_dir ./outputs \
--train_on_inputs False \
--logging_steps 1 \
--save_strategy steps \
--save_steps 0.05 \
--evaluation_strategy steps \
--eval_steps 0.05 \
--adapter qlora \
--lora_target_linear True \
--lora_r $LORA_R \
--lora_alpha $LORA_ALPHA \
--mlfoundry_enable_reporting False \
--mlfoundry_ml_repo my-ml-repo \
--mlfoundry_run_name test \
--mlfoundry_checkpoint_artifact_name chk-test \
--mlfoundry_log_checkpoints False \
--resume_from_checkpoint False \
--cleanup_output_dir_on_start True
```

Save the script above as `sample_run.sh` and run `./sample_run.sh`.

---

- `TORCH_PER_PROCESS_MEMORY_LIMIT` limits the maximum memory per GPU. It can be a fraction (a percentage of total GPU memory) or an integer (a limit in MiB). Useful for testing limited GPU memory scenarios; see the sketch after this list
- `CUDA_VISIBLE_DEVICES` can be used to control which (and how many) GPUs are used
- `--mlfoundry_enable_reporting true/false` toggles reporting metrics, checkpoints and models to mlfoundry
- When testing locally, you can set `--cleanup_output_dir_on_start true` if you don't need to keep checkpoints between runs
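For example, a minimal memory-constrained test run combining these knobs might look like the sketch below (values are illustrative; the flags are the same ones used in the sample run above):

```
#!/bin/bash
# Cap each process at 95% of GPU memory and expose only GPUs 0 and 1.
export TORCH_PER_PROCESS_MEMORY_LIMIT=0.95
export CUDA_VISIBLE_DEVICES=0,1

accelerate launch --mixed_precision bf16 --use_deepspeed \
    train.py config-base.yaml \
    --deepspeed ./deepspeed_configs/3_ds_z2_config.json \
    --base_model NousResearch/Llama-2-7b-chat-hf \
    --max_steps 10 \
    --mlfoundry_enable_reporting False \
    --cleanup_output_dir_on_start True
```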
TODO:

- [ ] Set up CI tests
- [ ] Track and publish VRAM and Speed benchmarks for popular models and GPUs

---

Generally, we optimize for memory footprint first, because a smaller footprint allows larger batch sizes and higher GPU utilization.
Speedups are a second priority, but we take what we can easily get.

#### Experimental things we want to try

- Memory-savings optimizers (see the sketch after this list)
  - AnyPrecision Adam: `--optim adamw_anyprecision --optim-args "use_kahan_summation=True, momentum_dtype=bfloat16, variance_dtype=bfloat16"`
  - 8-bit Adam: `--optim adamw_bnb_8bit`
  - ZeRO's BF16 optimizer
- torch.compile -> works in some cases and can speed up training
- ZeRO++ quantized weights and gradients for faster communication
- Long context
  - Sequence parallelism with DeepSpeed Ulysses
  - LongLoRA with shift short attention (S2-Attn)
  - Tricks mentioned in Meta's "Effective Long-Context Scaling of Foundation Models"
- Quantized activations? FP8 training is already a thing
- https://github.com/kaiokendev/alpaca_lora_4bit
- DP + TP + PP, aka Megatron
  - Difficult to configure; Megatron-DeepSpeed provides lower throughput but is easier to work with
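As a concrete sketch of the optimizer experiments above (untested; this assumes the `--optim` flag is forwarded to the underlying trainer the same way the other flags in the sample run are):

```
#!/bin/bash
# Sketch: rerun the sample command with 8-bit Adam swapped in.
# --optim adamw_bnb_8bit is the flag named in the list above; whether the
# training script forwards it unchanged is an assumption.
accelerate launch --mixed_precision bf16 --use_deepspeed \
    train.py config-base.yaml \
    --deepspeed ./deepspeed_configs/3_ds_z2_config.json \
    --base_model NousResearch/Llama-2-7b-chat-hf \
    --optim adamw_bnb_8bit \
    --output_dir ./outputs
```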
config-base.yaml (8 changes: 7 additions & 1 deletion)
@@ -16,6 +16,7 @@ mlfoundry_ml_repo: null
# ---------------------
# Auto computed and set by script based on environment and external state
# Only edit them if you know what you are doing
+chat_template: auto # type: string
data_dir: auto # type: string
datasets: auto # type: list
test_datasets: auto # type: list
@@ -29,6 +30,10 @@ load_in_4bit: auto # type: bool
lora_modules_to_save: auto # type: list
resume_from_checkpoint: auto # type: bool
special_tokens: auto # type: dict
+unsloth_cross_entropy_loss: auto # type: bool
+unsloth_lora_mlp: auto # type: bool
+unsloth_lora_qkv: auto # type: bool
+unsloth_lora_o: auto # type: bool
tf32: auto # type: bool
## Added by TrueFoundry, not native to Axolotl
mlfoundry_run_name: auto # type: string
@@ -49,8 +54,8 @@ base_model_ignore_patterns:
- '*.ot'
- '*.tflite'
- '*.msgpack'
-chat_template: chatml
dataset_prepared_path: ./outputs/data/last_run_prepared
+dataset_processes: 1
ddp_timeout: 21600
deepspeed: ./deepspeed_configs/3_ds_z2_config.json
default_system_message: You are a helpful assistant. Please give a long and detailed answer.
@@ -105,3 +110,4 @@ logging_dir: ./tensorboard_logs
mlfoundry_log_checkpoints: True
use_mlflow: False
use_wandb: False
+use_tensorboard: True
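The `auto` values above are resolved by the training script at runtime; per the commit notes, the chat template is now auto-selected and unsloth stays disabled because it does not play well with DeepSpeed. A hedged sketch of pinning them explicitly, assuming these YAML keys can be overridden from the CLI like `--flash_attention` and friends in the sample run:

```
#!/bin/bash
# Assumption: config keys can be overridden from the CLI like other flags.
accelerate launch --mixed_precision bf16 --use_deepspeed \
    train.py config-base.yaml \
    --chat_template chatml \
    --unsloth_cross_entropy_loss False
```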