From bdc362f27a1f8932a12fe5e16b20f0ff3c20990d Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Mon, 29 Apr 2024 11:31:21 +0000 Subject: [PATCH] Fix memory leaks, model merging on gpus, missing pad tokens --- Dockerfile | 11 +++++------ Dockerfile-notebook | 7 +++---- config-base.yaml | 9 ++++++--- finetune.ipynb | 4 ++-- mlfoundry_utils.py | 2 +- requirements.txt | 5 +++-- train.py | 9 +++++++-- 7 files changed, 27 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 262a2f9..12e98ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ -# https://hub.docker.com/layers/winglian/axolotl/main-py3.11-cu121-2.1.2/images/sha256-a794e3d8562d3a9a40296726671480c45951cd6e0ad6e8f359e47e75ccbe22ab?context=explore -FROM --platform=linux/amd64 winglian/axolotl@sha256:dc46cae262116297d23f2b445deda3d4b9759b7da5b318315665036a0e2c7140 +# https://hub.docker.com/layers/winglian/axolotl/main-20240423-py3.11-cu121-2.2.1/images/sha256-fc2b9d2b1e46d6b7c47c28a65d2c1d2c3ae4f032fafef27ffaf6ec63bf442f44?context=explore +FROM --platform=linux/amd64 winglian/axolotl@sha256:e0b5b8a94934aaf183932c66ab3ce3ad822e91e19341ade8dbf9eccd9339d799 USER root COPY requirements.txt /tmp/ RUN pip install -U pip wheel setuptools && \ @@ -7,12 +7,11 @@ RUN pip install -U pip wheel setuptools && \ pip install --no-cache-dir -U -r /tmp/requirements.txt RUN mkdir -p /packages && \ cd /packages && \ - git clone https://github.com/OpenAccess-AI-Collective/axolotl && \ + git clone https://github.com/truefoundry/axolotl && \ cd axolotl/ && \ - git checkout 40a88e8c4a2f32b63df0fe2079f7acfe73329273 + git checkout 7ac62f5fa6b3df526a7d0fed7c711faa20df12b0 RUN cd /packages/axolotl/ && \ - MAX_JOBS=1 NVCC_APPEND_FLAGS="--verbose --threads 1" pip install -v -U --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,fused-dense-lib] && \ - pip uninstall -y mlflow tfy-mlflow-client && \ + MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \ pip install --no-cache-dir -U -r /tmp/requirements.txt && \ rm -rf /root/.cache/pip WORKDIR /app diff --git a/Dockerfile-notebook b/Dockerfile-notebook index 49ce7ef..008c792 100644 --- a/Dockerfile-notebook +++ b/Dockerfile-notebook @@ -19,12 +19,11 @@ RUN mkdir -p /packages && \ chown -R jovyan:users /packages USER jovyan RUN cd /packages && \ - git clone https://github.com/OpenAccess-AI-Collective/axolotl && \ + git clone https://github.com/truefoundry/axolotl && \ cd axolotl/ && \ - git checkout 40a88e8c4a2f32b63df0fe2079f7acfe73329273 + git checkout 7ac62f5fa6b3df526a7d0fed7c711faa20df12b0 RUN cd /packages/axolotl/ && \ - MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -v -U --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,fused-dense-lib] && \ - pip uninstall -y mlflow tfy-mlflow-client && \ + MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \ pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt COPY --chown=jovyan:users . /tmp_home/jovyan/llm-finetune/ ENV JUPYTER_APP_LAUNCHER_PATH=/home/jovyan/llm-finetune/.jp_app_launcher/ diff --git a/config-base.yaml b/config-base.yaml index 753ecb1..9f0538c 100644 --- a/config-base.yaml +++ b/config-base.yaml @@ -54,6 +54,7 @@ dataset_prepared_path: ./outputs/data/last_run_prepared ddp_timeout: 21600 deepspeed: ./deepspeed_configs/3_ds_z2_config.json default_system_message: You are a helpful assistant. Please give a long and detailed answer. +device_map: null early_stopping_patience: 10 eval_sample_packing: False eval_steps: 0.1 @@ -62,7 +63,7 @@ flash_attention: True flash_attn_cross_entropy: True flash_attn_rms_norm: True gradient_accumulation_steps: 4 -gradient_checkpointing: True +gradient_checkpointing: unsloth gradient_checkpointing_kwargs: use_reentrant: True learning_rate: 0.00001 @@ -73,15 +74,16 @@ lora_alpha: 64 lora_dropout: 0.05 lora_on_cpu: False lora_r: 32 -lora_target_modules: null lora_target_linear: True +lora_target_modules: null +low_cpu_mem_usage: True lr_scheduler: cosine max_grad_norm: 1.0 -type_of_model: AutoModelForCausalLM num_epochs: 10 optimizer: adamw_torch_fused output_dir: ./outputs pad_to_sequence_len: True +remove_unused_columns: True report_to: tensorboard resize_token_embeddings_to_32x: False sample_packing: True @@ -94,6 +96,7 @@ strict: False tokenizer_type: AutoTokenizer train_on_inputs: False trust_remote_code: True +type_of_model: AutoModelForCausalLM warmup_ratio: 0.1 weight_decay: 0.01 ## Added by TrueFoundry, not native to Axolotl diff --git a/finetune.ipynb b/finetune.ipynb index f769633..f2bc361 100644 --- a/finetune.ipynb +++ b/finetune.ipynb @@ -28,7 +28,7 @@ "# This should point to your Truefoundry platform endpoint\n", "TRUEFOUNDRY_HOST = os.getenv(\"TFY_HOST\", \"https://.truefoundry.cloud\")\n", "\n", - "import mlfoundry\n", + "from truefoundry import ml as mlfoundry\n", "mlfoundry.login(TRUEFOUNDRY_HOST)" ] }, @@ -435,7 +435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/mlfoundry_utils.py b/mlfoundry_utils.py index 0ab7eaa..88c6f76 100644 --- a/mlfoundry_utils.py +++ b/mlfoundry_utils.py @@ -8,11 +8,11 @@ import string from typing import Any, Dict, Optional -import mlfoundry import numpy as np from huggingface_hub import scan_cache_dir from transformers import TrainerCallback from transformers.integrations import rewrite_logs +from truefoundry import ml as mlfoundry logger = logging.getLogger("axolotl") diff --git a/requirements.txt b/requirements.txt index c109e2f..e4f2a14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.1.2+cu121 +torch==2.2.1+cu121 cloud-files==4.15.2 -mlfoundry==0.10.8rc2 +truefoundry[ml]==0.1.2 snowflake-connector-python[pandas]==3.7.0 pyarrow==15.0.0 +deepspeed @ git+https://github.com/truefoundry/DeepSpeed@0866580c316963ddda30ffee44de2c3e21129556 diff --git a/train.py b/train.py index adb9bfa..1a4f34b 100644 --- a/train.py +++ b/train.py @@ -14,6 +14,7 @@ from axolotl.cli.train import do_cli as axolotl_train_cli from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import barrier, is_main_process, zero_first +from axolotl.utils.models import load_tokenizer from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available from checkpoint_utils import cleanup_checkpoints, get_last_checkpoint_for_resume_if_any @@ -83,6 +84,8 @@ def make_axolotl_config(config_base, kwargs, timestamp=None): axolotl_config = os.path.join(cfg.output_dir, "axolotl_config.yaml") if is_main_process(): + set_cfg_option_if_auto(cfg, "tokenizer_config", cfg.base_model_config or cfg.base_model) + os.makedirs(cfg.data_dir, exist_ok=True) os.makedirs(cfg.output_dir, exist_ok=True) @@ -169,8 +172,10 @@ def make_axolotl_config(config_base, kwargs, timestamp=None): # Problem is axolotl tries fixing/adding some tokens by its own. # We don't want to override those decisions without understanding the consequences set_cfg_option_if_auto(cfg, "special_tokens", {}) + tokenizer = load_tokenizer(cfg=cfg) + if not tokenizer.pad_token: + cfg["special_tokens"]["pad_token"] = tokenizer.eos_token set_cfg_option_if_auto(cfg, "lora_modules_to_save", []) - logger.info(f"Prepared config: {cfg}") # This hack is needed because yaml dump refuses to tread DictDefault as dict yaml.add_representer( @@ -207,7 +212,7 @@ def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs): model_dir = cfg.output_dir cleanup_checkpoints(output_dir=cfg.output_dir) if cfg.adapter in {"lora", "qlora"}: - axolotl_merge_lora_cli(config=axolotl_config) + axolotl_merge_lora_cli(config=axolotl_config, deepspeed=None, fsdp=None, device_map="auto") model_dir = os.path.join(model_dir, "merged") model_parent_dir = os.path.dirname(model_dir) # Copy tensorboard logs