Fix memory leaks, model merging on gpus, missing pad tokens
chiragjn committed Apr 29, 2024
1 parent 367620f commit bdc362f
Showing 7 changed files with 27 additions and 20 deletions.
11 changes: 5 additions & 6 deletions Dockerfile
@@ -1,18 +1,17 @@
-# https://hub.docker.com/layers/winglian/axolotl/main-py3.11-cu121-2.1.2/images/sha256-a794e3d8562d3a9a40296726671480c45951cd6e0ad6e8f359e47e75ccbe22ab?context=explore
-FROM --platform=linux/amd64 winglian/axolotl@sha256:dc46cae262116297d23f2b445deda3d4b9759b7da5b318315665036a0e2c7140
+# https://hub.docker.com/layers/winglian/axolotl/main-20240423-py3.11-cu121-2.2.1/images/sha256-fc2b9d2b1e46d6b7c47c28a65d2c1d2c3ae4f032fafef27ffaf6ec63bf442f44?context=explore
+FROM --platform=linux/amd64 winglian/axolotl@sha256:e0b5b8a94934aaf183932c66ab3ce3ad822e91e19341ade8dbf9eccd9339d799
 USER root
 COPY requirements.txt /tmp/
 RUN pip install -U pip wheel setuptools && \
     pip uninstall -y mlflow axolotl && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt
 RUN mkdir -p /packages && \
     cd /packages && \
-    git clone https://github.com/OpenAccess-AI-Collective/axolotl && \
+    git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout 40a88e8c4a2f32b63df0fe2079f7acfe73329273
+    git checkout 7ac62f5fa6b3df526a7d0fed7c711faa20df12b0
 RUN cd /packages/axolotl/ && \
-    MAX_JOBS=1 NVCC_APPEND_FLAGS="--verbose --threads 1" pip install -v -U --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,fused-dense-lib] && \
-    pip uninstall -y mlflow tfy-mlflow-client && \
+    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt && \
     rm -rf /root/.cache/pip
 WORKDIR /app
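Note: the base image bump takes the stack from torch 2.1.2/cu121 to torch 2.2.1/cu121, and deepspeed moves out of the axolotl extras in favour of the fork pinned in requirements.txt. A minimal smoke test, not part of this commit, that one might run inside the built image to confirm the upgrade:

# Hypothetical smoke test for the rebuilt image; run with `python check_env.py`.
import torch
import axolotl  # editable install from /packages/axolotl

assert torch.__version__.startswith("2.2.1"), torch.__version__  # pin from requirements.txt
assert torch.version.cuda == "12.1", torch.version.cuda          # cu121 wheels
if torch.cuda.is_available():
    print("bf16 supported:", torch.cuda.is_bf16_supported())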
7 changes: 3 additions & 4 deletions Dockerfile-notebook
@@ -19,12 +19,11 @@ RUN mkdir -p /packages && \
     chown -R jovyan:users /packages
 USER jovyan
 RUN cd /packages && \
-    git clone https://github.com/OpenAccess-AI-Collective/axolotl && \
+    git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout 40a88e8c4a2f32b63df0fe2079f7acfe73329273
+    git checkout 7ac62f5fa6b3df526a7d0fed7c711faa20df12b0
 RUN cd /packages/axolotl/ && \
-    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -v -U --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,fused-dense-lib] && \
-    pip uninstall -y mlflow tfy-mlflow-client && \
+    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
 COPY --chown=jovyan:users . /tmp_home/jovyan/llm-finetune/
 ENV JUPYTER_APP_LAUNCHER_PATH=/home/jovyan/llm-finetune/.jp_app_launcher/
9 changes: 6 additions & 3 deletions config-base.yaml
@@ -54,6 +54,7 @@ dataset_prepared_path: ./outputs/data/last_run_prepared
 ddp_timeout: 21600
 deepspeed: ./deepspeed_configs/3_ds_z2_config.json
 default_system_message: You are a helpful assistant. Please give a long and detailed answer.
+device_map: null
 early_stopping_patience: 10
 eval_sample_packing: False
 eval_steps: 0.1
@@ -62,7 +63,7 @@ flash_attention: True
 flash_attn_cross_entropy: True
 flash_attn_rms_norm: True
 gradient_accumulation_steps: 4
-gradient_checkpointing: True
+gradient_checkpointing: unsloth
 gradient_checkpointing_kwargs:
   use_reentrant: True
 learning_rate: 0.00001
@@ -73,15 +74,16 @@ lora_alpha: 64
 lora_dropout: 0.05
 lora_on_cpu: False
 lora_r: 32
-lora_target_modules: null
 lora_target_linear: True
+lora_target_modules: null
+low_cpu_mem_usage: True
 lr_scheduler: cosine
 max_grad_norm: 1.0
-type_of_model: AutoModelForCausalLM
 num_epochs: 10
 optimizer: adamw_torch_fused
 output_dir: ./outputs
 pad_to_sequence_len: True
+remove_unused_columns: True
 report_to: tensorboard
 resize_token_embeddings_to_32x: False
 sample_packing: True
@@ -94,6 +96,7 @@ strict: False
 tokenizer_type: AutoTokenizer
 train_on_inputs: False
 trust_remote_code: True
+type_of_model: AutoModelForCausalLM
 warmup_ratio: 0.1
 weight_decay: 0.01
 ## Added by TrueFoundry, not native to Axolotl
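Note: several of the new keys are memory-related: device_map: null keeps training off accelerate's automatic placement, low_cpu_mem_usage: True avoids allocating the weights twice at load time, and gradient_checkpointing: unsloth selects Unsloth's checkpointing implementation. A rough sketch of how the first two reach the transformers loader (the model id is a placeholder, not from this repo):

# Illustrative only: roughly how device_map / low_cpu_mem_usage are forwarded
# when the base model is loaded. The model id below is a placeholder.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",  # placeholder
    low_cpu_mem_usage=True,  # load weights in place instead of random-init + overwrite
    device_map=None,         # leave device placement to the trainer/DeepSpeed
)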
4 changes: 2 additions & 2 deletions finetune.ipynb
@@ -28,7 +28,7 @@
 "# This should point to your Truefoundry platform endpoint\n",
 "TRUEFOUNDRY_HOST = os.getenv(\"TFY_HOST\", \"https://<your-org>.truefoundry.cloud\")\n",
 "\n",
-"import mlfoundry\n",
+"from truefoundry import ml as mlfoundry\n",
 "mlfoundry.login(TRUEFOUNDRY_HOST)"
 ]
 },
@@ -435,7 +435,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.13"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,
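Note: mlfoundry has been absorbed into the truefoundry[ml] package; aliasing the new import path keeps every downstream mlfoundry.* call in the notebook unchanged:

# The alias preserves existing call sites after the package rename.
from truefoundry import ml as mlfoundry

mlfoundry.login("https://<your-org>.truefoundry.cloud")  # same call as before the rename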
2 changes: 1 addition & 1 deletion mlfoundry_utils.py
@@ -8,11 +8,11 @@
 import string
 from typing import Any, Dict, Optional
 
-import mlfoundry
 import numpy as np
 from huggingface_hub import scan_cache_dir
 from transformers import TrainerCallback
 from transformers.integrations import rewrite_logs
+from truefoundry import ml as mlfoundry
 
 logger = logging.getLogger("axolotl")
 
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-torch==2.1.2+cu121
+torch==2.2.1+cu121
 cloud-files==4.15.2
-mlfoundry==0.10.8rc2
+truefoundry[ml]==0.1.2
 snowflake-connector-python[pandas]==3.7.0
 pyarrow==15.0.0
+deepspeed @ git+https://github.com/truefoundry/DeepSpeed@0866580c316963ddda30ffee44de2c3e21129556
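Note: torch now matches the new base image, truefoundry[ml] replaces mlfoundry, and deepspeed is pinned to a TrueFoundry fork commit rather than arriving through axolotl's [deepspeed] extra. A hypothetical post-install assertion on the pins, not part of this commit:

# Hypothetical check that the pinned requirements resolved as written.
from importlib.metadata import version

expected = {"torch": "2.2.1+cu121", "truefoundry": "0.1.2", "pyarrow": "15.0.0"}
for pkg, want in expected.items():
    got = version(pkg)
    assert got == want, f"{pkg}: wanted {want}, got {got}"
print("all pins resolved")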
9 changes: 7 additions & 2 deletions train.py
@@ -14,6 +14,7 @@
 from axolotl.cli.train import do_cli as axolotl_train_cli
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import barrier, is_main_process, zero_first
+from axolotl.utils.models import load_tokenizer
 from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available
 
 from checkpoint_utils import cleanup_checkpoints, get_last_checkpoint_for_resume_if_any
@@ -83,6 +84,8 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
     axolotl_config = os.path.join(cfg.output_dir, "axolotl_config.yaml")
 
     if is_main_process():
+        set_cfg_option_if_auto(cfg, "tokenizer_config", cfg.base_model_config or cfg.base_model)
+
         os.makedirs(cfg.data_dir, exist_ok=True)
         os.makedirs(cfg.output_dir, exist_ok=True)
 
@@ -169,8 +172,10 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
         # Problem is axolotl tries fixing/adding some tokens by its own.
         # We don't want to override those decisions without understanding the consequences
         set_cfg_option_if_auto(cfg, "special_tokens", {})
+        tokenizer = load_tokenizer(cfg=cfg)
+        if not tokenizer.pad_token:
+            cfg["special_tokens"]["pad_token"] = tokenizer.eos_token
-        set_cfg_option_if_auto(cfg, "lora_modules_to_save", [])
 
         logger.info(f"Prepared config: {cfg}")
         # This hack is needed because yaml dump refuses to tread DictDefault as dict
         yaml.add_representer(
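Note: this hunk is the "missing pad tokens" fix from the commit title: the tokenizer is loaded via axolotl's load_tokenizer, and EOS is reused as the pad token only when none is defined, so the vocabulary (and embedding matrix) never grows. The same pattern in isolation, as a sketch with plain transformers (the model id is a placeholder):

# Standalone sketch of the pad-token fallback; in the commit the tokenizer
# comes from axolotl's load_tokenizer(cfg=cfg). Model id is a placeholder.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # placeholder; ships without a pad_token
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})  # reuse EOS; no new embedding rows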
@@ -207,7 +212,7 @@ def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
         model_dir = cfg.output_dir
         cleanup_checkpoints(output_dir=cfg.output_dir)
         if cfg.adapter in {"lora", "qlora"}:
-            axolotl_merge_lora_cli(config=axolotl_config)
+            axolotl_merge_lora_cli(config=axolotl_config, deepspeed=None, fsdp=None, device_map="auto")
             model_dir = os.path.join(model_dir, "merged")
         model_parent_dir = os.path.dirname(model_dir)
         # Copy tensorboard logs
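Note: this is the "model merging on gpus" fix: the merge step previously inherited the training run's deepspeed/fsdp settings, while passing deepspeed=None, fsdp=None with device_map="auto" lets the LoRA merge spread across whatever GPUs are visible. Outside axolotl's CLI, the equivalent peft operation looks roughly like this (paths are placeholders):

# Rough peft equivalent of the merge step; paths are placeholders.
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "./outputs",        # placeholder: checkpoint dir containing adapter_config.json
    device_map="auto",  # shard layers across available GPUs for the merge
)
merged = model.merge_and_unload()  # fold the LoRA deltas into the base weights
merged.save_pretrained("./outputs/merged")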
