Skip to content

Commit

Permalink
Default to using tokenizer chat template if available
Browse files (browse the repository at this point in the history)
  • Loading branch information
chiragjn committed Jun 13, 2024
1 parent 872152d commit 33bb49d
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 14 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ RUN mkdir -p /packages && \
cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
git checkout 5ba183d302ed1c91912555b76e423786acaccae8
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/requirements.txt && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile-notebook
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ USER jovyan
RUN cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
git checkout 5ba183d302ed1c91912555b76e423786acaccae8
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion config-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ adapter: qlora
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
debug: False
micro_batch_size: 1
revision_of_model:
revision_of_model: null
sequence_len: 2048
val_set_size: 0.1
## Added by TrueFoundry, not native to Axolotl
Expand Down
15 changes: 8 additions & 7 deletions data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ class DatasetType(str, enum.Enum):


def _make_dataset_file_source(
path, split="train", dataset_type: DatasetType = DatasetType.completion, chat_template: str = "chatml"
path,
split="train",
dataset_type: DatasetType = DatasetType.completion,
):
"""
Axolotl dynamically loads prompt strategies based on the `type` key
Expand Down Expand Up @@ -56,7 +58,6 @@ def _make_dataset_file_source(
"path": path,
"ds_type": "json",
"type": "chat_template",
"chat_template": chat_template,
"field_messages": "messages",
"message_field_role": "role",
"message_field_content": "content",
Expand All @@ -68,7 +69,9 @@ def _make_dataset_file_source(


def dataset_uri_to_axolotl_datasources(
uri, download_dir, dataset_type: DatasetType = DatasetType.completion, chat_template: str = "chatml"
uri,
download_dir,
dataset_type: DatasetType = DatasetType.completion,
):
# TODO: Add support for HF datasets
if uri.startswith("https://"):
Expand All @@ -88,11 +91,9 @@ def dataset_uri_to_axolotl_datasources(
datasources = []
if os.path.isdir(uri):
for filepath in find_all_jsonl_files(uri):
datasources.append(
_make_dataset_file_source(path=filepath, dataset_type=dataset_type, chat_template=chat_template)
)
datasources.append(_make_dataset_file_source(path=filepath, dataset_type=dataset_type))
else:
datasources = [_make_dataset_file_source(path=uri, dataset_type=dataset_type, chat_template=chat_template)]
datasources = [_make_dataset_file_source(path=uri, dataset_type=dataset_type)]
return datasources
else:
raise ValueError("Unsupported data uri or path does not exist: {uri}")
Expand Down
6 changes: 2 additions & 4 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):

if cfg.chat_template == "auto":
model_type = getattr(model_hf_config, "model_type", None)
chat_template = MODEL_TYPE_TO_CHAT_TEMPLATE.get(model_type, "chatml")
chat_template = "tokenizer_default_fallback_" + MODEL_TYPE_TO_CHAT_TEMPLATE.get(model_type, "chatml")
set_cfg_option_if_auto(cfg, "chat_template", chat_template)

if cfg.datasets == "auto":
Expand All @@ -188,15 +188,13 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
uri=cfg.train_data_uri,
download_dir=cfg.data_dir,
dataset_type=cfg.dataset_type,
chat_template=cfg.chat_template,
)
if cfg.test_datasets == "auto":
if cfg.val_data_uri and str(cfg.val_data_uri).lower() != "na":
cfg.test_datasets = dataset_uri_to_axolotl_datasources(
uri=cfg.val_data_uri,
download_dir=cfg.data_dir,
dataset_type=cfg.dataset_type,
chat_template=chat_template,
)
elif cfg.val_set_size:
set_cfg_option_if_auto(cfg, "test_datasets", None, force=True)
Expand All @@ -220,7 +218,7 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
cfg["special_tokens"]["pad_token"] = tokenizer.eos_token
set_cfg_option_if_auto(cfg, "lora_modules_to_save", [])
logger.info(f"Prepared config: {cfg}")
# This hack is needed because yaml dump refuses to tread DictDefault as dict
# This hack is needed because yaml dump refuses to treat DictDefault as dict
yaml.add_representer(
DictDefault, lambda dumper, data: dumper.represent_mapping("tag:yaml.org,2002:map", data.items())
)
Expand Down

0 comments on commit 33bb49d

Please sign in to comment.