In [None]:
import os

from huggingface_hub import login
from omegaconf import OmegaConf

from lema.builders import (
    build_dataset,
    build_model,
    build_peft_model,
    build_tokenizer,
    build_trainer,
)
from lema.core.types import TrainingConfig
from lema.utils.saver import save_model

# Load IPython's autoreload extension and set it to mode 2 for this kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Authenticate against the Hugging Face Hub with the token taken from the
# HF_TOKEN environment variable. When the variable is unset, `access_token`
# is None and that None is what gets passed to `login`.
access_token = os.getenv("HF_TOKEN")
login(token=access_token)

In [None]:
# Assemble the training configuration from two sources: the structured
# TrainingConfig schema and the values read from the YAML file.
config_filename = "../configs/lema/zephyr.7b.sft.yaml"
base_config = OmegaConf.structured(TrainingConfig)
file_config = TrainingConfig.from_yaml(config_filename)

# Merge the two (file config passed last), then convert the merged OmegaConf
# container with `to_object` so the rest of the notebook works with `config`.
config: TrainingConfig = OmegaConf.to_object(
    OmegaConf.merge(base_config, file_config)
)

# Show the sections that later cells read and override.
for section in (config.training, config.peft):
    print(section)

In [None]:
# TODO: finalize these values in the config file instead of overriding them here.
# In-notebook overrides applied on top of the loaded config:
#   - q_lora is switched off,
#   - one example per device per step,
#   - only two steps (presumably a quick smoke-test run; confirm before real training).
config.peft.q_lora = False
config.training.per_device_train_batch_size = 1
config.training.max_steps = 2

In [None]:
# Build the tokenizer from the model section of the config; the trailing bare
# expression displays it as the cell output.
tokenizer = build_tokenizer(config.model)
tokenizer

In [None]:
# Open question: should we guard on `tokenizer.model_max_length > 100_000`
# for the different Zephyr models? Currently no such check is performed.

# Print the tokenizer settings relevant to sequence length, padding and chat formatting.
print("tokenizer.model_max_length", tokenizer.model_max_length)
print("tokenizer pad_token/eos_token", tokenizer.pad_token, tokenizer.eos_token)
print("tokenizer.padding_side", tokenizer.padding_side)
print("tokenizer.chat_template", tokenizer.chat_template)

In [None]:
# Load data & preprocessing
import numpy as np  # only needed for the debug subsampling below

# Debug-only subsampling so a quick run does not process the whole dataset.
# Set SUBSAMPLE_SIZE to None to keep the full dataset.
SUBSAMPLE_SIZE = 1024
SUBSAMPLE_SEED = 1234

dataset = build_dataset(dataset_config=config.data, tokenizer=tokenizer)

if SUBSAMPLE_SIZE is not None:
    print(len(dataset))
    # Clamp to the dataset size: np.random.choice(..., replace=False) raises
    # ValueError when asked for more samples than the population contains.
    num_samples = min(SUBSAMPLE_SIZE, len(dataset))
    # Fixed seed so the same rows are selected on every run.
    np.random.seed(SUBSAMPLE_SEED)
    ridx = np.random.choice(len(dataset), num_samples, replace=False)
    dataset = dataset.select(ridx)
    print(len(dataset))

dataset

In [None]:
# TODO: update our code base if we use optimum. Warning text recorded for reference:
# "Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file." # noqa
# "WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed." -- TODO: update in main repo # noqa

In [None]:
# Build the model from the full training config.
model = build_model(config)

In [None]:
# Inspect the tokenizer's special tokens and how a role marker is tokenized.
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the special tokens are printed explicitly; the encode() result remains
# the cell's output.
print(tokenizer.all_special_tokens)
tokenizer.encode("|system|")
# NOTE(review): this encodes "|system|" without angle brackets, while the notes
# below refer to "<|system|>" -- confirm which form is intended.

# TODO Consider adding special tokens like '<|assistant|>', '<|system|>'
# via tokenizer.additional_special_tokens -- need to check Mistral

# from alignment team:
# tokenizer.encode("<|system|>")  # We already wrap <bos> and <eos>
# # in the chat template

# Future TODO.
# # For ChatML we need to add special tokens and resize the embedding layer
# if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: # noqa
#     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) # noqa
#     model, tokenizer = setup_chat_format(model, tokenizer)
#     model_kwargs = None

In [None]:
# Read the gradient-checkpointing flag once; it is passed to the PEFT builder
# and also decides whether the model's input-grad hook is enabled.
use_grad_ckpt = config.training.enable_gradient_checkpointing

# Replace `model` with its PEFT-wrapped version when PEFT is requested.
if config.training.use_peft:
    model = build_peft_model(model, use_grad_ckpt, config.peft)

if use_grad_ckpt:
    model.enable_input_require_grads()

In [None]:
# TODO update if need be for accelerator
# Resolve the trainer for the configured trainer type; the returned callable
# is invoked in a later cell to create the trainer.
trainer_cls = build_trainer(config.training.trainer_type)

In [None]:
# Instantiate the trainer with the model, tokenizer and training dataset.
# `config.training.to_hf()` supplies the `args` value, and any extra keyword
# arguments come from `config.data.trainer_kwargs`.
trainer = trainer_cls(
    model=model,
    tokenizer=tokenizer,
    args=config.training.to_hf(),
    train_dataset=dataset,
    **config.data.trainer_kwargs,
)

In [None]:
# Sanity check: show the "text" field of the first training example.
dataset[0]["text"]

In [None]:
# Run training with the trainer configured above.
trainer.train()

In [None]:
# Persist the results of the run: first the trainer state, then the final
# model via the project's `save_model` helper.
trainer.save_state()
save_model(config=config, trainer=trainer)