In [12]:
import os

import datasets
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

import prepare_dataset
import utils

# Project-local helper; presumably re-imports project modules so that edits made
# on disk are picked up without restarting the kernel — TODO confirm.
utils.reload()

In [3]:
# Project-local helper called with the GPU index list [0]; presumably pins this
# process to GPU 0 — TODO confirm what it sets beyond the env var below.
utils.set_gpu_ids([0])

# Environment variable values must be strings: assigning the int 0 raises
# "TypeError: str expected, not int" before any GPU selection takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [13]:
# Training hyperparameters; passed to TrainingArguments later in the notebook.
num_epochs: int = 10    # number of training epochs
batch_size: int = 128   # used for both the per-device train and eval batch size
lr: float = 2e-4        # learning rate

In [4]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    r=256,                  # rank of the low-rank update matrices
    lora_alpha=256,         # LoRA scaling numerator; set equal to r here
    lora_dropout=0.05,      # dropout applied on the LoRA branch
    inference_mode=False,   # adapters are going to be trained
    task_type=TaskType.CAUSAL_LM,
)

In [8]:
# Tokenizer is built by a project-local helper.
tokenizer = utils.get_complete_tokenizer()

# Base causal-LM weights are read from the path configured in utils.
model = AutoModelForCausalLM.from_pretrained(
    utils.default_model_path,
    low_cpu_mem_usage=True,
)

LlamaTokenizerFast(name_or_path='/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--hf-internal-testing--llama-tokenizer/snapshots/99eceeba6e8289bee767f0771166b5917e70e470', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Wrap the base model with LoRA adapters exactly once. Without the guard,
# re-running this cell would pass an already-wrapped PeftModel back into
# get_peft_model and wrap it a second time.
# To apply a changed peft_config, re-run the model-loading cell first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 134,217,728 || all params: 6,872,633,344 || trainable%: 1.9529301401940176


In [None]:
# Trainer configuration. Evaluation and checkpointing both run once per epoch,
# which is what allows load_best_model_at_end to pick a checkpoint at the end.
training_args = TrainingArguments(
    output_dir=os.path.join(utils.models_root, "peft/test"),
    # Optimisation
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation / checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Load the dataset previously saved to disk at the project-configured path;
# it is indexed by "train" and "validation" when the Trainer is built.
encoded_datasets = datasets.load_from_disk(utils.encoded_datasets_path)

In [None]:
# Assemble the Trainer from the model, the training arguments and the
# train/validation splits.
# compute_metrics is intentionally not passed: no such name is defined or
# imported in this notebook, so referencing it raised NameError. The Trainer
# still reports eval loss at each evaluation, which is the default criterion
# for load_best_model_at_end. Define a metrics function and pass it here to
# report additional metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=utils.get_data_collator(),
)

In [None]:
# Start fine-tuning with the Trainer constructed in the previous cell.
trainer.train()