In [1]:
# Usage: deepspeed train_lora.py --deepspeed <$PATH_TO_DEEPSPEED_CONFIG>

# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

from dataclasses import dataclass, field
import logging
import pathlib
import typing

from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, get_peft_model_state_dict
import transformers
from transformers import Trainer

import torch

from fastchat.train.train import (
    DataArguments,
    ModelArguments,
    TrainingArguments,
    make_supervised_data_module,
)

# from fastchat.train.llama_flash_attn_monkey_patch import (
#     replace_llama_attn_with_flash_attn,
# )

# replace_llama_attn_with_flash_attn()


@dataclass
class LoraArguments:
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: typing.List[str] = field(
        default_factory=lambda: ["q_proj", "v_proj"]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"


def maybe_zero_3(param):
    if hasattr(param, "ds_id"):
        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                bias_name = k.split("lora_")[0] + "bias"
                lora_bias_names.add(bias_name)
            elif "bias" in k:
                maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias:
            if bias_name in lora_bias_names:
                to_return[bias_name] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
    return to_return


model_args = ModelArguments(model_name_or_path='TheBloke/vicuna-13B-1.1-HF')
data_args = DataArguments(data_path='/home/jupyter/therapy-bot/data/prompts.json', lazy_preprocess=True)
training_args = TrainingArguments(
    # _n_gpu=4,
    adafactor=False,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    auto_find_batch_size=False,
    bf16=False,
    bf16_full_eval=False,
    cache_dir=None,
    data_seed=None,
    dataloader_drop_last=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
    ddp_backend=None,
    ddp_bucket_cap_mb=None,
    ddp_find_unused_parameters=None,
    ddp_timeout=1800,
    debug=[],
    deepspeed=None,
    disable_tqdm=False,
    do_eval=True,
    do_predict=False,
    do_train=False,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=5,
    evaluation_strategy='steps',
    fp16=True,
    fp16_backend='auto',
    fp16_full_eval=False,
    fp16_opt_level='O1',
    fsdp=[],
    fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
    fsdp_min_num_params=0,
    fsdp_transformer_layer_cls_to_wrap=None,
    full_determinism=False,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    greater_is_better=None,
    group_by_length=False,
    half_precision_backend='cuda_amp',
    hub_model_id=None,
    hub_private_repo=False,
    hub_strategy='every_save',
    hub_token='<HUB_TOKEN>',
    ignore_data_skip=False,
    include_inputs_for_metrics=False,
    jit_mode_eval=False,
    label_names=None,
    label_smoothing_factor=0.0,
    learning_rate=2e-05,
    length_column_name='length',
    load_best_model_at_end=True,
    local_rank=0,
    log_level='passive',
    log_level_replica='warning',
    log_on_each_node=True,
    logging_dir='output_13b/runs/May25_11-25-05_zenai',
    logging_first_step=False,
    logging_nan_inf_filter=True,
    logging_steps=1.0,
    logging_strategy='steps',
    lr_scheduler_type='cosine',
    max_grad_norm=1.0,
    max_steps=-1,
    metric_for_best_model=None,
    model_max_length=1024,
    # mp_parameters=,
    no_cuda=False,
    num_train_epochs=3.0,
    optim='adamw_torch',
    optim_args=None,
    output_dir='output_13b',
    overwrite_output_dir=False,
    past_index=-1,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    prediction_loss_only=False,
    push_to_hub=False,
    push_to_hub_model_id=None,
    push_to_hub_organization=None,
    push_to_hub_token='<PUSH_TO_HUB_TOKEN>',
    ray_scope='last',
    remove_unused_columns=True,
    report_to=['tensorboard', 'wandb'],
    resume_from_checkpoint=None,
    run_name='output_13b',
    save_on_each_node=False,
    save_safetensors=False,
    save_steps=5,
    save_strategy='steps',
    save_total_limit=3,
    seed=42,
    sharded_ddp=[],
    skip_memory_metrics=True,
    tf32=False,
    torch_compile=False,
    torch_compile_backend=None,
    torch_compile_mode=None,
    torchdynamo=None,
    tpu_metrics_debug=False,
    tpu_num_cores=None,
    use_ipex=False,
    use_legacy_prediction_loop=False,
    use_mps_device=False,
    warmup_ratio=0.04,
    warmup_steps=0,
    weight_decay=0.0,
    xpu_backend=None,
)
lora_args = LoraArguments(lora_r=8, lora_alpha=16, lora_dropout=0.05, lora_target_modules=['q_proj', 'v_proj'], lora_weight_path='', lora_bias='none')

model = transformers.LlamaForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
lora_config = LoraConfig(
    r=lora_args.lora_r,
    lora_alpha=lora_args.lora_alpha,
    target_modules=lora_args.lora_target_modules,
    lora_dropout=lora_args.lora_dropout,
    bias=lora_args.lora_bias,
    task_type="CAUSAL_LM",
)
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

tokenizer = transformers.LlamaTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token

data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)

trainer = Trainer(
    model=model, tokenizer=tokenizer, args=training_args, **data_module
)

model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

model = torch.compile(model)

if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
    trainer.train(resume_from_checkpoint=True)
else:
    trainer.train()
trainer.save_state()

# Save states. Weights might be a placeholder in zero3 and need a gather
state_dict = get_peft_state_maybe_zero_3(
    model.named_parameters(), lora_args.lora_bias
)
model.save_pretrained(training_args.output_dir, state_dict=state_dict)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/envs/py310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



trainable params: 6553600 || all params: 13022417920 || trainable%: 0.05032552357220002


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkmnis[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
5,1.4218,1.342335
10,1.3831,1.338915
15,1.4322,1.339291
20,1.423,1.33443
25,1.3822,1.326463
30,1.3916,1.315927
35,1.5564,1.310128
40,1.2928,1.292895
45,1.4084,1.267482
50,1.1659,1.239097




In [3]:
trainer.save_state()

# Save states. Weights might be a placeholder in zero3 and need a gather
state_dict = get_peft_state_maybe_zero_3(
    model.named_parameters(), lora_args.lora_bias
)
model.save_pretrained(training_args.output_dir, state_dict=state_dict)

In [4]:
from huggingface_hub import notebook_login
 
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
model.push_to_hub("kmnis/ZenAI", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/443 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kmnis/ZenAI/commit/d5bb11f78ab247fcf128d6e297491726f4458258', commit_message='Upload model', commit_description='', oid='d5bb11f78ab247fcf128d6e297491726f4458258', pr_url=None, pr_revision=None, pr_num=None)