In [3]:
import torch
from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM
from transformers.models.llama import LlamaForCausalLM, LlamaConfig
from transformers import BitsAndBytesConfig
import unsloth.utils.memory as memory_utils
import unsloth.utils.data as data_utils
import unsloth.utils.testing as test_utils
from unsloth.kernels import fused_cel
from unsloth.models._utils import patch_tokenizer
from unsloth.models.llama import FastLlamaModel
from unsloth.kernels.fused_cel import patch_model as patch_model_fused_cel

from llama_head import CEL_only_forward

import logging
from IPython.core.interactiveshell import InteractiveShell


# Configure the root logger at INFO level so log records show up in cell output.
logging.basicConfig(level=logging.INFO)


# Display the value of every bare expression in a cell, not only the last one;
# the multi-expression inspection cells in this notebook rely on this.
InteractiveShell.ast_node_interactivity = "all"
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the imported project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# 4-bit NF4 quantization settings passed to `from_pretrained` below; double
# quantization is disabled and bfloat16 is requested as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Disabled alternative: load the Llama-2-7b chat checkpoint instead.
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, quantization_config=quant_config
# )

# NOTE(review): in the cells visible here, `model_config` is only referenced by
# the commented-out `LlamaForCausalLM(model_config)` line; the active model is
# loaded from the local "./llama-10m" path with the quantization config above.
model_config = LlamaConfig.from_pretrained("./llama-10m.json")
model = AutoModelForCausalLM.from_pretrained(
    "./llama-10m", quantization_config=quant_config, torch_dtype=torch.bfloat16
)
# model = LlamaForCausalLM(model_config).to("cuda")

# The tokenizer is loaded from the Llama-2-7b chat repo, not from "./llama-10m".
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", model_max_length=4096, padding_side="right"
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.




In [5]:
# Rebind both names to what patch_tokenizer returns; the captured output of
# this cell reports that <unk> was chosen as the padding token.
model, tokenizer = patch_tokenizer(model, tokenizer)

./llama-10m does not have a padding token! Will use pad_token = <unk>.


In [6]:
# Sanity check: show the dtype of the LM head weight and the dtype recorded in
# the quantization state of the first layer's MLP gate projection.
model.lm_head.weight.dtype
model.model.layers[0].mlp.gate_proj.quant_state.dtype

torch.bfloat16

torch.bfloat16

In [7]:
# Synthetic inputs for exercising the loss head in isolation.
# NOTE: `in_features` is not used by any cell visible in this notebook.
bs, seqlen, in_features = 1, 16, 4096
dtype = torch.bfloat16
hidden_dim = model.config.hidden_size

# Draw the random tensors from a dedicated, seeded generator so they are
# identical after every "Restart & Run All" without touching the global RNG.
INPUT_SEED = 0
input_rng = torch.Generator(device="cuda").manual_seed(INPUT_SEED)

input_ids = torch.randint(
    0, model.config.vocab_size, (bs, seqlen), generator=input_rng, device="cuda"
)
# Leaf tensor with requires_grad=True so its gradient can be read after backward.
hidden_states = torch.randn(
    bs,
    seqlen,
    hidden_dim,
    generator=input_rng,
    dtype=dtype,
    device="cuda",
    requires_grad=True,
)
# Labels are a detached copy of the random token ids.
labels = input_ids.detach().clone()
attention_mask = torch.ones((bs, seqlen), device="cuda")

# ref_out = model(input_ids, labels=labels, attention_mask=attention_mask)
# ref_head = model.lm_head

In [8]:
def run_forward_backward(model, hidden_states, labels):
    """Run one loss forward/backward pass and return the loss and gradients.

    Args:
        model: Object exposing ``lm_head.weight``; passed to ``CEL_only_forward``.
        hidden_states: Tensor fed to ``CEL_only_forward``; its ``.grad`` is read
            after the backward pass.
        labels: Targets forwarded to ``CEL_only_forward``.

    Returns:
        ``(loss, dX, dW)`` where ``dX`` is the gradient stored on
        ``hidden_states`` and ``dW`` the gradient stored on
        ``model.lm_head.weight``. Each gradient is an independent copy, or
        ``None`` when the backward pass produced no gradient for that tensor.
    """
    # Drop gradients left over from an earlier call: autograd accumulates into
    # existing `.grad` tensors, which would otherwise mix results from
    # successive runs (e.g. a reference pass followed by a fused pass).
    hidden_states.grad = None
    model.lm_head.weight.grad = None

    loss = CEL_only_forward(model, hidden_states=hidden_states, labels=labels)
    loss.backward()

    def _snapshot(grad):
        # Copy so that later backward passes cannot mutate the returned value.
        return None if grad is None else grad.detach().clone()

    dX = _snapshot(hidden_states.grad)
    dW = _snapshot(model.lm_head.weight.grad)
    return loss, dX, dW

In [9]:
# original_model = patch_model_fused_cel(model, use_fused_cel=False)
# ref_loss, ref_dX, ref_dW = run_forward_backward(original_model, hidden_states, labels)
# fused_model = patch_model_fused_cel(model, use_fused_cel=True)
# fused_loss, fused_dX, fused_dW = run_forward_backward(model, hidden_states, labels)

In [10]:
# test_utils.check_all([ref_loss, ref_dX, ref_dW], [fused_loss, fused_dX, fused_dW])

In [11]:
# Build the training dataset with the project's `get_alpaca` helper; the
# SFTTrainer cell later reads its "text" field.
dataset = data_utils.get_alpaca(tokenizer)

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer

# Maximum sequence length handed to SFTTrainer in a later cell.
max_seq_length = 256

# Short smoke-test configuration: 5 optimizer steps, logging every step.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    # NOTE(review): warmup_steps equals max_steps; the learning rates logged by
    # the training cell ramp up to 1.6e-4 and then drop to 0, never reaching
    # the configured 2e-4.
    warmup_steps=5,
    max_steps=5,
    learning_rate=2e-4,
    # Exactly one of fp16/bf16 is enabled, depending on GPU bf16 support.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Metrics: memory deltas, input-token counts and tokens/second all appear
    # in the training output of this notebook.
    skip_memory_metrics=False,
    include_num_input_tokens_seen=True,
    include_tokens_per_second=True,
)

# trainer = Trainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset,
#     dataset_text_field = "text",
#     max_seq_length = max_seq_length,
#     dataset_num_proc = 2,
#     packing = False, # Can make training 5x faster for short sequences.
#     args = training_args)

In [13]:
from transformers import Trainer
from peft import LoraConfig

# Names of the projection modules that should receive LoRA adapters.
accepted_modules = frozenset(
    (
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ),
)
peft_config = LoraConfig(
    # LoraConfig documents `target_modules` as a list of names (or a string).
    # Pass a sorted list instead of the frozenset itself so the config holds a
    # documented, JSON-friendly type if it is ever saved with a checkpoint.
    target_modules=sorted(accepted_modules),
    lora_alpha=8,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [14]:
# Build the supervised fine-tuning trainer from the model, tokenizer, dataset,
# LoRA config and training arguments defined in earlier cells.
# NOTE(review): presumably SFTTrainer wraps `model` with the LoRA adapters from
# `peft_config` (the trainable-parameter count shown later is far below the
# total parameter count) — confirm against the TRL version in use.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
# Inspect, in order: the optimizer class and kwargs selected for
# `training_args`, the trainer's trainable-parameter count, and the value of
# `model.num_parameters()`.
trainer.get_optimizer_cls_and_kwargs(training_args)
trainer.get_num_trainable_parameters()
model.num_parameters()

(bitsandbytes.optim.adamw.AdamW,
 {'lr': 0.0002,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'optim_bits': 8,
  'is_paged': False})

78848

9074816

In [16]:
# Baseline training run, executed before the fused cross-entropy patch is
# applied to the trainer's model; keeps the returned statistics for reference.
train_stats = trainer.train()

  0%|          | 0/5 [00:00<?, ?it/s]

{'loss': 10.4053, 'grad_norm': 0.49609375, 'learning_rate': 4e-05, 'epoch': 0.0, 'num_input_tokens_seen': 512}
{'loss': 10.4174, 'grad_norm': 0.40234375, 'learning_rate': 8e-05, 'epoch': 0.0, 'num_input_tokens_seen': 1024}
{'loss': 10.3617, 'grad_norm': 0.5234375, 'learning_rate': 0.00012, 'epoch': 0.0, 'num_input_tokens_seen': 1218}
{'loss': 10.4249, 'grad_norm': 0.484375, 'learning_rate': 0.00016, 'epoch': 0.0, 'num_input_tokens_seen': 1730}
{'loss': 10.3909, 'grad_norm': 0.55078125, 'learning_rate': 0.0, 'epoch': 0.0, 'num_input_tokens_seen': 1920}
{'train_runtime': 2.9042, 'train_samples_per_second': 3.443, 'train_steps_per_second': 1.722, 'train_tokens_per_second': 881.475, 'train_loss': 10.400027847290039, 'init_mem_cpu_alloc_delta': 4096, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 552894464, 'train_mem_gpu_alloc_delta': 17676288, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 2208977

In [18]:
# Replace the trainer's model with the result of the fused cross-entropy patch.
# NOTE(review): `trainer.model` has already been trained by the baseline run
# and was built with a PEFT config — confirm that `patch_model_fused_cel`
# supports a model in that state.
trainer.model = patch_model_fused_cel(trainer.model, use_fused_cel=True)

In [19]:
# Second training run on the same trainer, now with the patched model.
# NOTE(review): the captured output of this cell is a RuntimeError: mat1 and
# mat2 shapes cannot be multiplied (510x32000 and 128x32000). The 32000-wide
# left operand looks like vocabulary-sized logits reaching a matmul that
# expects hidden states — TODO confirm where the fused loss takes its input.
fused_stats = trainer.train()

  0%|          | 0/5 [00:00<?, ?it/s]



RuntimeError: mat1 and mat2 shapes cannot be multiplied (510x32000 and 128x32000)

: 

In [23]:
# Debugging aid: pull the first batch from the trainer's training dataloader.
dataloader = trainer.get_train_dataloader()
batch = next(iter(dataloader))

In [28]:
# NOTE: this rebinds `input_ids` and `attention_mask`, replacing the synthetic
# tensors created earlier in the notebook with the real batch tensors.
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]

In [31]:
# Count the entries of the first sequence whose token id is not 0.
# NOTE(review): 0 is presumably the padding id (<unk> was assigned as the pad
# token earlier) — verify, or compare against `tokenizer.pad_token_id`.
(input_ids[0] != 0).sum()

tensor(107, device='cuda:0')

In [30]:
# Count the non-zero attention-mask entries of the first sequence; the
# displayed value matches the non-zero token-id count for the same sequence.
(attention_mask[0] != 0).sum()

tensor(107, device='cuda:0')