In [1]:
import torch
from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM
from transformers.models.llama import LlamaForCausalLM, LlamaConfig
from transformers import BitsAndBytesConfig
import unsloth.memory_utils as memory_utils
import unsloth.data_utils as data_utils
from unsloth.kernels import fused_cel
from unsloth.models._utils import patch_tokenizer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
import logging

logging.basicConfig(level=logging.INFO)
%load_ext autoreload
%autoreload 2

In [2]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_config = LlamaConfig.from_pretrained("./llama-10m.json")
# ref_model = LlamaForCausalLM(model_config).to("cuda")
small_model = LlamaForCausalLM(model_config).to("cuda")

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", quantization_config=quant_config
)
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", model_max_length=4096, padding_side="right"
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model, tokenizer = patch_tokenizer(model, tokenizer)

meta-llama/Llama-2-7b-chat-hf does not have a padding token! Will use pad_token = <unk>.


In [4]:
bs, seqlen, in_features = 1, 16, 4096

input_ids = torch.randint(0, model.config.vocab_size, (bs, seqlen), device="cuda")
labels = input_ids.detach().clone()
attention_mask = torch.ones((bs, seqlen), device="cuda")

ref_out = model(input_ids, labels=labels, attention_mask=attention_mask)

In [5]:
from unsloth.kernels.fused_cel import patch_model as patch_model_fused_cel

fused_model = patch_model_fused_cel(model, use_fused_cel=True)
fused_out = fused_model(input_ids, labels=labels, attention_mask=attention_mask)



In [6]:
ref_out.loss, fused_out.loss
ref_out.loss.dtype, fused_out.loss.dtype

(tensor(12.6158, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(12.6158, device='cuda:0',
        grad_fn=<FusedCrossEntropyLossFunctionBackward>))

(torch.float32, torch.float32)

In [18]:
out

CausalLMOutputWithPast(loss=tensor(10.3616, device='cuda:0',
       grad_fn=<FusedCrossEntropyLossFunctionBackward>), logits=None, past_key_values=((tensor([[[[ 0.1497,  0.1464,  0.0359,  ..., -0.2845,  0.1901, -0.0329],
          [ 0.1350,  0.2444, -0.0649,  ...,  0.2775, -0.5182, -0.0302],
          [-0.1150, -0.0163, -0.4294,  ...,  0.1441, -0.4375,  0.3417],
          ...,
          [-0.0815, -0.1239,  0.0535,  ..., -0.1271, -0.1182,  0.2920],
          [ 0.0310, -0.1295,  0.1272,  ...,  0.0997, -0.1399,  0.0415],
          [ 0.4750,  0.3073, -0.0646,  ...,  0.0314,  0.0284,  0.1492]],

         [[-0.3523,  0.0220,  0.1917,  ...,  0.0104, -0.2017,  0.6086],
          [-0.2150,  0.1729, -0.0763,  ...,  0.1541, -0.1428, -0.3597],
          [-0.0369,  0.0356,  0.1930,  ..., -0.0281, -0.3698, -0.2495],
          ...,
          [-0.1349,  0.1994,  0.0579,  ...,  0.3320, -0.1827,  0.1457],
          [-0.1372, -0.0760,  0.2372,  ...,  0.0172,  0.0316, -0.0693],
          [ 0.1217,  0.4477

In [7]:
out_fused = model.forward_fused_cel(
    input_ids, labels=labels, attention_mask=attention_mask
)

AttributeError: 'LlamaConfig' object has no attribute 'use_fused_cel'

In [15]:
dataset = data_utils.get_alpaca(tokenizer)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer

max_seq_length = 4096

training_args = TrainingArguments(
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=5,
    learning_rate=2e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Metrics
    skip_memory_metrics=False,
    include_num_input_tokens_seen=True,
    include_tokens_per_second=True,
)

# trainer = Trainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset,
#     dataset_text_field = "text",
#     max_seq_length = max_seq_length,
#     dataset_num_proc = 2,
#     packing = False, # Can make training 5x faster for short sequences.
#     args = training_args)

In [22]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [23]:
dataloader = trainer.get_train_dataloader()

In [24]:
batch = next(iter(dataloader))

In [28]:
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]

In [31]:
(input_ids[0] != 0).sum()

tensor(107, device='cuda:0')

In [30]:
(attention_mask[0] != 0).sum()

tensor(107, device='cuda:0')