In [9]:
import torch
from transformers import LlamaTokenizer, AutoTokenizer
from transformers.models.llama import LlamaForCausalLM, LlamaConfig
from transformers import BitsAndBytesConfig
import unsloth.memory_utils as memory_utils
import unsloth.data_utils as data_utils
from unsloth.kernels import fused_cel
from unsloth.models._utils import patch_tokenizer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
import logging

logging.basicConfig(level=logging.INFO)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_config = LlamaConfig.from_pretrained("./llama-10m.json")
model = LlamaForCausalLM(model_config).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", model_max_length=4096, padding_side="right"
)



In [12]:
model, tokenizer = patch_tokenizer(model, tokenizer)

In [14]:
model.config.pad_token_id

0

In [4]:
bs, seqlen, in_features = 1, 16, 4096

input_ids = torch.randint(0, model.config.vocab_size, (bs, seqlen), device="cuda")
labels = input_ids.detach().clone()
attention_mask = torch.ones((bs, seqlen), device="cuda")

In [5]:
# model.config.update({
#     "use_fused_cel": True})
import types

model.forward_fused_cel = types.MethodType(fused_cel.forward, model)

In [6]:
out = model(input_ids, labels=labels, attention_mask=attention_mask)

In [7]:
out_fused = model.forward_fused_cel(
    input_ids, labels=labels, attention_mask=attention_mask
)

AttributeError: 'LlamaConfig' object has no attribute 'use_fused_cel'

In [15]:
dataset = data_utils.get_alpaca(tokenizer)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer

max_seq_length = 4096

training_args = TrainingArguments(
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=5,
    learning_rate=2e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Metrics
    skip_memory_metrics=False,
    include_num_input_tokens_seen=True,
    include_tokens_per_second=True,
)

# trainer = Trainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset,
#     dataset_text_field = "text",
#     max_seq_length = max_seq_length,
#     dataset_num_proc = 2,
#     packing = False, # Can make training 5x faster for short sequences.
#     args = training_args)

In [22]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [23]:
dataloader = trainer.get_train_dataloader()

In [24]:
batch = next(iter(dataloader))

In [28]:
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]

In [31]:
(input_ids[0] != 0).sum()

tensor(107, device='cuda:0')

In [30]:
(attention_mask[0] != 0).sum()

tensor(107, device='cuda:0')