In [1]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from datasets import load_dataset

# Load Llama model
model_name = "unsloth/llama-3.2-3b"  # Adjust this if needed
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Apply LoRA
model = FastLanguageModel.get_peft_model(
    base_model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.349 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.22.post7. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.10.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [2]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [3]:
sample_size = 100

dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.select(range(sample_size))

In [4]:
base_model = FastLanguageModel.for_inference(base_model)

In [5]:
from datasets import Dataset

single_example = dataset[0]
print("Training Example:", single_example)

mini_dataset = Dataset.from_dict({
    "instruction": [single_example["instruction"]],
    "input": [single_example["input"]],
    "output": [single_example["output"]]
})

Training Example: {'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'input': '', 'instruction': 'Give three tips for staying healthy.'}


In [6]:
def get_base_model_output(base_model, tokenizer, instruction, input_text):
    # Format with full instruction and input
    full_prompt = alpaca_prompt.format(
        instruction,
        input_text,
        ""  # Empty response section
    )
    
    # Generate output from base model (without LoRA)
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
    outputs = base_model.generate(**inputs, max_new_tokens=256, temperature=0.0)
    reference_output = tokenizer.batch_decode(outputs)[0]
    return reference_output

reference_outputs = []
for i, example in enumerate(mini_dataset):
    reference_output = get_base_model_output(
        base_model,
        tokenizer,
        example["instruction"],
        example["input"]
)
    print("Example:", example, "\n")
    print("Reference Output:", reference_output)
    reference_outputs.append(reference_output)

mini_dataset = mini_dataset.add_column("base_output", reference_outputs)

Example: {'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'} 

Reference Output: <|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a respon

In [7]:
mini_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'base_output'],
    num_rows: 1
})

In [8]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["base_output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [9]:
mini_dataset = mini_dataset.map(formatting_prompts_func, batched=True)

Map: 100%|██████████| 1/1 [00:00<00:00, 216.64 examples/s]


In [10]:
mini_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'base_output', 'text'],
    num_rows: 1
})

Training Experiments

In [11]:
# # RESET model to original state
# model = FastLanguageModel.from_pretrained(
#         model_name=model_name,
#         max_seq_length=2048,
#         dtype=None,
#         load_in_4bit=True,
#     )

# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj",],
#     lora_alpha = 16,
#     lora_dropout = 0, # Supports any, but = 0 is optimized
#     bias = "none",    # Supports any, but = "none" is optimized
#     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
#     random_state = 3407,
#     use_rslora = False,  # We support rank stabilized LoRA
#     loftq_config = None, # And LoftQ
# )

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

max_seq_length = 4096
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = mini_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    # packing = True, # Packs short sequences together to save time!
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 10,
        learning_rate = 2e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map: 100%|██████████| 1/1 [00:00<00:00, 247.83 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
minimal_prompt = "Generate a response:\n"
minimal_input = ""

FastLanguageModel.for_inference(base_model)
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        minimal_prompt, # instruction
        minimal_input, # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

base_outputs = base_model.generate(**inputs, max_new_tokens = 64, use_cache = True)
lora_outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

print("Base Output:", tokenizer.batch_decode(base_outputs)[0])
print("LoRA Output:", tokenizer.batch_decode(lora_outputs)[0])

Base Output: <|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a response:


### Input:


### Response:
<|end_of_text|>
LoRA Output: <|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a response:


### Input:


### Response:
<|end_of_text|>


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,1.2652
2,1.2652
3,1.2618
4,1.2587
5,1.2559
6,1.2468
7,1.2456
8,1.2396
9,1.2378
10,1.2334


In [15]:
minimal_prompt = "Generate a response:\n"

test_input = tokenizer(minimal_prompt, return_tensors="pt").to("cuda")
lora_outputs = model.generate(**test_input, max_new_tokens=256)
lora_output = tokenizer.batch_decode(lora_outputs)[0]

print("\nComparison:")
print("Original Output:", reference_output, "\n")
print("-"*100)
print("LoRA Output (no instruction):", lora_output)

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)