In [None]:
# @title Installation
import re
import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [18]:
# @title Import libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

In [None]:
# @title Define model and tokenizer
# Base checkpoint: the Gemma-2b-it model used for instruction fine-tuning.
model_name = "google/gemma-2b-it"

# Hard-coded sequence-length cap; the same value is reused when tokenizing
# the training data.
max_seq_length = 512

# Load the model and its tokenizer through Unsloth, with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    dtype=None,
    max_seq_length=max_seq_length,
    model_name=model_name,
)


In [20]:
# @title Set chat template and attach LoRA adapters
# Override the tokenizer's chat template with a simple bracket-tag format:
#   user turns      -> [USER]...[END_USER]
#   assistant turns -> [ASSISTANT]...[END_ASSISTANT]
#   generation prompt (add_generation_prompt=True) -> a trailing [ASSISTANT]
# Messages with any other role are skipped by the template.
# NOTE(review): these tags are a custom format, not ChatML — confirm that
# replacing whatever template ships with the checkpoint is intended.
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '[USER]' + message['content'] + '[END_USER]' }}{% elif message['role'] == 'assistant' %}{{ '[ASSISTANT]'  + message['content'] + '[END_ASSISTANT]' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[ASSISTANT]' }}{% endif %}"

# Wrap the loaded model with LoRA adapters on the listed projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [66]:
# @title Load dataset and define formatting function
# Load the Dolly dataset.
# NOTE: the "train[:100]" slice caps the data at the first 100 examples for a
# quick run; remove the slice for a real fine-tune.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train[:100]")


# Define a function to format and tokenize it
def format_and_tokenize(examples):
    """Render instruction/response pairs with the chat template and tokenize them.

    Args:
        examples: A batch mapping column names to lists of values. Only the
            "instruction" and "response" columns are read.

    Returns:
        The result of calling the module-level ``tokenizer`` on the rendered
        conversations, padded and truncated to ``max_seq_length``.
    """
    # NOTE(review): the dataset's "context" column is never read here, so it
    # does not reach the prompt — confirm this is intended.
    conversations = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer},
            ],
            tokenize=False,
            add_generation_prompt=False,
        )
        for prompt, answer in zip(examples["instruction"], examples["response"])
    ]

    # Every conversation is padded or truncated to max_seq_length.
    return tokenizer(
        conversations,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )

# Run format_and_tokenize over the dataset in batches and drop the raw text
# columns from the result.
tokenized_dataset = dataset.map(
    format_and_tokenize,
    remove_columns=["instruction", "context", "response", "category"],
    batched=True,
)

# Collator used by the trainer to assemble batches; mlm=False selects
# causal (not masked) language modeling.
# NOTE(review): labels are not built anywhere in this notebook, so they are
# presumably derived from input_ids by the collator — confirm.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# @title Check the formatted data
# Decode the first tokenized example back to text to inspect the rendered
# chat template and the padding.
sample = tokenized_dataset[0]
print(tokenizer.decode(sample["input_ids"]))

In [None]:
# @title Training args and Trainer
# Define training arguments.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=1, # 3 for serious training
    learning_rate=2e-4,
    # Exactly one of fp16/bf16 is enabled, depending on bfloat16 support.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Initialize the Trainer.
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer
# in the transformers version pinned by the installation cell.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
)


In [None]:
# @title Start Training
# Run the training loop with the trainer built in the previous cell.
trainer.train()

In [None]:
# @title Test
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Switch the fine-tuned model to Unsloth's inference mode.
model = FastLanguageModel.for_inference(model)
message = [
  {
      "role": "user",
      "content": "continue the fibonacci sequesnce! Your input is 1, 1, 2, 3, 5, 8,",
  }
]


# Apply the chat template first to get the formatted text, ending with the
# assistant generation prompt.
formatted_text = tokenizer.apply_chat_template(
    message,
    add_generation_prompt=True,
    tokenize=False,  # Get text instead of tokens
)

# Then tokenize with an attention mask and move the tensors to the GPU.
inputs = tokenizer(
    formatted_text,
    return_tensors="pt",
    return_attention_mask=True,
).to("cuda")

# Stream generated text to the cell output as it is produced.
# `text_streamer` was previously passed to generate() without ever being
# defined, which raised a NameError on a fresh kernel.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate using the inputs dict.
_ = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    streamer=text_streamer,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
)
