In [None]:
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /private/var/folders/46/8yh56x491r1cj78tqrzzgzqc0000gn/T/pip-install-0g19sjco/unsloth_08f87026fa8f45a7aa92051a3c0c1559
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /private/var/folders/46/8yh56x491r1cj78tqrzzgzqc0000gn/T/pip-install-0g19sjco/unsloth_08f87026fa8f45a7aa92051a3c0c1559


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

ModuleNotFoundError: No module named 'unsloth'

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
text = "You will only answer in nepali, here is the question: ‡§§‡§ø‡§Æ‡•ç‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§ï‡•á ‡§π‡•ã ?"
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(text, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 1000, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>You will only answer in nepali, here is the question: ‡§§‡§ø‡§Æ‡•ç‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§ï‡•á ‡§π‡•ã ?\n\nAnswer:\n\nStep 1/2\nFirst, we need to understand the question in Nepali. The question is asking for the name of the person who is asking the question. So, the question is: "Who are you?"\n\nStep 2/2\nNow, we can answer the question in Nepali: "‡§Æ‡•à‡§Ç ‡§§‡§ø‡§Æ‡•ç‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§ï‡•á ‡§π‡•ã‡•§" (I am yours, my name is yours.)<eos>']

<a name="Data"></a>

### Data Prep

We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `ChatML` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).


In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Context:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Context:
{}

### Input:
{}

### Response:
{}"""

inputs = tokenizer(
[
    prompt.format(
        "Given the context, you have to reply it to the user in NEPALI letters. Don't hallucinate or repeat same, just give the exact answer.", # instruction
        """‡§®‡•á‡§™‡§æ‡§≤ ‡§¶‡§ï‡•ç‡§∑‡§ø‡§£ ‡§è‡§∏‡§ø‡§Ø‡§æ‡§ï‡•ã ‡§è‡§â‡§ü‡§æ ‡§∏‡§æ‡§®‡•ã ‡§§‡§∞ ‡§™‡•ç‡§∞‡§æ‡§ï‡•É‡§§‡§ø‡§ï ‡§∞ ‡§∏‡§æ‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø‡§ï ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§∏‡§Æ‡•É‡§¶‡•ç‡§ß ‡§¶‡•á‡§∂ ‡§π‡•ã‡•§ ‡§Ø‡•ã ‡§¶‡•á‡§∂ ‡§µ‡§ø‡§∂‡•ç‡§µ‡§ï‡•ã ‡§∏‡§∞‡•ç‡§µ‡•ã‡§ö‡•ç‡§ö ‡§∂‡§ø‡§ñ‡§∞ ‡§∏‡§ó‡§∞‡§Æ‡§æ‡§•‡§æ‡§¨‡§æ‡§ü ‡§≤‡§ø‡§è‡§∞ ‡§§‡§∞‡§æ‡§à‡§ï‡§æ ‡§∏‡§Æ‡§•‡§∞ ‡§´‡§æ‡§Å‡§ü‡§∏‡§Æ‡•ç‡§Æ ‡§´‡•à‡§≤‡§ø‡§è‡§ï‡•ã ‡§õ‡•§ ‡§Ø‡§∏‡§ï‡•ã ‡§≠‡•å‡§ó‡•ã‡§≤‡§ø‡§ï ‡§µ‡§ø‡§µ‡§ø‡§ß‡§§‡§æ ‡§∏‡§Ç‡§∏‡§æ‡§∞‡§Æ‡§æ ‡§Ö‡§∞‡•Ç ‡§ï‡•Å‡§®‡•à ‡§¶‡•á‡§∂‡§∏‡§Å‡§ó ‡§§‡•Å‡§≤‡§®‡§æ ‡§ó‡§∞‡•ç‡§® ‡§∏‡§ï‡§ø‡§Å‡§¶‡•à‡§®‡•§ ‡§Ø‡§π‡§æ‡§Å ‡§§‡•Ä‡§® ‡§≠‡•å‡§ó‡•ã‡§≤‡§ø‡§ï ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§π‡§∞‡•Ç ‡§õ‡§®‡•ç‚Äî‡§π‡§ø‡§Æ‡§æ‡§≤, ‡§™‡§π‡§æ‡§°, ‡§∞ ‡§§‡§∞‡§æ‡§à‡•§ ‡§π‡§ø‡§Æ‡§æ‡§≤ ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§Æ‡§æ ‡§ß‡•á‡§∞‡•à ‡§™‡•ç‡§∞‡§∏‡§ø‡§¶‡•ç‡§ß ‡§π‡§ø‡§Æ‡§æ‡§≤‡§π‡§∞‡•Ç ‡§õ‡§®‡•ç, ‡§ú‡§∏‡•ç‡§§‡•à ‡§∏‡§ó‡§∞‡§Æ‡§æ‡§•‡§æ, ‡§Ö‡§®‡•ç‡§®‡§™‡•Ç‡§∞‡•ç‡§£, ‡§∞ ‡§Æ‡§®‡§æ‡§∏‡•ç‡§≤‡•Å‡•§ ‡§Ø‡•Ä ‡§π‡§ø‡§Æ‡§æ‡§≤‡§π‡§∞‡•Ç‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡§æ‡§à ‡§ü‡•ç‡§∞‡•á‡§ï‡§ø‡§ô ‡§∞ ‡§™‡§∞‡•ç‡§µ‡§§‡§æ‡§∞‡•ã‡§π‡§£‡§ï‡•ã ‡§ó‡§®‡•ç‡§§‡§µ‡•ç‡§Ø‡§ï‡§æ ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§™‡§∞‡§ø‡§ö‡§ø‡§§ ‡§¨‡§®‡§æ‡§è‡§ï‡§æ ‡§õ‡§®‡•ç‡•§
        ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡•ã ‡§∏‡§æ‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø‡§ï ‡§µ‡§ø‡§µ‡§ø‡§ß‡§§‡§æ ‡§™‡§®‡§ø ‡§Ø‡§π‡§æ‡§Å‡§ï‡•ã ‡§™‡•ç‡§∞‡§Æ‡•Å‡§ñ ‡§™‡§π‡§ø‡§ö‡§æ‡§® ‡§π‡•ã‡•§ ‡§®‡•á‡§™‡§æ‡§≤‡§Æ‡§æ ‡•ß‡•®‡•´ ‡§≠‡§®‡•ç‡§¶‡§æ ‡§¨‡§¢‡•Ä ‡§ú‡§æ‡§§‡§ú‡§æ‡§§‡§ø ‡§∞ ‡•ß‡•®‡•© ‡§≠‡§æ‡§∑‡§æ‡§≠‡§æ‡§∑‡•Ä ‡§∏‡§Æ‡•Å‡§¶‡§æ‡§Ø ‡§¨‡§∏‡•ã‡§¨‡§æ‡§∏ ‡§ó‡§∞‡•ç‡§õ‡§®‡•ç‡•§ ‡§Ø‡•Ä ‡§∏‡§¨‡•à ‡§ú‡§æ‡§§‡§ú‡§æ‡§§‡§ø‡§ï‡§æ ‡§Ü‡§´‡•ç‡§®‡•à ‡§™‡§∞‡§Æ‡•ç‡§™‡§∞‡§æ, ‡§≠‡•á‡§∑‡§≠‡•Ç‡§∑‡§æ, ‡§∞ ‡§ö‡§æ‡§°‡§™‡§∞‡•ç‡§µ ‡§õ‡§®‡•ç‡•§ ‡§¶‡§∂‡•à‡§Ç ‡§∞ ‡§§‡§ø‡§π‡§æ‡§∞‡§ú‡§∏‡•ç‡§§‡§æ ‡§Æ‡•Å‡§ñ‡•ç‡§Ø ‡§ö‡§æ‡§°‡§™‡§∞‡•ç‡§µ ‡§∏‡§¨‡•à ‡§ú‡§æ‡§§‡•Ä‡§Ø ‡§∏‡§Æ‡•Å‡§¶‡§æ‡§Ø‡§≤‡•á ‡§Æ‡§®‡§æ‡§â‡§®‡•á ‡§≠‡§è ‡§™‡§®‡§ø, ‡§ï‡§ø‡§∞‡§æ‡§Å‡§§‡§π‡§∞‡•Ç‡§ï‡•ã ‡§â‡§≠‡•å‡§≤‡•Ä ‡§∞ ‡§â‡§ß‡•å‡§≤‡•Ä, ‡§§‡§æ‡§Æ‡§æ‡§ô‡§π‡§∞‡•Ç‡§ï‡•ã ‡§≤‡•ç‡§π‡•ã‡§∏‡§∞, ‡§∞ ‡§Æ‡§ß‡•á‡§∏‡•Ä ‡§∏‡§Æ‡•Å‡§¶‡§æ‡§Ø‡§ï‡•ã ‡§õ‡§†‡§ú‡§∏‡•ç‡§§‡§æ ‡§ö‡§æ‡§°‡§™‡§∞‡•ç‡§µ‡§≤‡•á ‡§∏‡§æ‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø‡§ï ‡§µ‡§ø‡§µ‡§ø‡§ß‡§§‡§æ‡§≤‡§æ‡§à ‡§ù‡§≤‡•ç‡§ï‡§æ‡§â‡§Å‡§õ‡§®‡•ç‡•§ ‡§ß‡§æ‡§∞‡•ç‡§Æ‡§ø‡§ï ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§™‡§®‡§ø ‡§®‡•á‡§™‡§æ‡§≤ ‡§¨‡§π‡•Å‡§ß‡§æ‡§∞‡•ç‡§Æ‡§ø‡§ï ‡§¶‡•á‡§∂ ‡§π‡•ã‡•§ ‡§Ø‡§π‡§æ‡§Å ‡§π‡§ø‡§®‡•ç‡§¶‡•Ç, ‡§¨‡•å‡§¶‡•ç‡§ß, ‡§á‡§∏‡•ç‡§≤‡§æ‡§Æ, ‡§ï‡•ç‡§∞‡§ø‡§∂‡•ç‡§ö‡§ø‡§Ø‡§®‡§ø‡§ü‡•Ä, ‡§∞ ‡§ï‡§ø‡§∞‡§æ‡§Å‡§§ ‡§ß‡§∞‡•ç‡§Æ‡§æ‡§µ‡§≤‡§Æ‡•ç‡§¨‡•Ä‡§π‡§∞‡•Ç ‡§∂‡§æ‡§®‡•ç‡§§‡§ø‡§™‡•Ç‡§∞‡•ç‡§£ ‡§∏‡§π-‡§Ö‡§∏‡•ç‡§§‡§ø‡§§‡•ç‡§µ‡§Æ‡§æ ‡§¨‡§∏‡•ç‡§õ‡§®‡•ç‡•§ ‡§™‡§∂‡•Å‡§™‡§§‡§ø‡§®‡§æ‡§• ‡§Æ‡§®‡•ç‡§¶‡§ø‡§∞, ‡§≤‡•Å‡§Æ‡•ç‡§¨‡§ø‡§®‡•Ä, ‡§∏‡•ç‡§µ‡§Ø‡§Æ‡•ç‡§≠‡•Ç‡§®‡§æ‡§•, ‡§∞ ‡§ú‡§®‡§ï‡§™‡•Å‡§∞‡§ß‡§æ‡§Æ ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡§æ ‡§™‡•ç‡§∞‡§∏‡§ø‡§¶‡•ç‡§ß ‡§ß‡§æ‡§∞‡•ç‡§Æ‡§ø‡§ï ‡§∏‡•ç‡§•‡§≤ ‡§π‡•Å‡§®‡•ç‡•§
        ‡§á‡§§‡§ø‡§π‡§æ‡§∏‡§ï‡•ã ‡§ï‡•Å‡§∞‡§æ ‡§ó‡§∞‡•ç‡§¶‡§æ, ‡§®‡•á‡§™‡§æ‡§≤ ‡§è‡§ï ‡§∏‡•ç‡§µ‡§§‡§®‡•ç‡§§‡•ç‡§∞ ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞ ‡§π‡•ã, ‡§ú‡•Å‡§® ‡§ï‡§π‡§ø‡§≤‡•ç‡§Ø‡•à ‡§™‡§®‡§ø ‡§ï‡•Å‡§®‡•à ‡§µ‡§ø‡§¶‡•á‡§∂‡•Ä ‡§â‡§™‡§®‡§ø‡§µ‡•á‡§∂ ‡§¨‡§®‡•á‡§ï‡•ã ‡§•‡§ø‡§è‡§®‡•§ ‡§™‡•ç‡§∞‡§æ‡§ö‡•Ä‡§® ‡§∏‡§Æ‡§Ø‡§Æ‡§æ ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡§æ‡§à "‡§®‡•á‡§µ‡§æ‡§∞" ‡§µ‡§æ "‡§®‡•á‡§™‡§æ‡§≤ ‡§Æ‡§£‡•ç‡§°‡§≤" ‡§≠‡§®‡•á‡§∞ ‡§ö‡§ø‡§®‡§ø‡§®‡•ç‡§•‡•ç‡§Ø‡•ã‡•§ ‡§ó‡•ã‡§∞‡•ç‡§ñ‡§æ ‡§∞‡§æ‡§ú‡•ç‡§Ø‡§ï‡§æ ‡§∞‡§æ‡§ú‡§æ ‡§™‡•É‡§•‡•ç‡§µ‡•Ä‡§®‡§æ‡§∞‡§æ‡§Ø‡§£ ‡§∂‡§æ‡§π‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡§æ‡§à ‡§è‡§ï‡•Ä‡§ï‡•É‡§§ ‡§ó‡§∞‡•Ä ‡§Ü‡§ß‡•Å‡§®‡§ø‡§ï ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡•ã ‡§∏‡•Å‡§∞‡•Å‡§µ‡§æ‡§§ ‡§ó‡§∞‡•á‡§ï‡§æ ‡§•‡§ø‡§è‡•§ ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡•á ‡•®‡•¶‡•¶‡•≠ ‡§∏‡§æ‡§≤‡§Æ‡§æ ‡§™‡•ç‡§∞‡§ú‡§æ‡§§‡§®‡•ç‡§§‡•ç‡§∞‡§ï‡•ã ‡§∏‡•Å‡§∞‡•Å‡§µ‡§æ‡§§ ‡§ó‡§∞‡•ç‚Äç‡§Ø‡•ã‡•§ ‡§§‡•ç‡§Ø‡§∏‡§™‡§õ‡§ø ‡•®‡•¶‡•¨‡•®/‡•¨‡•© ‡§ï‡•ã ‡§Ü‡§®‡•ç‡§¶‡•ã‡§≤‡§®‡§¨‡§æ‡§ü ‡§®‡•á‡§™‡§æ‡§≤ ‡§≤‡•ã‡§ï‡§§‡§æ‡§®‡•ç‡§§‡•ç‡§∞‡§ø‡§ï ‡§ó‡§£‡§§‡§®‡•ç‡§§‡•ç‡§∞ ‡§¨‡§®‡•á‡§ï‡•ã ‡§π‡•ã‡•§ ‡§π‡§æ‡§≤ ‡§®‡•á‡§™‡§æ‡§≤‡§Æ‡§æ ‡§∏‡§ô‡•ç‡§ò‡•Ä‡§Ø ‡§∏‡§Ç‡§∞‡§ö‡§®‡§æ‡§Æ‡§æ ‡§∏‡§æ‡§§‡§µ‡§ü‡§æ ‡§™‡•ç‡§∞‡§¶‡•á‡§∂‡§π‡§∞‡•Ç ‡§õ‡§®‡•ç‡•§
        ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ‡§ï‡•ã ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§Æ‡§æ ‡§™‡§®‡§ø ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡•á ‡§™‡§õ‡§ø‡§≤‡•ç‡§≤‡•ã ‡§∏‡§Æ‡§Ø‡§Æ‡§æ ‡§∞‡§æ‡§Æ‡•ç‡§∞‡•ã ‡§™‡•ç‡§∞‡§ó‡§§‡§ø ‡§ó‡§∞‡§ø‡§∞‡§π‡•á‡§ï‡•ã ‡§õ‡•§ ‡§™‡•ç‡§∞‡§æ‡§•‡§Æ‡§ø‡§ï ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ‡§Æ‡§æ ‡§â‡§≤‡•ç‡§≤‡•á‡§ñ‡§®‡•Ä‡§Ø ‡§∏‡•Å‡§ß‡§æ‡§∞ ‡§≠‡§è ‡§™‡§®‡§ø ‡§â‡§ö‡•ç‡§ö ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ‡§Æ‡§æ ‡§Ö‡§ù‡•à ‡§∏‡•Å‡§ß‡§æ‡§∞ ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§®‡•á ‡§Ü‡§µ‡§∂‡•ç‡§Ø‡§ï‡§§‡§æ ‡§õ‡•§ ‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç ‡§µ‡§ø‡§∂‡•ç‡§µ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§≤‡§Ø, ‡§§‡•ç‡§∞‡§ø‡§≠‡•Å‡§µ‡§® ‡§µ‡§ø‡§∂‡•ç‡§µ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§≤‡§Ø, ‡§∞ ‡§™‡•Å‡§∞‡•ç‡§µ‡§æ‡§û‡•ç‡§ö‡§≤ ‡§µ‡§ø‡§∂‡•ç‡§µ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§≤‡§Ø‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡•Ä ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§∞‡•ç‡§•‡•Ä‡§π‡§∞‡•Ç‡§≤‡§æ‡§à ‡§â‡§ö‡•ç‡§ö ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ‡§ï‡•ã ‡§Ö‡§µ‡§∏‡§∞ ‡§™‡•ç‡§∞‡§¶‡§æ‡§® ‡§ó‡§∞‡§ø‡§∞‡§π‡•á‡§ï‡§æ ‡§õ‡§®‡•ç‡•§ ‡§Ø‡§¶‡•ç‡§Ø‡§™‡§ø, ‡§µ‡§ø‡§ú‡•ç‡§û‡§æ‡§®, ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø, ‡§∞ ‡§Ö‡§®‡•Å‡§∏‡§®‡•ç‡§ß‡§æ‡§®‡§Æ‡§æ ‡§®‡•á‡§™‡§æ‡§≤ ‡§Ö‡§ù‡•à ‡§ß‡•á‡§∞‡•à ‡§™‡§õ‡§æ‡§°‡§ø ‡§õ‡•§
        ‡§®‡•á‡§™‡§æ‡§≤‡§Æ‡§æ ‡§ß‡•á‡§∞‡•à ‡§∏‡§æ‡§Æ‡§æ‡§ú‡§ø‡§ï ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ‡§π‡§∞‡•Ç ‡§™‡§®‡§ø ‡§õ‡§®‡•ç‡•§ ‡§¨‡•á‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞‡•Ä, ‡§Ö‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ, ‡§ó‡§∞‡§ø‡§¨‡•Ä, ‡§∞ ‡§≤‡•à‡§ô‡•ç‡§ó‡§ø‡§ï ‡§Ö‡§∏‡§Æ‡§æ‡§®‡§§‡§æ ‡§Ø‡§π‡§æ‡§Å‡§ï‡§æ ‡§™‡•ç‡§∞‡§Æ‡•Å‡§ñ ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ ‡§π‡•Å‡§®‡•ç‡•§ ‡§µ‡§ø‡§∂‡•á‡§∑‡§ó‡§∞‡•Ä, ‡§¶‡•Å‡§∞‡•ç‡§ó‡§Æ ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§ï‡§æ ‡§Æ‡§æ‡§®‡§ø‡§∏‡§π‡§∞‡•Ç‡§≤‡•á ‡§∏‡•ç‡§µ‡§æ‡§∏‡•ç‡§•‡•ç‡§Ø ‡§∏‡•á‡§µ‡§æ ‡§∞ ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ‡§Æ‡§æ ‡§Ö‡§ù‡•à ‡§™‡§®‡§ø ‡§™‡§∞‡•ç‡§Ø‡§æ‡§™‡•ç‡§§ ‡§™‡§π‡•Å‡§Å‡§ö ‡§™‡§æ‡§â‡§® ‡§∏‡§ï‡•á‡§ï‡§æ ‡§õ‡•à‡§®‡§®‡•ç‡•§ ‡§ú‡§≤‡§µ‡§æ‡§Ø‡•Å ‡§™‡§∞‡§ø‡§µ‡§∞‡•ç‡§§‡§®‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡•ã ‡§™‡•ç‡§∞‡§æ‡§ï‡•É‡§§‡§ø‡§ï ‡§∏‡•ç‡§∞‡•ã‡§§‡§Æ‡§æ ‡§®‡§ï‡§æ‡§∞‡§æ‡§§‡•ç‡§Æ‡§ï ‡§Ö‡§∏‡§∞ ‡§™‡§æ‡§∞‡•á‡§ï‡•ã ‡§õ‡•§ ‡§π‡§ø‡§Æ‡§®‡§¶‡•Ä‡§π‡§∞‡•Ç ‡§™‡§ó‡•ç‡§≤‡§Å‡§¶‡•à ‡§ú‡§æ‡§®‡•Å ‡§∞ ‡§Ö‡§®‡§ø‡§Ø‡§®‡•ç‡§§‡•ç‡§∞‡§ø‡§§ ‡§µ‡§® ‡§ï‡§ü‡§æ‡§®‡•Ä‡§≤‡•á ‡§™‡§∞‡•ç‡§Ø‡§æ‡§µ‡§∞‡§£‡•Ä‡§Ø ‡§Ö‡§∏‡§®‡•ç‡§§‡•Å‡§≤‡§® ‡§®‡§ø‡§Æ‡•ç‡§§‡•ç‡§Ø‡§æ‡§á‡§∞‡§π‡•á‡§ï‡•ã ‡§õ‡•§
        ‡§Ø‡§¶‡•ç‡§Ø‡§™‡§ø, ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø‡§ï‡•ã ‡§µ‡§ø‡§ï‡§æ‡§∏‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡§Æ‡§æ ‡§ß‡•á‡§∞‡•à ‡§∏‡§ï‡§æ‡§∞‡§æ‡§§‡•ç‡§Æ‡§ï ‡§™‡§∞‡§ø‡§µ‡§∞‡•ç‡§§‡§®‡§π‡§∞‡•Ç ‡§≤‡•ç‡§Ø‡§æ‡§è‡§ï‡•ã ‡§õ‡•§ ‡§á‡§®‡•ç‡§ü‡§∞‡§®‡•á‡§ü ‡§∞ ‡§Æ‡•ã‡§¨‡§æ‡§á‡§≤ ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø‡§≤‡•á ‡§®‡•á‡§™‡§æ‡§≤‡•Ä ‡§ú‡§®‡§§‡§æ‡§ï‡•ã ‡§ú‡•Ä‡§µ‡§®‡§≤‡§æ‡§à ‡§∏‡§π‡§ú ‡§¨‡§®‡§æ‡§è‡§ï‡•ã ‡§õ‡•§ ‡§Æ‡•ã‡§¨‡§æ‡§á‡§≤ ‡§¨‡•à‡§Ç‡§ï‡§ø‡§ô, ‡§à-‡§ï‡§Æ‡§∞‡•ç‡§∏, ‡§∞ ‡§°‡§ø‡§ú‡§ø‡§ü‡§≤ ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ ‡§Ö‡§π‡§ø‡§≤‡•á ‡§≤‡•ã‡§ï‡§™‡•ç‡§∞‡§ø‡§Ø ‡§¨‡§®‡•ç‡§¶‡•à ‡§ó‡§á‡§∞‡§π‡•á‡§ï‡§æ ‡§õ‡§®‡•ç‡•§ ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡•á ‡§∏‡•Ç‡§ö‡§®‡§æ ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø‡§Æ‡§æ ‡§™‡•ç‡§∞‡§ó‡§§‡§ø ‡§ó‡§∞‡§ø‡§∞‡§π‡•á‡§ï‡•ã ‡§≠‡§è ‡§™‡§®‡§ø, ‡§°‡§ø‡§ú‡§ø‡§ü‡§≤ ‡§µ‡§ø‡§≠‡§æ‡§ú‡§®‡§ï‡•ã ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ ‡§Ö‡§ù‡•à ‡§¨‡§æ‡§Å‡§ï‡•Ä ‡§õ‡•§ ‡§ó‡•ç‡§∞‡§æ‡§Æ‡•Ä‡§£ ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§ï‡§æ ‡§Æ‡§æ‡§®‡§ø‡§∏‡§π‡§∞‡•Ç‡§≤‡•á ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø‡§ï‡•ã ‡§´‡§æ‡§á‡§¶‡§æ ‡§≤‡§ø‡§® ‡§∏‡§ï‡•á‡§ï‡§æ ‡§õ‡•à‡§®‡§®‡•ç‡•§
        ‡§Ø‡§∏‡§∞‡•Ä, ‡§®‡•á‡§™‡§æ‡§≤ ‡§™‡•ç‡§∞‡§æ‡§ï‡•É‡§§‡§ø‡§ï, ‡§∏‡§æ‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø‡§ï, ‡§∞ ‡§ê‡§§‡§ø‡§π‡§æ‡§∏‡§ø‡§ï ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§ß‡§®‡•Ä ‡§≠‡§è ‡§™‡§®‡§ø, ‡§Ø‡§π‡§æ‡§Å‡§ï‡§æ ‡§µ‡§ø‡§≠‡§ø‡§®‡•ç‡§® ‡§∏‡§æ‡§Æ‡§æ‡§ú‡§ø‡§ï ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ ‡§∏‡§Æ‡§æ‡§ß‡§æ‡§® ‡§ó‡§∞‡•ç‡§® ‡§Ö‡§ù‡•à ‡§ß‡•á‡§∞‡•à ‡§™‡•ç‡§∞‡§Ø‡§æ‡§∏ ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§®‡•á ‡§Ü‡§µ‡§∂‡•ç‡§Ø‡§ï‡§§‡§æ ‡§õ‡•§ ‡§∏‡§∞‡§ï‡§æ‡§∞‡•Ä ‡§§‡§•‡§æ ‡§ó‡•à‡§∞‡§∏‡§∞‡§ï‡§æ‡§∞‡•Ä ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§π‡§∞‡•Ç‡§≤‡•á ‡§Æ‡§ø‡§≤‡•á‡§∞ ‡§®‡•á‡§™‡§æ‡§≤‡§≤‡§æ‡§à ‡§Ü‡§∞‡•ç‡§•‡§ø‡§ï, ‡§∏‡§æ‡§Æ‡§æ‡§ú‡§ø‡§ï, ‡§∞ ‡§™‡•ç‡§∞‡§µ‡§ø‡§ß‡§ø‡§ï‡•ã ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§Æ‡§æ ‡§Ö‡§ó‡§æ‡§°‡§ø ‡§¨‡§¢‡§æ‡§â‡§® ‡§Ø‡•ã‡§ó‡§¶‡§æ‡§® ‡§™‡•Å‡§∞‡•ç‚Äç‡§Ø‡§æ‡§â‡§®‡•Å ‡§Æ‡§π‡§§‡•ç‡§§‡•ç‡§µ‡§™‡•Ç‡§∞‡•ç‡§£ ‡§õ‡•§""", # context
        """‡§®‡•á‡§™‡§æ‡§≤‡§≤‡§æ‡§à ‡§≠‡•å‡§ó‡•ã‡§≤‡§ø‡§ï ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§ï‡•Å‡§® ‡§§‡•Ä‡§® ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§Æ‡§æ ‡§µ‡§ø‡§≠‡§æ‡§ú‡§® ‡§ó‡§∞‡§ø‡§è‡§ï‡•ã ‡§õ?""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 150, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the context, you have to reply it to the user in NEPALI letters. Don\'t hallucinate or repeat same, just give the exact answer.\n\n### Context:\n‡§®‡•á‡§™‡§æ‡§≤ ‡§¶‡§ï‡•ç‡§∑‡§ø‡§£ ‡§è‡§∏‡§ø‡§Ø‡§æ‡§ï‡•ã ‡§è‡§â‡§ü‡§æ ‡§∏‡§æ‡§®‡•ã ‡§§‡§∞ ‡§™‡•ç‡§∞‡§æ‡§ï‡•É‡§§‡§ø‡§ï ‡§∞ ‡§∏‡§æ‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø‡§ï ‡§∞‡•Ç‡§™‡§Æ‡§æ ‡§∏‡§Æ‡•É‡§¶‡•ç‡§ß ‡§¶‡•á‡§∂ ‡§π‡•ã‡•§ ‡§Ø‡•ã ‡§¶‡•á‡§∂ ‡§µ‡§ø‡§∂‡•ç‡§µ‡§ï‡•ã ‡§∏‡§∞‡•ç‡§µ‡•ã‡§ö‡•ç‡§ö ‡§∂‡§ø‡§ñ‡§∞ ‡§∏‡§ó‡§∞‡§Æ‡§æ‡§•‡§æ‡§¨‡§æ‡§ü ‡§≤‡§ø‡§è‡§∞ ‡§§‡§∞‡§æ‡§à‡§ï‡§æ ‡§∏‡§Æ‡§•‡§∞ ‡§´‡§æ‡§Å‡§ü‡§∏‡§Æ‡•ç‡§Æ ‡§´‡•à‡§≤‡§ø‡§è‡§ï‡•ã ‡§õ‡•§ ‡§Ø‡§∏‡§ï‡•ã ‡§≠‡•å‡§ó‡•ã‡§≤‡§ø‡§ï ‡§µ‡§ø‡§µ‡§ø‡§ß‡§§‡§æ ‡§∏‡§Ç‡§∏‡§æ‡§∞‡§Æ‡§æ ‡§Ö‡§∞‡•Ç ‡§ï‡•Å‡§®‡•à ‡§¶‡•á‡§∂‡§∏‡§Å‡§ó ‡§§‡•Å‡§≤‡§®‡§æ ‡§ó‡§∞‡•ç‡§® ‡§∏‡§ï‡§ø‡§Å‡§¶‡•à‡§®‡•§ ‡§Ø‡§π‡§æ‡§Å ‡§§‡•Ä‡§® ‡§≠‡•å‡§ó‡•ã‡§≤‡§ø‡§ï ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞‡§π‡§∞‡

<a name="Train"></a>

### Train the model

Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 240,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.428 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 240
 "-____-"     Number of trainable parameters = 19,611,648


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
1,1.8921
2,2.4707
3,2.0272
4,2.318
5,2.0937
6,2.0356
7,1.4849
8,1.7584
9,1.5985
10,1.6657


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

537.0524 seconds used for training.
8.95 minutes used for training.
Peak reserved memory = 5.689 GB.
Peak reserved memory for training = 3.261 GB.
Peak reserved memory % of max memory = 38.575 %.
Peak reserved memory for training % of max memory = 22.111 %.


<a name="Inference"></a>

### Inference

Let's run the model! You can change the instruction and input - leave the output blank!


In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Context:
{}

### Input:
{}

### Response:
{}"""

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Given the context, you have to reply it to the user in NEPALI letters.", # instruction
        "‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§Æ ‡§π‡•ã", # context
        "‡§§‡§ø‡§Æ‡•ç‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§ï‡•á ‡§π‡•ã ?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the context, you have to reply it to the user in NEPALI letters.\n\n### Context:\n‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§Æ ‡§π‡•ã\n\n### Input:\n‡§§‡§ø‡§Æ‡•ç‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§ï‡•á ‡§π‡•ã ?\n\n### Response:\nYour name is Ram.<eos>']

You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!


In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1000)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
The next term in the Fibonacci sequence fter the given term 8 is 13.<eos>


<a name="Save"></a>

### Saving, loading finetuned models

To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!


In [None]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:


In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

IndexError: Replacement index 3 out of range for positional args tuple

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.


In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.


In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion

To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):

- `q8_0` - Fast conversion. High resource use, but generally acceptable.
- `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
- `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.


In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).


And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:

1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with ü§ó HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>
