## Using Unsloth to finetune

## Install Prerequisite Packages

In [1]:
# This is necessary for colab
!pip install python-dotenv
!pip install datasets
!pip install plotly
!pip install nbformat
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0

Collecting xformers
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.12.1
    Uninstalling trl-0.12.1:
      Successfully uninstalled trl-0.12.1
Successfully installed trl-0.8.6 xformers-0.0.28.post3


## Load `.env`

In [19]:
import os
import sys
import json

from datasets import Dataset

from dotenv import find_dotenv, load_dotenv

# The file .env should contain the line (where xxxxxxx is replaced):
# export HUGGINGFACE_API_KEY=xxxxxxx
load_dotenv()

True

## Important Global Parameters

In [10]:
FINETUNING_DATASET_NAME="CPSC532/arxiv_qa_data"

with open('config.json') as f:
    config = json.load(f)
finetune_config = config["finetune_config"]
pipeline_config = config["pipeline_config"]

CONFIG_NAME = pipeline_config["config_name"]
OUTPUT_MODEL_NAME = finetune_config["output_model_name"]


## API Keys

In [20]:
# Could also insert the token here directly
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

Leveraging Unsloth notebooks for finetuning

In [12]:
max_seq_length = finetune_config["max_seq_length"] # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = finetune_config["load_in_4bit"] # Use 4bit quantization to reduce memory usage. Can be False.


In [13]:
from unsloth import FastLanguageModel
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = finetune_config["base_model_name"], # or choose "unsloth/Llama-3.2-1B-Instruct"
    # model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [14]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [15]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset


## Get dataset

In [22]:
if HF_TOKEN is None:
  raise EnvironmentError("A hugggingface token is necessary!")
dataset_finetune = load_dataset(
    FINETUNING_DATASET_NAME,
    CONFIG_NAME,
    split="train",
    token=HF_TOKEN
)

In [23]:
dataset_finetune

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 1259
})

In [24]:

dataset_finetune['question'][0]

"Summarize the concept of agentic flows and their application in generating higher quality data as mentioned in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'."

In [25]:

dataset_finetune['answer'][0]

'Agentic flows refer to a structured approach in generating and transforming text through various specialized agents, each designed to produce specific types of content. This concept is utilized to enhance the quality of synthetic data, which can be used for model customization and continual improvement.\n\nIn the context of the paper "AgentInstruct Toward Generative Teaching With Agentic Flows," agentic flows are applied through different types of generators, such as:\n\n1. **Argument Passage Generator**: Creates passages that articulate arguments, sometimes with logical inconsistencies.\n2. **Debate Passage Generator**: Mimics the structure and content of debate transcripts.\n3. **Conversation Passage Generator**: Generates dialogues.\n4. **Meeting Transcript Generator**: Produces transcripts of meetings.\n5. **Poem Generator**: Creates poetic texts.\n6. **Satirical Passage Generator**: Infuses texts with satire.\n7. **Instructional Passage Generator**: Generates instructional manual

## Convert dataset to messages format

In [26]:
def convert_to_messages_format(example):
    return [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']},
    ]

In [27]:
dataset_finetune = dataset_finetune.map(
    lambda x: {
        'conversations' : convert_to_messages_format(x)
        }
)

Map:   0%|          | 0/1259 [00:00<?, ? examples/s]

In [28]:
dataset_finetune = dataset_finetune.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1259 [00:00<?, ? examples/s]

In [29]:
dataset_finetune['text'][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSummarize the concept of agentic flows and their application in generating higher quality data as mentioned in the paper \'AgentInstruct Toward Generative Teaching With Agentic Flows\'.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAgentic flows refer to a structured approach in generating and transforming text through various specialized agents, each designed to produce specific types of content. This concept is utilized to enhance the quality of synthetic data, which can be used for model customization and continual improvement.\n\nIn the context of the paper "AgentInstruct Toward Generative Teaching With Agentic Flows," agentic flows are applied through different types of generators, such as:\n\n1. **Argument Passage Generator**: Creates passages that articulate arguments, sometimes with l

## Set Training Parameters

In [30]:
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
PACKING = True

In [31]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_finetune,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 1,  # Affects memory usage
    packing = finetune_config["packing"], # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = finetune_config["batch_size"], # Affects memory usage
        gradient_accumulation_steps = finetune_config["gradient_accumulation_steps"],
        warmup_steps = 5,
        num_train_epochs = finetune_config["num_train_epochs"], # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none"
    ),
)

Generating train split: 0 examples [00:00, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs. Look into this

In [32]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [33]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

' enabling the model to generate answers based on real-time document retrieval.\n\n3. **Chain-of-Thought Reasoning**: RAFT employs a chain-of-thought (CoT) response style, which encourages the model to articulate its reasoning process when generating answers. This contrasts with traditional fine-tuning methods that may focus solely on providing direct answers without elaboration. The CoT approach not only aids in understanding but also helps prevent overfitting by enriching the model\'s comprehension of the task.\n\n4. **Dataset Organization**: The RAFT methodology involves organizing the training dataset such that some portions lack golden documents in their context. This design decision is aimed at enhancing the model\'s ability to operate effectively even when complete information is not available, a challenge that traditional fine-tuning methods may not adequately address.\n\n5. **Performance Across Diverse Datasets**: RAFT has been shown to consistently outperform existing fine-tu

In [34]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                                                                                                                                                                                                    \n\nKnowledge distillation enhances the capabilities of smaller models derived from large language models (LLMs) through several mechanisms, as discussed in the paper "What is the Role of Small Models in the LLM Era: A Survey." Here are the key mechanisms:\n\n1. **Knowledge Transfer**: Knowledge distillation involves training a smaller student model to replicate the behavior of a larger teacher model. This process allows the student model to learn from the rich representations and capabilities of the teacher model, enabling it to achieve performance levels that are comparable to those of larger models despite having few

In [35]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
3.275 GB of memory reserved.


## Train

In [36]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 38 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 380
 "-____-"     Number of trainable parameters = 194,510,848


Step,Training Loss
1,1.6018
2,1.5911
3,1.4578
4,1.1959
5,1.0293
6,1.1146
7,0.8995
8,1.0741
9,1.0571
10,0.9644


In [37]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1332.3681 seconds used for training.
22.21 minutes used for training.
Peak reserved memory = 6.566 GB.
Peak reserved memory for training = 3.291 GB.
Peak reserved memory % of max memory = 16.596 %.
Peak reserved memory for training % of max memory = 8.318 %.


In [38]:
import pandas as pd
import matplotlib.pyplot as plt
training_df = pd.DataFrame(trainer.state.log_history)

In [39]:
training_df.head(2)

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,1.6018,8.078729,4e-05,0.026316,1,,,,,
1,1.5911,8.664534,8e-05,0.052632,2,,,,,


In [40]:
import sys
if not 'google.colab' in sys.modules:
  from helpers import create_training_plots
  fig = create_training_plots(training_df)
  fig.show()
  training_df.to_csv(f"training_logs/{OUTPUT_MODEL_NAME}.csv", index = False)

## Run Inference

In [41]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def get_response(user_query):
    messages = [
    {"role": "user", "content": user_query},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                            temperature = 1.5, min_p = 0.1)
    return tokenizer.batch_decode(outputs)

In [42]:
dataset_finetune['question'][0]

"Summarize the concept of agentic flows and their application in generating higher quality data as mentioned in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'."

Need to investigate how changing the question affects responses

In [43]:
resp = get_response(dataset_finetune['question'][0])
print(resp[0].split("<|start_header_id|>assistant<|end_header_id|>")[1])

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




Agentic flows refer to a series of structured techniques or workflows used in generating or augmenting data, particularly in the context of natural language processing (NLP) and related fields. The concept is centered around the idea of systematically creating higher quality data through various transformative processes. These processes can involve complex reasoning, argumentation,


## Save to HF

In [44]:
print(f"Model dtype: {next(model.parameters()).dtype}")


Model dtype: torch.bfloat16


In [45]:
model.push_to_hub_gguf(
        f"CPSC532/{OUTPUT_MODEL_NAME}",
        tokenizer,
        quantization_method = finetune_config["quantization_methods"],
        token = HF_TOKEN
    )

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 57.71 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 40.76it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m', 'bf16'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/eval_no_sources_finetune into bf16 GGUF format.
The output location will be /content/CPSC532/eval_no_sources_finetune/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: eval_no_sources_finetune
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model par

unsloth.BF16.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/eval_no_sources_finetune
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/eval_no_sources_finetune
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/eval_no_sources_finetune
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/eval_no_sources_finetune


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/eval_no_sources_finetune
