## Using Unsloth to finetune

## Install Prerequisite Packages

In [1]:
# This is necessary for colab
!pip install python-dotenv
!pip install datasets
!pip install plotly
!pip install nbformat
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.2 MB/s[0m eta [36m0

Collecting xformers
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.12.1
    Uninstalling trl-0.12.1:
      Successfully uninstalled trl-0.12.1
Successfully installed trl-0.8.6 xformers-0.0.28.post3


## Load `.env`

In [2]:
import os
import sys

from datasets import Dataset

from dotenv import find_dotenv, load_dotenv

load_dotenv()

False

## Important Global Parameters

In [3]:
FINETUNING_DATASET_NAME="CPSC532/arxiv_qa_data"
CONFIG_NAME="2024NOV16_llama_3_1_8b_no_sources_in_question"
OUTPUT_MODEL_NAME="2024NOV16_llama_3_1_8b_no_sources_in_question"
BASE_MODEL_NAME="unsloth/Llama-3.2-3B-Instruct"

## API Keys

In [4]:
# Could also insert the token here directly
# HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')

Leveraging Unsloth notebooks for finetuning

In [5]:
max_seq_length = 16000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [6]:
from unsloth import FastLanguageModel
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_NAME, # or choose "unsloth/Llama-3.2-1B-Instruct"
    # model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [8]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset


## Get dataset

In [9]:
if HF_TOKEN is None:
  raise EnvironmentError("A hugggingface token is necessary!")
dataset_finetune = load_dataset(
    FINETUNING_DATASET_NAME,
    CONFIG_NAME,
    split="train",
    token=HF_TOKEN
)

README.md:   0%|          | 0.00/21.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1755 [00:00<?, ? examples/s]

In [10]:
dataset_finetune

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 1755
})

In [11]:

dataset_finetune['question'][0]

"Why do researchers believe that smaller language models can still outperform larger ones with less training data and smaller model sizes, according to the paper 'Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes'?"

In [12]:

dataset_finetune['answer'][0]

'NO ANSWER FOUND \n\nHowever, I found a similar paper titled "Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes" by Cheng-Yu Hsieh, Chun-Liang Li, CHIH-KUAN YEH, Hootan Nakhost, Yasuhisa Fujii, Alex Jason Ratner, Ranjay Krishna, Chen-Yu Lee, and Tomas Pfister. \n\nAccording to this paper, the researchers believe that smaller language models can still outperform larger ones with less training data and smaller model sizes because of the following reasons:\n\n1. **Knowledge Distillation**: The authors propose a knowledge distillation approach where they train a small student model on a subset of the data used for the large teacher model. This allows the small model to learn from the large model\'s knowledge without requiring as much data or computational resources.\n\n2. **Efficient Training**: By using a smaller model size, the training process becomes more efficient, and the model can be trained faster with less data.\n\n3. **R

In [13]:
FILTER_OUT_STRINGS = ['no answer found', 'no text provided', 'no information provided']
df = dataset_finetune.to_pandas()
for string in FILTER_OUT_STRINGS:
  df = df.loc[~df.answer.str.lower().str.contains(string)].reset_index(drop=True)
dataset_finetune = Dataset.from_pandas(df)

In [14]:
dataset_finetune

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 1619
})

## Convert dataset to messages format

In [15]:
def convert_to_messages_format(example):
    return [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']},
    ]

In [16]:
dataset_finetune = dataset_finetune.map(
    lambda x: {
        'conversations' : convert_to_messages_format(x)
        }
)

Map:   0%|          | 0/1619 [00:00<?, ? examples/s]

In [17]:
dataset_finetune = dataset_finetune.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1619 [00:00<?, ? examples/s]

In [18]:
dataset_finetune['text'][0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are some potential applications or use cases where smaller language models can be more effective than larger ones, according to the paper's findings?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAccording to the paper's findings, there are several potential applications or use cases where smaller language models can be more effective than larger ones:\n\n1. **Efficient Training**: Smaller models require less computational resources and can be trained faster with less data, making them suitable for tasks that require rapid deployment or have limited training data.\n\n2. **Regularization Effect**: The smaller model size acts as a regularization effect, preventing overfitting and allowing the model to generalize better on unseen data.\n\n3. **Improved Generalizability**: Smaller models can

## Set Training Parameters

In [19]:
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
PACKING = False

In [20]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_finetune,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 1,  # Affects memory usage
    packing = PACKING, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = BATCH_SIZE, # Affects memory usage
        gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
        warmup_steps = 5,
        num_train_epochs = 10, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none"
    ),
)

Map:   0%|          | 0/1619 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs. Look into this

In [21]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/1619 [00:00<?, ? examples/s]

In [22]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are some potential applications or use cases where smaller language models can be more effective than larger ones, according to the paper's findings?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAccording to the paper's findings, there are several potential applications or use cases where smaller language models can be more effective than larger ones:\n\n1. **Efficient Training**: Smaller models require less computational resources and can be trained faster with less data, making them suitable for tasks that require rapid deployment or have limited training data.\n\n2. **Regularization Effect**: The smaller model size acts as a regularization effect, preventing overfitting and allowing the model to generalize better on unseen data.\n\n3. **Improved Generalizability**: Smaller models can

In [23]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                       \n\nTransfer learning is an approach for training small language models by leveraging pre-trained models as a starting point. This technique involves using the knowledge and features learned from a larger, pre-trained model to fine-tune a smaller model on a specific task.\n\nThe process of transfer learning typically involves several steps:\n\n1.  **Pre-training**: A large, pre-trained model is trained on a diverse dataset to learn general language representations.\n2.  **Fine-tuning**: The pre-trained model is then fine-tuned on a smaller dataset related to the specific task at hand.\n3.  **Adaptation**: The fine-tuned model is adapted to the specific requirements of the task, such as adjusting the number of parameters or modifying the architecture.\n\nTransfer learning can be applied in various ways, including:\n\n*   **Knowledge distillation**: A smaller model learns to mimic the behavior of a larger pre-trained model.\n*   **M

In [24]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
3.275 GB of memory reserved.


## Train

In [25]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,619 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,020
 "-____-"     Number of trainable parameters = 194,510,848


Step,Training Loss
1,1.8286
2,1.8038
3,1.7371
4,1.7874
5,1.3972
6,1.3055
7,1.7868
8,1.517
9,1.7687
10,1.6725


In [26]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

3392.1773 seconds used for training.
56.54 minutes used for training.
Peak reserved memory = 29.916 GB.
Peak reserved memory for training = 26.641 GB.
Peak reserved memory % of max memory = 75.614 %.
Peak reserved memory for training % of max memory = 67.336 %.


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
training_df = pd.DataFrame(trainer.state.log_history)

In [33]:
training_df.to_csv(f"{CONFIG_NAME}.csv")

In [None]:
training_df.head(2)

In [None]:
import sys
if not 'google.colab' in sys.modules:
  from helpers import create_training_plots
  fig = create_training_plots(training_df)
  fig.show()
  training_df.to_csv(f"training_logs/{OUTPUT_MODEL_NAME}.csv", index = False)

## Run Inference

In [28]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def get_response(user_query):
    messages = [
    {"role": "user", "content": user_query},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                            temperature = 1.5, min_p = 0.1)
    return tokenizer.batch_decode(outputs)

In [29]:
dataset_finetune['question'][0]

"What are some potential applications or use cases where smaller language models can be more effective than larger ones, according to the paper's findings?"

Need to investigate how changing the question affects responses

In [30]:
resp = get_response(dataset_finetune['question'][0])
print(resp[0].split("<|start_header_id|>assistant<|end_header_id|>")[1])

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




According to the paper's findings, there are several potential applications or use cases where smaller language models can be more effective than larger ones:

1. **Efficient Training**: Smaller models require less computational resources and can be trained faster with less data, making them suitable for tasks that require rapid deployment or have limited training data


## Save to HF

In [31]:
print(f"Model dtype: {next(model.parameters()).dtype}")


Model dtype: torch.bfloat16


In [32]:
model.push_to_hub_gguf(
        f"CPSC532/{OUTPUT_MODEL_NAME}",
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m", "not_quantized", "quantized", "f16", "q4_0"],
        token = HF_TOKEN
    )

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 53.14 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:01<00:00, 27.57it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m', 'bf16', 'q4_k_m', 'f16', 'q4_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question into f16 GGUF format.
The output location will be /content/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: 2024NOV16_llama_3_1_8b_no_sources_in_question
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model 

unsloth.F16.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.BF16.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_0.gguf:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/2024NOV16_llama_3_1_8b_no_sources_in_question
