# Finetuning

1. Upload an eval_config file into this directory, save it as `config.json`
2. Add a `.env` file, the contents should be:
```
export HUGGINGFACE_API_KEY=xxxx
```

where `xxxx` is replaced with the key


## Install Prerequisite Packages

In [3]:
# This is necessary for colab
!pip install python-dotenv
!pip install datasets
!pip install plotly
!pip install nbformat
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0

Collecting xformers
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.12.1
    Uninstalling trl-0.12.1:
      Successfully uninstalled trl-0.12.1
Successfully installed trl-0.8.6 xformers-0.0.28.post3


## Imports, Load `.env`

In [4]:
import os
import sys
import json
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

from datasets import Dataset
from datasets import load_dataset

from dotenv import find_dotenv, load_dotenv

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# The file .env should contain the line (where xxxxxxx is replaced):
# export HUGGINGFACE_API_KEY=xxxxxxx
load_dotenv()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


True



## Load Config

In [5]:
FINETUNING_DATASET_NAME="CPSC532/arxiv_qa_data"
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
PACKING = True

with open('config.json') as f:
    config = json.load(f)
finetune_config_template = config["template"]["finetune_config"]

## API Keys

In [6]:
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")
if HF_TOKEN is None:
  raise EnvironmentError("A hugggingface token is necessary!")

## Helper Functions

In [10]:
def convert_to_messages_format(example):
    return [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']},
    ]

## Finetuning Function

In [11]:
def finetune(pipeline_config_name, finetune_config):
  max_seq_length = finetune_config["max_seq_length"] # Choose any! We auto support RoPE Scaling internally!
  dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
  load_in_4bit = finetune_config["load_in_4bit"] # Use 4bit quantization to reduce memory usage. Can be False.
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = finetune_config["base_model_name"], # or choose "unsloth/Llama-3.2-1B-Instruct"
      # model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
      # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  model = FastLanguageModel.get_peft_model(
      model,
      r = finetune_config["r"], # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
      target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
      lora_alpha = 16,
      lora_dropout = 0, # Supports any, but = 0 is optimized
      bias = "none",    # Supports any, but = "none" is optimized
      # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
      use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
      random_state = 3407,
      use_rslora = True,  # We support rank stabilized LoRA
      loftq_config = None, # And LoftQ
  )
  tokenizer = get_chat_template(
      tokenizer,
      chat_template = "llama-3.1",
  )
  dataset_finetune = load_dataset(
      FINETUNING_DATASET_NAME,
      pipeline_config_name,
      split="train",
      token=HF_TOKEN
  )
  print(dataset_finetune)
  print(dataset_finetune['question'][0])
  print(dataset_finetune['answer'][0])
  dataset_finetune = dataset_finetune.map(
      lambda x: {
          'conversations' : convert_to_messages_format(x)
          }
  )
  def formatting_prompts_func(examples):
      convos = examples["conversations"]
      texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
      return { "text" : texts, }
  dataset_finetune = dataset_finetune.map(formatting_prompts_func, batched = True)
  print(dataset_finetune['text'][0])
  trainer = SFTTrainer(
      model = model,
      tokenizer = tokenizer,
      train_dataset = dataset_finetune,
      dataset_text_field = "text",
      max_seq_length = max_seq_length,
      data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
      dataset_num_proc = 1,  # Affects memory usage
      packing = finetune_config["packing"], # Can make training 5x faster for short sequences.
      args = TrainingArguments(
          per_device_train_batch_size = finetune_config["batch_size"], # Affects memory usage
          gradient_accumulation_steps = finetune_config["gradient_accumulation_steps"],
          warmup_steps = 5,
          num_train_epochs = finetune_config["num_train_epochs"], # Set this for 1 full training run.
          # max_steps = 60,
          learning_rate = 2e-4,
          fp16 = not is_bfloat16_supported(),
          bf16 = is_bfloat16_supported(),
          logging_steps = 1,
          optim = "adamw_8bit",
          weight_decay = 0.01,
          lr_scheduler_type = "linear",
          seed = 3407,
          output_dir = "outputs",
          report_to = "none"
      ),
  )
  trainer = train_on_responses_only(
      trainer,
      instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
      response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
  )
  print(tokenizer.decode(trainer.train_dataset[0]["input_ids"]))
  space = tokenizer(" ", add_special_tokens = False).input_ids[0]
  print(tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]))
  gpu_stats = torch.cuda.get_device_properties(0)
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
  print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
  print(f"{start_gpu_memory} GB of memory reserved.")
  trainer_stats = trainer.train()
  used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
  used_percentage = round(used_memory         /max_memory*100, 3)
  lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
  print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
  print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
  print(f"Peak reserved memory = {used_memory} GB.")
  print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
  print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
  output_model_name = finetune_config["output_model_name"]
  model.push_to_hub_gguf(
          f"CPSC532/{output_model_name}",
          tokenizer,
          quantization_method = finetune_config["quantization_methods"],
          token = HF_TOKEN
      )

## Run Finetuning

In [None]:
for include_source in config["matrix"]["pipeline_config.sources"]:
    for llm_config in config["matrix"]["pipeline_config.llm"]:
        pipeline_config_name = (
            f"src_{include_source}_llm_{llm_config['model']}".replace(
                "/", "-"
            )
        )
        for r in config["matrix"]["finetune_config.r"]:
          output_model_name = f"{pipeline_config_name}_r{r}"
          finetune_config = {
              **finetune_config_template,
              "r" : r,
              "output_model_name" : output_model_name
          }
          finetune(pipeline_config_name, finetune_config)


==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 750
})
What is the scoring calculation method used in the EQBench benchmark, as mentioned in 'AgentInstruct Toward Generative Teaching With Agentic Flows'?
According to the provided text, the scoring calculation method used in the EQBench benchmark is as follows:

"The metric scores are generated using both the version 1 and 2 implementations described in the EQBench paper and the creators’ github reposito

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the scoring calculation method used in the EQBench benchmark, as mentioned in 'AgentInstruct Toward Generative Teaching With Agentic Flows'?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

According to the provided text, the scoring calculation method used in the EQBench benchmark is as follows:

"The metric scores are generated using both the version 1 and 2 implementations described in the EQBench paper and the creators’ github repository. The scoring calculation is calibrated such that a score of 0 corresponds to answering randomly, and a 100 would denote perfect alignment with the reference answer."

This indicates that the EQBench benchmark uses a scoring system that ranges from 0 to 100, where 0 represents random or incorrect answers and 100 represents perfect alignment with the reference a

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are the potential biases in the fine-tuning process due to the imbalance in knowledge levels across different relations, as observed in 'Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations'?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The potential biases in the fine-tuning process due to the imbalance in knowledge levels across different relations are:

1. **Knowledge level imbalance**: The table shows that different relations have varying knowledge levels, with some relations having a higher proportion of "HighlyKnown" examples and others having a higher proportion of "Unknown" examples. This imbalance may affect the fine-tuning process and the ability of the model to generalize to new knowledge.
2. **Overfitting to known knowledge**: The model may overfit to the known knowledge in the fine-tuning 

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 190
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,1.6437
2,1.5476
3,1.8039
4,1.3224
5,0.9995
6,1.1444
7,0.9591
8,1.1432
9,0.9067
10,0.8604


656.1337 seconds used for training.
10.94 minutes used for training.
Peak reserved memory = 7.539 GB.
Peak reserved memory for training = 2.269 GB.
Peak reserved memory % of max memory = 19.055 %.
Peak reserved memory for training % of max memory = 5.735 %.


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 57.72 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 34.44it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r16 into bf16 GGUF format.
The output location will be /content/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r16/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r16
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.in

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r16


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r16
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 750
})
What is the scoring calculation method used in the EQBench benchmark, as mentioned in 'AgentInstruct Toward Generative Teaching With Agentic Flows'?
According to the provided text, the scoring calculation method used in the EQBench benchmark is as follows:

"The metric scores are generated using both the version 1 and 2 implementations described in the EQBench paper and the creators’ github repository. The scoring calculation is calibrated such that a score of 0 corresponds to answering randomly, and a 100 would denote perfect alignment with the reference answer."

This indicates that the EQBench benchmark uses a scoring system that ranges from 0 to 100, where 0 represents random or incorrect answers and 100 represents perfect alignment with the reference answer.

Source:
The text does not explicitly mention the paper "AgentInstruct Toward Generative Teachi

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 190
 "-____-"     Number of trainable parameters = 194,510,848


Step,Training Loss
1,1.6437
2,1.5476
3,1.5382
4,1.1755
5,0.8887
6,1.0191
7,0.9012
8,1.0986
9,0.8792
10,0.8096


655.1392 seconds used for training.
10.92 minutes used for training.
Peak reserved memory = 11.414 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 28.849 %.
Peak reserved memory for training % of max memory = 0.0 %.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.95 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 94.13it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r128 into bf16 GGUF format.
The output location will be /content/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r128/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r128
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,         

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r128


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/src_True_llm_meta-llama-Llama-3.1-8B-Instruct_r128
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1233 [00:00<?, ? examples/s]

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 1233
})
Summarize the process of text translation and its goal as outlined in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'.
The process of text translation, as outlined in the provided text, involves converting text from one language to another while striving to preserve the original meaning as closely as possible. The primary goal of text translation is to ensure that the translated text conveys the same message, tone, and context as the source text, allowing for effective communication across different languages. This process is crucial for making information accessible to a wider audience and facilitating understanding among speakers of different languages.

In summary, text translation aims to maintain the integrity of the original content while adapting it to a new linguistic context, ensuring that the essence and intent of the mess

Map:   0%|          | 0/1233 [00:00<?, ? examples/s]

Map:   0%|          | 0/1233 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the process of text translation and its goal as outlined in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The process of text translation, as outlined in the provided text, involves converting text from one language to another while striving to preserve the original meaning as closely as possible. The primary goal of text translation is to ensure that the translated text conveys the same message, tone, and context as the source text, allowing for effective communication across different languages. This process is crucial for making information accessible to a wider audience and facilitating understanding among speakers of different languages.

In summary, text translation aims to maintain the integrity of the original co

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

ITE: ENABLING LLMS TO GENERATE FINEGRAINED CITATIONS IN LONG-CONTEXT QA'.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The differences in how LongSFT-9B and LongCite-9B handle the query about Duke Energy and Affiliated Managers Group's locations are notable in their approaches to information retrieval and citation.

1. **Information Accuracy**:
   - **LongSFT-9B**: This model inaccurately states that both Duke Energy and Affiliated Managers Group have offices in Massachusetts. It does this by copying the office locations of Affiliated Managers Group and applying them to Duke Energy, leading to a hallucination regarding Duke Energy's location.
   - **LongCite-9B**: In contrast, LongCite-9B accurately identifies that Duke Energy is headquartered in Charlotte, North Carolina, while Affiliated Managers Group has an office in Prides Crossing, Massachusetts. This model correctly distinguishes between the two companies' locations.

2. **Citations**:
   - **LongSFT-9B**: The respons

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 37 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 370
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,1.6053
2,1.7438
3,1.6777
4,1.4373
5,1.1071
6,1.1704
7,1.2466
8,1.0617
9,1.0606
10,1.1243


1259.7695 seconds used for training.
21.0 minutes used for training.
Peak reserved memory = 12.297 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 31.081 %.
Peak reserved memory for training % of max memory = 0.0 %.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.71 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 102.17it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/src_True_llm_gpt-4o-mini_r16 into bf16 GGUF format.
The output location will be /content/CPSC532/src_True_llm_gpt-4o-mini_r16/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: src_True_llm_gpt-4o-mini_r16
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loadin

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/src_True_llm_gpt-4o-mini_r16


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/src_True_llm_gpt-4o-mini_r16
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 1233
})
Summarize the process of text translation and its goal as outlined in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'.
The process of text translation, as outlined in the provided text, involves converting text from one language to another while striving to preserve the original meaning as closely as possible. The primary goal of text translation is to ensure that the translated text conveys the same message, tone, and context as the source text, allowing for effective communication across different languages. This process is crucial for making information accessible to a wider audience and facilitating understanding among speakers of different languages.

In summary, text translation aims to maintain the integrity of the original content while adapting it to a new linguistic context, ensuring that the essence and intent of the mess

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 37 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 370
 "-____-"     Number of trainable parameters = 194,510,848


Step,Training Loss
1,1.6053
2,1.7438
3,1.5174
4,1.3202
5,1.0021
6,1.0724
7,1.1508
8,1.0178
9,0.9895
10,1.073


1276.812 seconds used for training.
21.28 minutes used for training.
Peak reserved memory = 12.297 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 31.081 %.
Peak reserved memory for training % of max memory = 0.0 %.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.72 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 94.36it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/src_True_llm_gpt-4o-mini_r128 into bf16 GGUF format.
The output location will be /content/CPSC532/src_True_llm_gpt-4o-mini_r128/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: src_True_llm_gpt-4o-mini_r128
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loa

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/src_True_llm_gpt-4o-mini_r128


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/src_True_llm_gpt-4o-mini_r128
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/923 [00:00<?, ? examples/s]

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 923
})
What are the 18 types of text modifications mentioned in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'?
The paper does not mention "AgentInstruct Toward Generative Teaching With Agentic Flows" as the source of the text modifications. However, the text provided lists 18 types of text modifications:

1. Text Compression
2. Text Simplification
3. Text Expansion
4. Text Translation
5. Text Formatting
6. Sentiment Modification
7. Text Annotation
8. Keyword Replacement
9. Text Removing
10. Text Capitalization
11. Text Styling
12. Content Rewriting
13. Data Normalization
14. Plagiarism Rewording
15. Code Switching
16. Text Obfuscation
17. Textual Entailment
18. Rewriting with vocabulary limitations

These types of text modifications are mentioned in the provided text, but not in the paper "AgentInstruct Toward Generative Teaching With Age

Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/923 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are the 18 types of text modifications mentioned in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The paper does not mention "AgentInstruct Toward Generative Teaching With Agentic Flows" as the source of the text modifications. However, the text provided lists 18 types of text modifications:

1. Text Compression
2. Text Simplification
3. Text Expansion
4. Text Translation
5. Text Formatting
6. Sentiment Modification
7. Text Annotation
8. Keyword Replacement
9. Text Removing
10. Text Capitalization
11. Text Styling
12. Content Rewriting
13. Data Normalization
14. Plagiarism Rewording
15. Code Switching
16. Text Obfuscation
17. Textual Entailment
18. Rewriting with vocabulary limitations

These types of text modifications are m

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

 mins |
| Llama 2 Chat 70B | 7 GB | 65 GB | 40 mins |

Note that the PEFT method used in each configuration is LORA for the 7B and 13B models, and QLORA for the 70B model.

Source:
Table 2: Maximum possible PEFT configurations of Llama 2 models on A100 80 GB<|eot_id|><|eot_id|><|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are the key differences between Quantization (QAT) and Post-Training Quantization (PTQ) schemes?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The key differences between Quantization (QAT) and Post-Training Quantization (PTQ) schemes are:

1.  **Quantization (QAT)**: This is a two-step process that involves finding the normalization constant and scaling the vector into the target range, followed by rounding off to the nearest value in the target range. During matrix multiplication of tensors, quantization o

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 28 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 280
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,1.6059
2,4.8688
3,0.7604
4,1.5469
5,1.2666
6,0.9806
7,0.9452
8,0.8882
9,0.8817
10,0.8978


943.7758 seconds used for training.
15.73 minutes used for training.
Peak reserved memory = 12.297 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 31.081 %.
Peak reserved memory for training % of max memory = 0.0 %.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.62 out of 83.48 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 103.80it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at CPSC532/src_False_llm_meta-llama-Llama-3.1-8B-Instruct_r16 into bf16 GGUF format.
The output location will be /content/CPSC532/src_False_llm_meta-llama-Llama-3.1-8B-Instruct_r16/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: src_False_llm_meta-llama-Llama-3.1-8B-Instruct_r16
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,         

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/CPSC532/src_False_llm_meta-llama-Llama-3.1-8B-Instruct_r16


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/CPSC532/src_False_llm_meta-llama-Llama-3.1-8B-Instruct_r16
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer', 'pass_through'],
    num_rows: 923
})
What are the 18 types of text modifications mentioned in the paper 'AgentInstruct Toward Generative Teaching With Agentic Flows'?
The paper does not mention "AgentInstruct Toward Generative Teaching With Agentic Flows" as the source of the text modifications. However, the text provided lists 18 types of text modifications:

1. Text Compression
2. Text Simplification
3. Text Expansion
4. Text Translation
5. Text Formatting
6. Sentiment Modification
7. Text Annotation
8. Keyword Replacement
9. Text Removing
10. Text Capitalization
11. Text Styling
12. Content Rewriting
13. Data Normalization
14. Plagiarism Rewording
15. Code Switching
16. Text Obfuscation
17. Textual Entailment
18. Rewriting with vocabulary limitations

These types of text modifications are mentioned in the provided text, but not in the paper "AgentInstruct Toward Generative Teaching With Age

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 28 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 280
 "-____-"     Number of trainable parameters = 194,510,848


Step,Training Loss
1,1.6059
2,4.8688
3,0.7396
4,1.351
5,1.1584
6,0.8829
7,0.8829
8,0.8442
9,0.8466
10,0.8455


In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# training_df = pd.DataFrame(trainer.state.log_history)

In [None]:
# training_df.head(2)

In [None]:
import sys
if not 'google.colab' in sys.modules:
  from helpers import create_training_plots
  fig = create_training_plots(training_df)
  fig.show()
  training_df.to_csv(f"training_logs/{OUTPUT_MODEL_NAME}.csv", index = False)

## Run Inference

In [None]:
# from unsloth.chat_templates import get_chat_template

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3.1",
# )
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# def get_response(user_query):
#     messages = [
#     {"role": "user", "content": user_query},
#     ]
#     inputs = tokenizer.apply_chat_template(
#         messages,
#         tokenize = True,
#         add_generation_prompt = True, # Must add for generation
#         return_tensors = "pt",
#     ).to("cuda")

#     outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
#                             temperature = 1.5, min_p = 0.1)
#     return tokenizer.batch_decode(outputs)

In [None]:
# dataset_finetune['question'][0]

Need to investigate how changing the question affects responses

In [None]:
# resp = get_response(dataset_finetune['question'][0])
# print(resp[0].split("<|start_header_id|>assistant<|end_header_id|>")[1])