In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and local code edits need no kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from datasets import Dataset

def load_messages_with_system(path, system_content="You are a helpful assistant."):
    """Load chat examples from a JSONL file and prepend a system message to each.

    Every non-blank line is parsed as one JSON object. Its "messages" list
    (treated as empty when the key is missing or null) is prefixed with a
    system message carrying ``system_content``.

    Args:
        path: Path to the JSONL file to read.
        system_content: Text of the system message added to every example.

    Returns:
        list[dict]: One ``{"messages": [...]}`` dict per non-blank line, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
    # json.loads would otherwise reject on the first line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing inside json.loads.
            if not line.strip():
                continue
            obj = json.loads(line)
            # A fresh dict per example, so mutating one example later cannot
            # leak into the others.
            sys_msg = {"role": "system", "content": system_content}
            # `or []` also covers an explicit `"messages": null`.
            ua_msgs = obj.get("messages") or []
            examples.append({"messages": [sys_msg] + ua_msgs})
    return examples

# Load the training conversations from train.jsonl, then sanity-check the
# count and the first record.
examples = load_messages_with_system("train.jsonl")
print(f"Loaded {len(examples)} examples.")
print(examples[0])

Loaded 16 examples.
{'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'What evidence does this study provide regarding the impact of non-traditional credentials, specifically MOOC certificates, on job outcomes, and what further research is suggested to explore these findings?'}, {'role': 'assistant', 'content': "**Evidence Provided Regarding Impact:**\n\nThis study provides several pieces of evidence regarding the impact of non-traditional credentials, particularly MOOC certificates, on job outcomes:\n\n1. **Enhanced Job Outcomes Through Simplified Credential Sharing:** The randomized experiment highlighted that features encouraging and simplifying the sharing of credentials can positively influence learners' job prospects.\n   \n2. **Increased Employment Likelihood via LinkedIn Display:** Learners who showcased their Coursera certificates on LinkedIn reported a significant increase in new employment opportunities, especially for r

In [3]:
# Wrap the list of {"messages": [...]} dicts in a Hugging Face Dataset (one row per conversation).
dataset = Dataset.from_list(examples)

In [4]:
print(dataset)

Dataset({
    features: ['messages'],
    num_rows: 16
})


## Fine-tuning Synthetic Dataset with Unsloth

In [5]:
from unsloth import FastLanguageModel
import torch

# Load the Llama 3.2 1B Instruct checkpoint in 4-bit together with its tokenizer.
# full_finetuning is off because LoRA adapters are attached in the next step.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-13 06:24:09 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-13 06:24:09 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.184 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Add LoRA adapters so we only need to update 1-10% of all parameters!

In [6]:
# Wrap the base model with rank-16 LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# PEFT-wrapped model, so re-running this cell without reloading the base model
# would wrap it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Llama-3.2` chat format for conversation-style fine-tuning. The chat template renders a conversation as shown below (the "Cutting Knowledge Date" header is added by default):

```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 May 2025

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is 1+1?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

2<|eot_id|>
```

In [7]:
def formatting_prompts_func(batch):
    """Render every conversation of a batch through the tokenizer's chat template.

    Args:
        batch: Mapping whose "messages" entry holds one message list per example.

    Returns:
        dict: ``{"text": [...]}`` with one rendered result per conversation,
        in the same order as the input.
    """
    rendered = []
    for conversation in batch["messages"]:
        # tokenize=False asks for text rather than token ids; no generation
        # prompt is appended after the conversation.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add a rendered "text" column in batches. No columns are removed, so the
# original 'messages' column is kept alongside it.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [8]:
dataset.column_names

['messages', 'text']

In [9]:
dataset[0]

{'messages': [{'content': 'You are a helpful assistant.', 'role': 'system'},
  {'content': 'What evidence does this study provide regarding the impact of non-traditional credentials, specifically MOOC certificates, on job outcomes, and what further research is suggested to explore these findings?',
   'role': 'user'},
  {'content': "**Evidence Provided Regarding Impact:**\n\nThis study provides several pieces of evidence regarding the impact of non-traditional credentials, particularly MOOC certificates, on job outcomes:\n\n1. **Enhanced Job Outcomes Through Simplified Credential Sharing:** The randomized experiment highlighted that features encouraging and simplifying the sharing of credentials can positively influence learners' job prospects.\n   \n2. **Increased Employment Likelihood via LinkedIn Display:** Learners who showcased their Coursera certificates on LinkedIn reported a significant increase in new employment opportunities, especially for roles related to their MOOC credent

## Train the model

In [10]:
from trl import SFTTrainer, SFTConfig

# Configure supervised fine-tuning on the rendered "text" column.
# NOTE(review): the loss presumably covers the whole rendered conversation
# (system and user turns included), since no response-only masking is set up here — confirm.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        max_steps = 60, # at an effective batch of 8 this is ~30 passes over the 16 examples
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 2025,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]

Current Memory Statistics

In [11]:
# Record the GPU properties and the peak memory reserved so far (before
# training), so the later stats cell can report how much training added.
gpu_stats = torch.cuda.get_device_properties(0)
# Bytes -> GiB via three divisions by 1024, rounded to 3 decimals.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A10G. Max memory = 22.184 GB.
1.457 GB of memory reserved.


In [12]:
# Run the fine-tuning loop; the returned stats (including metrics['train_runtime']) are reported in the next cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16 | Num Epochs = 30 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.061
2,3.13
3,2.8679
4,3.1763
5,2.6315
6,2.7977
7,2.2712
8,2.5725
9,2.3366
10,1.9671


In [13]:
# Report training time and peak reserved GPU memory, in total and relative to
# the pre-training baseline (start_gpu_memory) captured before trainer.train().
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Peak after training minus the baseline = memory attributable to the training run.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

40.2046 seconds used for training.
0.67 minutes used for training.
Peak reserved memory = 2.139 GB.
Peak reserved memory for training = 0.682 GB.
Peak reserved memory % of max memory = 9.642 %.
Peak reserved memory for training % of max memory = 3.074 %.


In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load environment variables (e.g. HUGGINGFACE_ACCESS_TOKEN, read via os.environ in later cells)
# from the .env file located by find_dotenv(); the return value is deliberately discarded.
_ = load_dotenv(find_dotenv())

In [15]:
# Online saving: upload the model and then the tokenizer to the same Hub repo.
hub_repo_id = "tituslhy/retrained_llama32-1bn-finetuned"
for uploadable in (model, tokenizer):
    uploadable.push_to_hub(hub_repo_id, token = os.environ["HUGGINGFACE_ACCESS_TOKEN"])

README.md:   0%|          | 0.00/613 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/tituslhy/retrained_llama32-1bn-finetuned


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [2]:
from unsloth import FastLanguageModel

# Reload the fine-tuned model and tokenizer from the Hub repo pushed earlier.
# Execution counts restart at 1 here, so this runs in a fresh kernel.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "tituslhy/retrained_llama32-1bn-finetuned", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-13 07:09:47 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-13 07:09:47 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.184 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


## Quantization!

In [3]:
# Expose a prebuilt llama.cpp `llama-quantize` binary inside ./llama.cpp under
# both names ("llama-quantize" and "quantize") via symlinks.

# ① Point at the real binary in build/bin
real_q = os.path.expanduser("~/llama.cpp/build/bin/llama-quantize")
# Explicit check instead of `assert`: asserts are stripped when Python runs
# with -O, which would let a missing binary go unnoticed until later.
if not os.path.isfile(real_q):
    raise FileNotFoundError(f"{real_q} not found!")

# ② Make a local 'llama.cpp' folder in your notebook working directory
cwd = os.getcwd()
local_pack = os.path.join(cwd, "llama.cpp")
os.makedirs(local_pack, exist_ok=True)

# ③ Symlink it as 'llama-quantize' and also as 'quantize'
for name in ("llama-quantize", "quantize"):
    link = os.path.join(local_pack, name)
    # lexists is also True for a dangling symlink, so a stale link from a
    # previous run is replaced instead of making os.symlink fail.
    if os.path.lexists(link):
        os.remove(link)
    os.symlink(real_q, link)

# ④ Verify
print("Notebook sees:", os.listdir(local_pack))

Notebook sees: ['.github', 'CODEOWNERS', 'pyproject.toml', 'README.md', 'gguf-py', 'ggml', '.clang-tidy', '.pre-commit-config.yaml', 'examples', 'tests', 'convert_llama_ggml_to_gguf.py', 'cmake', '.gitignore', 'CMakeLists.txt', 'build-xcframework.sh', 'scripts', 'Makefile', 'pocs', 'pyrightconfig.json', 'poetry.lock', 'convert_hf_to_gguf_update.py', 'src', 'docs', 'convert_hf_to_gguf.py', 'mypy.ini', 'llama-quantize', 'CONTRIBUTING.md', 'models', '.git', '.dockerignore', 'AUTHORS', 'requirements.txt', 'licenses', '.clang-format', 'flake.nix', 'prompts', 'tools', '.ecrc', '.flake8', 'grammars', '.devops', 'media', '.editorconfig', 'SECURITY.md', 'LICENSE', 'include', 'requirements', 'flake.lock', 'CMakePresets.json', 'ci', 'build', 'common', '.gitmodules', 'convert_lora_to_gguf.py', 'quantize']


In [5]:
# Convert the model to GGUF, quantize it once per listed method and upload the
# resulting files to the Hub repo. Per the log below this merges the LoRA weights
# to 16-bit first and can take several minutes per quantization.
model.push_to_hub_gguf(
    "tituslhy/retrained_llama32-1bn-finetuned", # Change hf to your username!
    tokenizer,
    quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
    token = os.environ["HUGGINGFACE_ACCESS_TOKEN"], # Get a token at https://huggingface.co/settings/tokens
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.92 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 102.51it/s]

Unsloth: Saving tokenizer...




 Done.
Unsloth: Saving tituslhy/retrained_llama32-1bn-finetuned/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at tituslhy/retrained_llama32-1bn-finetuned into bf16 GGUF format.
The output location will be /home/ubuntu/ideal-palm-tree/notebooks/tituslhy/retrained_llama32-1bn-finetuned/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: retrained_llama32-1bn-finetuned
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: l

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/tituslhy/retrained_llama32-1bn-finetuned
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/retrained_llama32-1bn-finetuned
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q5_K_M.gguf:   0%|          | 0.00/912M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/retrained_llama32-1bn-finetuned


Use the following command to run the quantized model with Ollama from a terminal:

In [9]:
# !ollama run hf.co/tituslhy/retrained_llama32-1bn-finetuned:Q4_K_M

# Evaluate

In [1]:
import warnings
import nest_asyncio

# Patch asyncio so async code can run inside the notebook's already-running event loop.
nest_asyncio.apply()
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
from llama_index.llms.ollama import Ollama
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    LabelledRagDataExample,
    CreatedBy,
)
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

import pandas as pd
import json

# Embedding model served by Ollama; used when building the vector index.
embed_model = OllamaEmbedding(model_name="nomic-embed-text")
# Load every readable file under ../data/ as documents (with a progress bar).
docs = SimpleDirectoryReader("../data/").load_data(show_progress=True)

def get_rag_dataset_from_csv(csv_path: str):
    """Rebuild a LabelledRagDataset from a CSV export.

    Three columns are stored as JSON text in the CSV and are decoded while
    reading: "reference_contexts" via ``json.loads``, and "query_by" /
    "reference_answer_by" via ``CreatedBy.model_validate_json``.

    Args:
        csv_path: Path to the CSV file, one labelled example per row.

    Returns:
        LabelledRagDataset: Dataset holding one LabelledRagDataExample per
        CSV row, in row order.
    """
    frame = pd.read_csv(
        csv_path,
        converters={
            "reference_contexts": json.loads,
            "query_by": CreatedBy.model_validate_json,
            "reference_answer_by": CreatedBy.model_validate_json,
        },
    )

    # One labelled example per row; the converted columns already hold the
    # decoded objects rather than raw JSON strings.
    labelled_examples = [
        LabelledRagDataExample(
            query=record["query"],
            query_by=record["query_by"],
            reference_contexts=record["reference_contexts"],
            reference_answer=record["reference_answer"],
            reference_answer_by=record["reference_answer_by"],
        )
        for _, record in frame.iterrows()
    ]

    return LabelledRagDataset(examples=labelled_examples)

# Labelled hold-out examples the fine-tuned model is evaluated against.
holdout_dataset = get_rag_dataset_from_csv("holdout_dataset.csv")

Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.96file/s]


In [3]:
# Index the loaded documents with the Ollama embedding model, then answer
# queries with the fine-tuned Q4_K_M GGUF model served through Ollama.
# NOTE(review): no request_timeout is passed here, while the judge LLM in the
# evaluator cell uses 120 s — confirm the default is long enough.
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
query_engine = index.as_query_engine(
    similarity_top_k=6, # retrieve the 6 most similar entries per query
    llm = Ollama("hf.co/tituslhy/retrained_llama32-1bn-finetuned:Q4_K_M")
)

In [4]:
from llama_index.core.llama_pack import download_llama_pack

# Download the RagEvaluatorPack into ./pack and get its class; per the output below this also pip-installs the pack.
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

Processing /home/ubuntu/ideal-palm-tree/notebooks/pack
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-index-packs-rag-evaluator
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): started
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): finished with status 'done'
  Created wheel for llama-index-packs-rag-evaluator: filename=llama_index_packs_rag_evaluator-0.3.0-py3-none-any.whl size=4929 sha256=5e36c5b106ef33696bbfafe9ac35b343b71542a4411152f9f15bbac843b5b52a
  Stored in directory: /tmp/pip-ephem-wheel-cache-_me1glji/wheels/c6/58/33/96c14d77e18e70f8cc69a1c8d3e9d046113d6ec987a7a5fd96
Successfully built llama-index-packs-rag


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [5]:
# Evaluate the fine-tuned query engine on the hold-out dataset with an LLM judge.
# The embed_model here is a second OllamaEmbedding with the same model name as
# the one used to build the index.
rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine, 
    rag_dataset=holdout_dataset,
    judge_llm=Ollama("qwen2.5", request_timeout=120.0), # judge with the same LLM that was used to create the dataset
    embed_model=OllamaEmbedding(model_name="nomic-embed-text")
)

In [7]:
# NOTE(review): nest_asyncio was already imported and applied in the first cell
# of the Evaluate section; this repeat looks redundant — confirm it can be removed.
import nest_asyncio

nest_asyncio.apply()

In [9]:
# Run the evaluation asynchronously; top-level `await` relies on the notebook's running event loop.
benchmark_df = await rag_evaluator.arun()

Batch processing of evaluations: 100%|██████████| 7/7.0 [02:42<00:00, 23.17s/it]


In [10]:
benchmark_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,2.692308
mean_relevancy_score,0.692308
mean_faithfulness_score,0.307692
mean_context_similarity_score,0.644645


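Wow, retraining didn't help much :X — on the holdout set the fine-tuned model only reaches a mean correctness score of about 2.69 and a mean faithfulness score of about 0.31.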