In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

raw_ds = load_dataset(
    "json",
    data_files="raft_train.jsonl",
    split="train"
)

# Optionally: split into train/test
splits = raw_ds.train_test_split(test_size=0.1)
train_ds = splits["train"]
eval_ds  = splits["test"]

In [3]:
import pandas as pd

df = pd.DataFrame(raw_ds)
df.head()

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,seed_task_0,general,How many learners participated in the study?,{'sentences': [['Table 4: Local Average Treatm...,The value of non-traditional credentials in th...,assistant: To determine how many learners part...,<DOCUMENT>Table 4: Local Average Treatment Eff...
1,seed_task_1,general,What platforms were used to deliver the courses?,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: Step 1: Identify relevant informati...,<DOCUMENT>The value of non-traditional credent...
2,seed_task_2,general,What was the main finding regarding credential...,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: ### Step-by-Step Reasoning:\n1. **I...,<DOCUMENT>The value of non-traditional credent...
3,seed_task_3,general,Did the intervention have a significant impact...,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: Step-by-step reasoning:\n\n1. The q...,<DOCUMENT>The value of non-traditional credent...
4,seed_task_4,general,How did the effect vary among different groups...,{'sentences': [['13 p.p.) and 36 p.p. (S.E. 12...,The value of non-traditional credentials in th...,assistant: Step-by-step reasoning:\n\n1. The s...,<DOCUMENT>13 p.p.) and 36 p.p. (S.E. 12 p.p.) ...


In [4]:
train_ds, eval_ds

(Dataset({
     features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction'],
     num_rows: 296
 }),
 Dataset({
     features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction'],
     num_rows: 33
 }))

In [5]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-20 06:21:45 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-20 06:21:45 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.184 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 2025,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


## Formatting the prompts
We need to put everything together into a single 'text' field for the LLM to be trained on

In [7]:
def formatting_prompts_func(examples):
    """Define a formatter that injects the retrieved context:"""
    
    texts = []
    for ctx, oracle, instr, ans in zip(
        examples["context"],
        examples["oracle_context"],
        examples["instruction"],
        examples["cot_answer"]
    ):
        # You can choose to use `oracle_context` (gold) vs. `context` (retrieved)
        # Here we show both, but you could just use `context`.
        prompt = (
            "### Retrieved Passages:\n"
            f"{ctx}\n\n"
            "### (Oracle Passages):\n"
            f"{oracle}\n\n"
            "### Question:\n"
            f"{instr}\n\n"
            "### Answer:\n"
        )
        # Append the gold answer plus EOS
        texts.append(prompt + ans + tokenizer.eos_token)
    return {"text": texts}

# then:
train_ds = train_ds.map(formatting_prompts_func, batched=True)
eval_ds = eval_ds.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Here's an example of the text we are going to train our LLM on!

In [8]:
from IPython.display import display, Markdown

display(Markdown(pd.DataFrame(train_ds).head()['text'].iloc[0]))

### Retrieved Passages:
{'sentences': [['• If the time difference is greater than or equal to 0 months, it implies a recent career devel-\nopment, leading to further analysis:\n– If “Intern” is found in the current job title, the outcome is classified as a new internship.\n– If the employer of the current job differs from the employer of the previous job, it\nindicates a change in job roles, leading to the classification of new job.\n', 'The value of non-traditional credentials in the labor market*\nSusan Athey & Emil Palikot\nMay 2, 2024\nAbstract\nThis study investigates the labor market value of credentials obtained from Massive Open On-\nline Courses (MOOCs) and shared on business networking platforms. We conducted a random-\nized experiment involving more than 800,000 learners, primarily from developing countries and\nwithout college degrees, who completed technology or business-related courses on the Coursera\nplatform between September 2022 and March 2023. The intervention targeted learners who had\nrecently completed their courses, encouraging them to share their credentials and simplifying the\nsharing process. One year after the intervention, we collected data from LinkedIn profiles of ap-\nproximately 40,000 experimental subjects. We find that the intervention leads to an increase of 17\npercentage points for credential sharing. Further, learners in the treatment group were 6% more\nlikely to report new employment within a year, with an 8% increase in jobs related to their certifi-\ncates. This effect was more pronounced among LinkedIn users with lower baseline employability.\nAcross the entire sample, the treated group received a higher number of certificate views, indicat-\ning an increased interest in their profiles. These results suggest that facilitating credential sharing\nand reminding learners of the value of skill signaling can yield significant gains. When the ex-\nperiment is viewed as an encouragement design for credential sharing, we can estimate the local\naverage treatment effect (LATE) of credential sharing (that is, the impact of credential sharing on\nthe workers induced to share by the intervention) for the outcome of getting a job. The LATE esti-\nmates are imprecise but large in magnitude; they suggest that credential sharing more than doubles\nthe baseline probability of getting a new job in scope for the credential.\n*We thank Eric Karsten and his team in Coursera for collaborating on this project. ', '(2018). Can mooc programs improve student employment\nprospects? Available at SSRN 3260695.\nHansen, J. D. and Reich, J. (2015). Democratizing education? examining access and usage patterns in\nmassive open online courses. Science, 350(6265):1245–1248.\nHussey, A. (2012). Human capital augmentation versus the signaling value of mba education. Eco-\nnomics of Education Review, 31(4):442–451.\n19']], 'title': [['placeholder_title', 'placeholder_title', 'placeholder_title']]}

### (Oracle Passages):
• If the time difference is greater than or equal to 0 months, it implies a recent career devel-
opment, leading to further analysis:
– If “Intern” is found in the current job title, the outcome is classified as a new internship.
– If the employer of the current job differs from the employer of the previous job, it
indicates a change in job roles, leading to the classification of new job.


### Question:
<DOCUMENT>• If the time difference is greater than or equal to 0 months, it implies a recent career devel-
opment, leading to further analysis:
– If “Intern” is found in the current job title, the outcome is classified as a new internship.
– If the employer of the current job differs from the employer of the previous job, it
indicates a change in job roles, leading to the classification of new job.
</DOCUMENT>
<DOCUMENT>The value of non-traditional credentials in the labor market*
Susan Athey & Emil Palikot
May 2, 2024
Abstract
This study investigates the labor market value of credentials obtained from Massive Open On-
line Courses (MOOCs) and shared on business networking platforms. We conducted a random-
ized experiment involving more than 800,000 learners, primarily from developing countries and
without college degrees, who completed technology or business-related courses on the Coursera
platform between September 2022 and March 2023. The intervention targeted learners who had
recently completed their courses, encouraging them to share their credentials and simplifying the
sharing process. One year after the intervention, we collected data from LinkedIn profiles of ap-
proximately 40,000 experimental subjects. We find that the intervention leads to an increase of 17
percentage points for credential sharing. Further, learners in the treatment group were 6% more
likely to report new employment within a year, with an 8% increase in jobs related to their certifi-
cates. This effect was more pronounced among LinkedIn users with lower baseline employability.
Across the entire sample, the treated group received a higher number of certificate views, indicat-
ing an increased interest in their profiles. These results suggest that facilitating credential sharing
and reminding learners of the value of skill signaling can yield significant gains. When the ex-
periment is viewed as an encouragement design for credential sharing, we can estimate the local
average treatment effect (LATE) of credential sharing (that is, the impact of credential sharing on
the workers induced to share by the intervention) for the outcome of getting a job. The LATE esti-
mates are imprecise but large in magnitude; they suggest that credential sharing more than doubles
the baseline probability of getting a new job in scope for the credential.
*We thank Eric Karsten and his team in Coursera for collaborating on this project. </DOCUMENT>
<DOCUMENT>(2018). Can mooc programs improve student employment
prospects? Available at SSRN 3260695.
Hansen, J. D. and Reich, J. (2015). Democratizing education? examining access and usage patterns in
massive open online courses. Science, 350(6265):1245–1248.
Hussey, A. (2012). Human capital augmentation versus the signaling value of mba education. Eco-
nomics of Education Review, 31(4):442–451.
19</DOCUMENT>
What does a time difference greater than or equal to 0 months indicate?

### Answer:
assistant: Step-by-step reasoning:
1. The question asks about what a time difference greater than or equal to 0 months indicates.
2. According to the given context, this condition (time difference ≥ 0 months) implies a recent career development. ##begin_quote##If the time difference is greater than or equal to 0 months, it implies a recent career development, leading to further analysis:##end_quote##
3. The context then provides two specific scenarios for such cases:
   - If "Intern" is found in the current job title, it's classified as a new internship.
   - If the employer of the current job differs from that of the previous job, it indicates a change in job roles and is thus classified as a new job.

Final answer: <ANSWER>: A time difference greater than or equal to 0 months indicates recent career development, which could be either a new internship (if "Intern" is in the current job title) or a new job due to a change in employer.<|eot_id|>

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="llama32_1bn_raft", #This will also be used as your huggingfacehub model id name
    report_to="wandb", #Leave this to be blank if you don't want to use wandb
    run_name="RAFT_SFT_Take6",
    eval_steps=5,
    eval_strategy="steps",
    per_device_train_batch_size=1,    # small batches if quantized
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=5,
    # max_steps=60,                    # or set num_train_epochs
    save_strategy="no",
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    logging_strategy="steps",
    logging_steps=5,
    seed=42,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds,
    eval_dataset = eval_ds, 
    args=training_args,
    dataset_text_field="text",
    
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/296 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/33 [00:00<?, ? examples/s]

Current memory statistics

In [10]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A10G. Max memory = 22.184 GB.
1.457 GB of memory reserved.


In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 296 | Num Epochs = 5 | Total steps = 185
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)
[34m[1mwandb[0m: Currently logged in as: [33mtituslhy[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
5,1.5962,1.565817
10,1.5708,1.551308
15,1.5847,1.529841
20,1.6046,1.50604
25,1.4844,1.481573
30,1.4919,1.458403
35,1.4561,1.435972
40,1.4375,1.414349
45,1.3756,1.392648
50,1.4543,1.372671


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Used memory statistics

In [12]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

617.3861 seconds used for training.
10.29 minutes used for training.
Peak reserved memory = 2.156 GB.
Peak reserved memory for training = 0.699 GB.
Peak reserved memory % of max memory = 9.719 %.
Peak reserved memory for training % of max memory = 3.151 %.


In [13]:
model.save_pretrained_merged(
    save_directory = "llama32_1bn_raft_merged",     # Local path to store merged model
    tokenizer = tokenizer,
    save_method = "merged_16bit",        # Can also use "merged_4bit" or "merged_8bit" if needed
)

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.33 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 67.37it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving llama32_1bn_raft_merged/pytorch_model.bin...
Done.


In [14]:
import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

In [17]:
model.push_to_hub_merged(
    repo_id="tituslhy/llama32_1bn_raft_non_traditional_credentials",
    tokenizer=tokenizer,
    save_method="merged_16bit",
    token=os.environ["HUGGINGFACE_ACCESS_TOKEN"]
)

Unsloth: You are pushing to hub, but you passed your HF username = tituslhy.
We shall truncate tituslhy/llama32_1bn_raft_non_traditional_credentials to llama32_1bn_raft_non_traditional_credentials


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.17 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 101.20it/s]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving llama32_1bn_raft_non_traditional_credentials/pytorch_model.bin...


README.md:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/tituslhy/llama32_1bn_raft_non_traditional_credentials


You don't have to run this cell if your llama.cpp gguf execution (`model.push_to_hub_gguf`) works! 
> I had to because it could not find where the llama-quantize binary was.

In [18]:
# ① Point at the real binary in build/bin
real_q = os.path.expanduser("~/llama.cpp/build/bin/llama-quantize")
assert os.path.exists(real_q), f"{real_q} not found!"

# ② Make a local 'llama.cpp' folder in your notebook working directory
cwd = os.getcwd()
local_pack = os.path.join(cwd, "llama.cpp")
os.makedirs(local_pack, exist_ok=True)

# ③ Symlink it as 'llama-quantize' and also as 'quantize'
for name in ("llama-quantize", "quantize"):
    link = os.path.join(local_pack, name)
    if os.path.exists(link) or os.path.islink(link):
        os.remove(link)
    os.symlink(real_q, link)

# ④ Verify
print("Notebook sees:", os.listdir(local_pack))

Notebook sees: ['.github', 'CODEOWNERS', 'pyproject.toml', 'README.md', 'gguf-py', 'ggml', '.clang-tidy', '.pre-commit-config.yaml', 'examples', 'tests', 'convert_llama_ggml_to_gguf.py', 'cmake', '.gitignore', 'CMakeLists.txt', 'build-xcframework.sh', 'scripts', 'Makefile', 'pocs', 'pyrightconfig.json', 'poetry.lock', 'convert_hf_to_gguf_update.py', 'src', 'docs', 'convert_hf_to_gguf.py', 'mypy.ini', 'llama-quantize', 'CONTRIBUTING.md', 'models', '.git', '.dockerignore', 'AUTHORS', 'requirements.txt', 'licenses', '.clang-format', 'flake.nix', 'prompts', 'tools', '.ecrc', '.flake8', 'grammars', '.devops', 'media', '.editorconfig', 'SECURITY.md', 'LICENSE', 'include', 'requirements', 'flake.lock', 'CMakePresets.json', 'ci', 'build', 'common', '.gitmodules', 'convert_lora_to_gguf.py', 'quantize']


In [19]:
model.push_to_hub_gguf(
    "tituslhy/llama32_1bn_raft_non_traditional_credentials", # Change hf to your username!
    tokenizer,
    quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
    token = os.environ["HUGGINGFACE_ACCESS_TOKEN"], # Get a token at https://huggingface.co/settings/tokens
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.45 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 102.57it/s]

Unsloth: Saving tokenizer...




 Done.
Unsloth: Saving tituslhy/llama32_1bn_raft_non_traditional_credentials/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at tituslhy/llama32_1bn_raft_non_traditional_credentials into bf16 GGUF format.
The output location will be /home/ubuntu/ideal-palm-tree/notebooks/tituslhy/llama32_1bn_raft_non_traditional_credentials/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama32_1bn_raft_non_traditional_credentials
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F3

unsloth.Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/tituslhy/llama32_1bn_raft_non_traditional_credentials
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/llama32_1bn_raft_non_traditional_credentials
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/912M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/llama32_1bn_raft_non_traditional_credentials


Test to see if we can pull our model using Ollama

In [None]:
!ollama pull hf.co/tituslhy/llama32_1bn_raft_non_traditional_credentials:Q4_K_M

In [20]:
from llama_index.llms.ollama import Ollama

llm = Ollama(
    "hf.co/tituslhy/llama32_1bn_raft_non_traditional_credentials:Q4_K_M"
)

llm.complete("Hi!")

CompletionResponse(text='Hi! How can I help you today? Do you have a question or need assistance with something.', additional_kwargs={'tool_calls': []}, raw={'model': 'hf.co/tituslhy/llama32_1bn_raft_non_traditional_credentials:Q4_K_M', 'created_at': '2025-05-20T06:47:35.098009168Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2904112368, 'load_duration': 2137232750, 'prompt_eval_count': 12, 'prompt_eval_duration': 644078380, 'eval_count': 21, 'eval_duration': 122027466, 'message': Message(role='assistant', content='Hi! How can I help you today? Do you have a question or need assistance with something.', images=None, tool_calls=None), 'usage': {'prompt_tokens': 12, 'completion_tokens': 21, 'total_tokens': 33}}, logprobs=None, delta=None)