## Install Dependencies

In [None]:
%%capture
!pip install unsloth unsloth_zoo python_dotenv

## Setup Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune and the options used to load it.
model_name = "unsloth/DeepSeek-R1-Distill-Qwen-14B"
max_seq_length = 2048
dtype = None         # None lets the loader auto-detect the dtype.
load_in_4bit = True  # 4-bit quantization keeps memory usage down.

# Load the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/182k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.35G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.78k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

## Add RAG Token

In [None]:
import torch
import gc

def add_new_tokens(model, tokenizer, new_tokens=[], method="mean", interpolation=0.5):
    """Add ``new_tokens`` to ``tokenizer`` and initialise their rows in ``model``.

    The tokenizer is extended, the model's token embeddings are resized to
    ``len(tokenizer)``, and the new rows of both the input-embedding matrix and
    the output (lm_head) matrix are overwritten with an initial value.

    Args:
        model: Object exposing ``get_input_embeddings()``,
            ``get_output_embeddings()``, ``resize_token_embeddings(n)`` and
            ``config.vocab_size`` (``tie_weights()`` is called when present).
        tokenizer: Object exposing ``vocab``, ``__len__``, ``add_tokens(list)``
            and ``__call__(text, add_special_tokens=False).input_ids``.
        new_tokens: Non-empty list or tuple of token strings. Tokens already in
            the vocabulary and repeated entries are skipped. The ``[]`` default
            is never mutated (the name is only rebound) and always fails the
            non-empty assertion.
        method: ``"mean"`` initialises every new row with the mean of the
            pre-existing rows. ``"interpolation"`` blends that mean with the
            mean of the rows of the pieces the token was split into *before*
            it was added.
        interpolation: Weight in ``[0, 1]`` given to the per-token piece mean
            when ``method == "interpolation"``.

    Returns:
        None. ``model`` and ``tokenizer`` are modified in place.

    Raises:
        AssertionError: If an argument fails the validation checks below.
    """
    assert isinstance(new_tokens, (list, tuple))
    assert len(new_tokens) > 0
    assert method in ["mean", "interpolation"]
    assert 0 <= interpolation <= 1

    overlapping_tokens = set(new_tokens) & set(tokenizer.vocab.keys())
    if overlapping_tokens:
        print(f"Unsloth: Skipping overlapping tokens: {list(overlapping_tokens)}")
    # dict.fromkeys de-duplicates while keeping first-seen order. A repeated
    # token is added to the tokenizer only once, so keeping duplicates would
    # break the `old_length + j` row indexing used below.
    new_tokens = [x for x in dict.fromkeys(new_tokens) if x not in overlapping_tokens]

    # Tokenize BEFORE registering the tokens: afterwards each token would map
    # to its own brand-new id, whose row holds no trained information.
    piece_ids = []
    if method == "interpolation":
        piece_ids = [
            list(tokenizer(token, add_special_tokens=False).input_ids)
            for token in new_tokens
        ]

    # Add new tokens to the tokenizer, then grow the model to match.
    old_length = len(tokenizer)
    tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

    embedding_matrix = model.get_input_embeddings().weight
    lm_head_matrix = model.get_output_embeddings().weight

    # no_grad: the weights are leaf parameters, and writing into a leaf that
    # requires grad while autograd is recording raises a RuntimeError.
    with torch.no_grad():
        # Only rows that existed before the resize contribute to the mean; the
        # rows created by the resize hold arbitrary initial values.
        old_embeddings = embedding_matrix[:old_length]
        old_lm_head = lm_head_matrix[:old_length]

        # Rows whose largest entry is <= eps are treated as untrained and are
        # left out of the mean for both matrices.
        eps = 1e-16
        indicator_untrained = torch.amax(old_embeddings, dim=1) <= eps
        where_untrained = torch.where(indicator_untrained)[0]
        n_untrained = where_untrained.shape[0]
        # max(..., 1) avoids a division by zero when no row counts as trained.
        n_trained = max(old_embeddings.shape[0] - n_untrained, 1)

        # Accumulate in float32 so low-precision weights do not lose accuracy.
        sum_embedding = (
            old_embeddings.sum(dim=0, dtype=torch.float32)
            - old_embeddings[where_untrained].sum(dim=0, dtype=torch.float32)
        )
        sum_lm_head = (
            old_lm_head.sum(dim=0, dtype=torch.float32)
            - old_lm_head[where_untrained].sum(dim=0, dtype=torch.float32)
        )
        mean_embedding = sum_embedding / n_trained
        mean_lm_head = sum_lm_head / n_trained

        if method == "interpolation":
            print("Using interpolation for initializing new tokens.")
            for j, input_ids in enumerate(piece_ids):
                if len(input_ids) == 0:
                    # Nothing to interpolate with; fall back to the global mean.
                    emb, head = mean_embedding, mean_lm_head
                else:
                    token_mean_emb = embedding_matrix[input_ids].mean(dim=0, dtype=torch.float32)
                    token_mean_head = lm_head_matrix[input_ids].mean(dim=0, dtype=torch.float32)
                    emb = mean_embedding * (1 - interpolation) + token_mean_emb * interpolation
                    head = mean_lm_head * (1 - interpolation) + token_mean_head * interpolation

                embedding_matrix[old_length + j] = emb
                lm_head_matrix[old_length + j] = head
        else:
            embedding_matrix[old_length:] = mean_embedding
            lm_head_matrix[old_length:] = mean_lm_head

    model.config.vocab_size = len(tokenizer)
    if hasattr(model, "tie_weights"):
        model.tie_weights()

    # Release temporaries created above.
    for _ in range(3):
        gc.collect()
        torch.cuda.empty_cache()
    print(f"✅ Added {len(new_tokens)} new tokens to the tokenizer and model.")


In [None]:
# The notebook-local add_new_tokens defined in the previous cell is used here
# (rather than `from unsloth import add_new_tokens`).

# Special tokens that delimit retrieval queries and retrieval results.
special_tokens = {
    "<search_query>",
    "</search_query>",
    "<search_result>",
    "</search_result>"
}

# Keep only the tokens the tokenizer does not know yet. The set difference has
# no defined order (string hashing differs between interpreter sessions), and
# the order decides which id each token receives, so sort to make the id
# assignment identical on every run.
new_tokens = sorted(special_tokens - set(tokenizer.vocab.keys()))

if new_tokens:
    print("🛠️ Adding new tokens...")

    add_new_tokens(model, tokenizer, new_tokens=new_tokens)

    print(f"✅ Successfully added {len(new_tokens)} new tokens!")
else:
    print("ℹ️ No new tokens to add.")


🛠️ Adding new tokens...
✅ Added 4 new tokens to the tokenizer and model.
✅ Successfully added 4 new tokens!


## Data Prep

We start by mounting the drive so that we can access the dataset we'll be using for fine-tuning.

In [None]:
# Mount Google Drive at /content/drive; the dataset, the .env file and the
# training output directory used later in this notebook all live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


We define a prompt template with placeholders for the question, the chain of thought, and the final response. The prompt instructs the model to reason step by step, to request retrieval through the search tags when needed, and to provide a logical, accurate answer.

In [None]:
# Training prompt template. The three `{}` placeholders are filled positionally
# via str.format with, in order: the question, the chain of thought (placed
# between the <think> tags) and the final response.
training_system_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.

Write a response that appropriately completes the request.

Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

If the answer relies on obscure concepts, knowledge unlikely to be in pretraining, or time-sensitive facts, trigger retrieval by:
1. Output a search query between <search_query> and </search_query> tags
2. Wait for search results between <search_result> and </search_result>
3. Use these results to complete your answer

Multiple retrievals may be included in a single Chain of Thought, including multiple lookups for the same topic (e.g., clarification followed by fact checking).

### Instruction:
You are a scientific expert with advanced knowledge in analytical reasoning, problem-solving, and quantitative analysis across various disciplines, including mathematics, physics, biology, chemistry, and engineering. Please answer the following question.

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""

We load in our dataset from Google Drive.

In [None]:
from datasets import load_dataset

# Load the JSON file stored on Google Drive, requesting its "train" split.
dataset = load_dataset("json", data_files="/content/drive/MyDrive/SNLP/dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

We now format the dataset to fit our prompting style.

In [None]:
# End-of-sequence marker appended to every rendered training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of records into complete training prompts.

    ``examples`` is a batch mapping that provides the columns "Question",
    "CoT" and "Response". Each record is inserted into
    ``training_system_prompt`` and terminated with ``EOS_TOKEN``.

    Returns a dict with a single "text" column holding the rendered prompts.
    """
    records = zip(examples["Question"], examples["CoT"], examples["Response"])
    return {
        "text": [
            training_system_prompt.format(question, cot, answer) + EOS_TOKEN
            for question, cot, answer in records
        ]
    }

# Apply the formatter batch-wise to obtain the dataset used for fine-tuning.
dataset_finetune = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/998 [00:00<?, ? examples/s]

## Setup LoRA Adapters

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    # --- LoRA hyper-parameters -------------------------------------------
    r=16,               # Rank of the LoRA update matrices; lower rank -> fewer trainable parameters.
    lora_alpha=16,      # Scaling factor applied to the LoRA update.
    lora_dropout=0,     # No dropout is applied in the adapter path.
    bias="none",        # Bias terms are not trained.
    use_rslora=True,    # Use the rank-stabilized LoRA scaling variant.
    loftq_config=None,  # No LoftQ initialisation is configured.
    # --- Which modules are adapted / trained -------------------------------
    # Attention and MLP projections, plus lm_head and embed_tokens.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head", "embed_tokens",
    ],
    # lm_head and embed_tokens are also listed here; presumably so the rows of
    # the newly added special tokens can be trained - confirm against the
    # Unsloth/PEFT handling of names that appear in both lists.
    modules_to_save=["lm_head", "embed_tokens"],
    # --- Memory / reproducibility ------------------------------------------
    use_gradient_checkpointing="unsloth",  # Recompute activations in the backward pass instead of storing them.
    random_state=3407,                     # Fixed seed for reproducibility.
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.3.19 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


## Fine-tune Model

This cell configures an `SFTTrainer` with the training arguments listed below and points it at a timestamped output directory on Google Drive, so each run gets its own directory.









In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datetime import datetime

# Per-run output directory on Google Drive: the model name (with "/" and "-"
# replaced by "_") followed by the current timestamp.
output_dir = "/content/drive/MyDrive/SNLP/{}_{}".format(
    model_name.replace("/", "_").replace("-", "_"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # Data: the pre-rendered prompts live in the "text" column.
    train_dataset=dataset_finetune,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        output_dir=output_dir,
        report_to="none",
        seed=3407,
        logging_steps=1,
        # Effective batch size = 2 x 4 = 8 sequences per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Both limits are set; per the Transformers documentation a positive
        # max_steps takes precedence over num_train_epochs.
        num_train_epochs=1,
        max_steps=65,
        # Optimizer and schedule.
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_steps=5,
        # Use bf16 when the hardware supports it, otherwise fp16.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/998 [00:00<?, ? examples/s]

Train the model.

In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 998 | Num Epochs = 1 | Total steps = 65
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 1,621,903,360/14,000,000,000 (11.59% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.6996
2,1.6862
3,1.5887
4,1.5773
5,1.2413
6,1.0329
7,0.9737
8,0.849
9,0.8942
10,0.7166




## Upload to HuggingFace

In [None]:
# Show the Hugging Face login widget so the uploads in the next cell can
# authenticate against the Hub.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the fine-tuned model and the tokenizer (which now contains the added
# search tokens) to two separate Hub repositories.
model.push_to_hub("canertugrul/DeepSeek-R1-Distill-Qwen-14B-Tool-Use-Adapter_v3")
tokenizer.push_to_hub("canertugrul/DeepSeek-R1-Distill-Qwen-14B-Tool-Use-Tokenizer_v3")

README.md:   0%|          | 0.00/630 [00:00<?, ?B/s]



  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/9.59G [00:00<?, ?B/s]

Saved model to https://huggingface.co/canertugrul/DeepSeek-R1-Distill-Qwen-14B-Tool-Use-Adapter_v3


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

## Inference on Trained Model

In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel
import torch

from dotenv import load_dotenv
import os

# Load environment variables from the .env file stored on Google Drive.
dotenv_path = "/content/drive/MyDrive/SNLP/.env"
load_dotenv(dotenv_path = dotenv_path)

# Read the Hugging Face token from the environment (None when the variable is unset).
# NOTE(review): the token is not used in the visible part of this notebook - confirm it is needed.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Inference prompt template. It repeats the instructions of the training prompt
# and adds a fixed "Today's date" line. The two `{}` placeholders are filled
# positionally via str.format with the question and the start of the response.
# NOTE(review): unlike the training template, no <think> tag follows
# "### Response:" here - confirm this difference is intended.
inference_system_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.

Write a response that appropriately completes the request.

Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

If the answer relies on obscure concepts, knowledge unlikely to be in pretraining, or time-sensitive facts, trigger retrieval by:
1. Output a search query between <search_query> and </search_query> tags
2. Wait for search results between <search_result> and </search_result>
3. Use these results to complete your answer

Multiple retrievals may be included in a single Chain of Thought, including multiple lookups for the same topic (e.g., clarification followed by fact checking).

Today's date is April 1, 2025.

### Instruction:
You are a scientific expert with advanced knowledge in analytical reasoning, problem-solving, and quantitative analysis across various disciplines, including mathematics, physics, biology, chemistry, and engineering. Please answer the following question.

### Question:
{}

### Response:
{}
"""

question = "What is the difference in inflation rate between December 2024 and January 2025?"

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template (leaving the response slot empty) and tokenize it as a
# batch of one, placed on the same device as the model.
inputs = tokenizer(
    [inference_system_prompt.format(question, "")],
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=2048,
    use_cache=True,
)

# Decode every returned sequence (prompt included) and print the first one.
response = tokenizer.batch_decode(outputs)

print(response[0])

<｜begin▁of▁sentence｜>Below is an instruction that describes a task, paired with an input that provides further context.

Write a response that appropriately completes the request.

Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

If the answer relies on obscure concepts, knowledge unlikely to be in pretraining, or time-sensitive facts, trigger retrieval by:
1. Output a search query between <search_query> and </search_query> tags
2. Wait for search results between <search_result> and </search_result>
3. Use these results to complete your answer

Multiple retrievals may be included in a single Chain of Thought, including multiple lookups for the same topic (e.g., clarification followed by fact checking).

Today's date is April 1, 2025.

### Instruction:
You are a scientific expert with advanced knowledge in analytical reasoning, problem-solving, and quantitative analysis across various disciplines