# Install

In [None]:
!pip install --upgrade transformers bitsandbytes peft accelerate datasets trl flash_attn vllm

Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting flash_attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting vllm
  Downloading vllm-0.7.2-cp38-abi3-manylinux1_x86_64.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp31

# s1K Fine-Tuning

In [None]:
from datasets import load_dataset
import torch, multiprocessing, sys
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM


# --- Shared settings read by fine_tune() ------------------------------------
# dtype used for the model weights (and as the 4-bit compute dtype for QLoRA).
compute_dtype = torch.bfloat16
# Attention kernel requested from transformers when loading the model.
attn_implementation = 'flash_attention_2'

# --- Training data ----------------------------------------------------------
# s1K examples; fine_tune() trains on the "text" column of this split.
ds = load_dataset("simplescaling/s1K_tokenized", split='train')

# --- Tokenizer (used by the collator below) ---------------------------------
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
# A dedicated padding token/id pair, padded on the left.
tokenizer.pad_token = "<|image_pad|>"
tokenizer.pad_token_id = 151655
tokenizer.padding_side = 'left'

# --- Completion-only collator -----------------------------------------------
# Chat markers that delimit user turns and assistant turns in each example;
# the collator uses them to locate the assistant completions.
instruction_template = "<|im_start|>user"
response_template = "<|im_start|>assistant\n"

collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    instruction_template=instruction_template,
    tokenizer=tokenizer,
    mlm=False,
)


def _bytes_to_gib(num_bytes):
  """Convert a byte count to GiB, rounded to 3 decimals (the precision used in the report)."""
  return round(num_bytes / 1024 / 1024 / 1024, 3)


def fine_tune(model_name, batch_size=1, gradient_accumulation_steps=32, LoRA=False, QLoRA=False, output_dir=None):
  """Fine-tune `model_name` on the module-level dataset `ds` and print a time/GPU-memory report.

  Training mode:
    * ``QLoRA=True``: the base model is loaded 4-bit (NF4, double quantization)
      and prepared for k-bit training; a LoRA adapter is trained on top.
    * ``LoRA=True``: the base model is loaded in ``compute_dtype``; a LoRA
      adapter is trained on top.
    * neither flag: full fine-tuning of the model loaded in ``compute_dtype``.

  Relies on the module-level ``ds``, ``collator``, ``compute_dtype`` and
  ``attn_implementation``.
  NOTE(review): ``collator`` is built from a fixed Qwen2.5 tokenizer; confirm it
  matches the tokenizer of ``model_name`` before training another model family.

  Args:
    model_name: Hub id or local path passed to ``AutoModelForCausalLM.from_pretrained``.
    batch_size: Per-device training batch size.
    gradient_accumulation_steps: Number of batches accumulated per optimizer step.
    LoRA: Train a LoRA adapter on the unquantized model.
    QLoRA: Train a LoRA adapter on the 4-bit quantized model.
    output_dir: Directory for checkpoints. When ``None`` (default) the original
      per-mode directories are used, so existing calls behave as before.

  Returns:
    None. Results are written to ``output_dir`` and the report is printed.
  """

  if QLoRA:
    bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
              model_name, quantization_config=bnb_config, device_map={"": 0}, attn_implementation=attn_implementation
    )
    model = prepare_model_for_kbit_training(model, gradient_checkpointing_kwargs={'use_reentrant':True})
  else:
    model = AutoModelForCausalLM.from_pretrained(
              model_name, device_map={"": 0}, torch_dtype=compute_dtype, attn_implementation=attn_implementation
    )
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

  # An adapter config is only needed for the two adapter-based modes;
  # passing peft_config=None to SFTTrainer means full fine-tuning.
  if LoRA or QLoRA:
    peft_config = LoraConfig(
            lora_alpha=16,
            lora_dropout=0.05,
            r=16,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules= ['k_proj', 'o_proj','q_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj'],
    )
  else:
    peft_config = None

  # Default checkpoint locations (unchanged from the original behaviour).
  # The LoRA default name is fixed and does not reflect `model_name`;
  # pass `output_dir` explicitly to keep runs of different models apart.
  if output_dir is None:
    if LoRA:
      output_dir = "./s1-LoRA-Qwen2.5-14B-Instruct/"
    elif QLoRA:
      output_dir = "./QLoRA/"
    else:
      output_dir = "./FFT/"

  training_arguments = SFTConfig(
          output_dir=output_dir,
          optim="paged_adamw_8bit",
          per_device_train_batch_size=batch_size,
          gradient_accumulation_steps=gradient_accumulation_steps,
          log_level="debug",
          save_strategy="epoch",
          logging_steps=5,
          learning_rate=1e-6,
          bf16 = True,
          num_train_epochs=5,
          weight_decay=1e-4,
          warmup_ratio=0.05,
          lr_scheduler_type="cosine",
          dataset_text_field="text",
          max_seq_length=32000,
          report_to='none'
  )

  trainer = SFTTrainer(
          model=model,
          train_dataset=ds,
          peft_config=peft_config,
          data_collator=collator,
          args=training_arguments,
  )

  #--code by Unsloth: https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=pCqnaKmlO1U9

  # Baseline: peak memory reserved before training starts, so the training
  # overhead can be reported separately afterwards.
  gpu_stats = torch.cuda.get_device_properties(0)
  start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
  max_memory = _bytes_to_gib(gpu_stats.total_memory)
  print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
  print(f"{start_gpu_memory} GB of memory reserved.")

  trainer_ = trainer.train()

  # Peak after training; the difference to the baseline is attributed to training.
  used_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
  used_memory_for_trainer = round(used_memory - start_gpu_memory, 3)
  used_percentage = round(used_memory / max_memory * 100, 3)
  trainer_percentage = round(used_memory_for_trainer / max_memory * 100, 3)
  print(f"{trainer_.metrics['train_runtime']} seconds used for training.")
  print(f"{round(trainer_.metrics['train_runtime']/60, 2)} minutes used for training.")
  print(f"Peak reserved memory = {used_memory} GB.")
  print(f"Peak reserved memory for training = {used_memory_for_trainer} GB.")
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
  print(f"Peak reserved memory for training % of max memory = {trainer_percentage} %.")
  print("-----")
  #----


In [None]:
# LoRA run on Qwen2.5-7B-Instruct: batch size 1 with 32 accumulation steps.
fine_tune(
    "Qwen/Qwen2.5-7B-Instruct",
    batch_size=1,
    gradient_accumulation_steps=32,
    LoRA=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using auto half precision backend
Currently training with a batch size of: 1
***** Running training *****
  Num examples = 1,000
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 32
  Total optimization steps = 155
  Number of trainable parameters = 40,370,176


GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
14.432 GB of memory reserved.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,0.4581
10,0.4901
15,0.5166
20,0.4768
25,0.4883
30,0.4992
35,0.4899
40,0.4764
45,0.4879
50,0.4767


Saving model checkpoint to drive/MyDrive/s1-LoRA-Qwen2.5-14B-Instruct/checkpoint-32
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28/config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 152064
}

tokenizer config file save

10229.2039 seconds used for training.
170.49 minutes used for training.
Peak reserved memory = 37.047 GB.
Peak reserved memory for training = 22.615 GB.
Peak reserved memory % of max memory = 93.655 %.
Peak reserved memory for training % of max memory = 57.171 %.
-----


# Budget Forcing with vLLM

This is the original code from the [s1 repository](https://github.com/simplescaling/s1), modified to support LoRA adapters.

In [None]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from transformers import AutoTokenizer

# Total token budget for the thinking phase (shared across all "Wait" continuations).
MAX_TOKENS_THINKING = 20000
# Decide how often to ignore end-of-thinking token
NUM_IGNORE = 2
# NOTE(review): this base model / adapter pair is not what the training cell above
# produces (that cell fine-tunes Qwen/Qwen2.5-7B-Instruct and writes to
# ./s1-LoRA-Qwen2.5-14B-Instruct/). The adapter must have been trained from the
# base model loaded here -- confirm before running.
adapter_path="s1-LoRA-phi4-14B-Instruct/checkpoint-155"
model = LLM(
    "microsoft/phi-4",
    enable_lora=True,
    max_lora_rank=16,
    tensor_parallel_size=1,
)

s1_adapter = LoRARequest("s1", 1, adapter_path)
# The tokenizer is loaded from the adapter checkpoint directory.
tok = AutoTokenizer.from_pretrained(adapter_path)


def _greedy_sampling_params(max_tokens, stop_token_ids):
    """Build the greedy (temperature 0) sampling parameters used by every generation step."""
    return SamplingParams(
        max_tokens=max_tokens,
        min_tokens=0,
        stop_token_ids=stop_token_ids,
        skip_special_tokens=False,
        temperature=0.0,
    )


stop_token_ids = tok("<|im_end|>")["input_ids"]
sampling_params = _greedy_sampling_params(32768, stop_token_ids)

# For the exact raspberry sample in the paper, change
# model to `qfq/1k_qr_bt_dm_po_steps` (an earlier version of s1)
# & prompt to `How many r in raspberry?`
prompts = [
    "9.11 and 9.9, which is bigger?",
]

for i, p in enumerate(prompts):
    prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + p + "<|im_end|>\n<|im_start|>assistant\n"

    ### Thinking phase ###
    # Stop on either chat marker so the end of thinking can be intercepted.
    stop_token_ids = tok("<|im_start|><|im_end|>")["input_ids"]
    sampling_params = _greedy_sampling_params(MAX_TOKENS_THINKING, stop_token_ids)
    prompt += "<|im_start|>think"
    o = model.generate(
        prompt,
        sampling_params=sampling_params,
        lora_request=s1_adapter
    )
    ignore_str = "Wait"
    max_tokens_thinking_tmp = MAX_TOKENS_THINKING
    # Num of times to skip stop token
    for _ in range(NUM_IGNORE):
        max_tokens_thinking_tmp -= len(o[0].outputs[0].token_ids)
        # Thinking budget used up: SamplingParams rejects max_tokens < 1,
        # so stop forcing further thinking and go to the final answer.
        if max_tokens_thinking_tmp <= 0:
            break
        # Replace the suppressed end-of-thinking with "Wait" to force more reasoning.
        prompt += o[0].outputs[0].text + ignore_str
        sampling_params = _greedy_sampling_params(max_tokens_thinking_tmp, stop_token_ids)
        # Keep using the adapter: without lora_request the base model would answer.
        o = model.generate(
            prompt,
            sampling_params=sampling_params,
            lora_request=s1_adapter
        )

    ### Final answer ###
    prompt += o[0].outputs[0].text
    stop_token_ids = tok("<|im_end|>")["input_ids"]
    sampling_params = _greedy_sampling_params(32768, stop_token_ids)
    o = model.generate(
        prompt,
        sampling_params=sampling_params,
        lora_request=s1_adapter
    )
    print("With budget forcing:")
    print(prompt + o[0].outputs[0].text)

INFO 02-11 03:38:55 __init__.py:190] Automatically detected platform cuda.
INFO 02-11 03:39:10 config.py:542] This model supports multiple tasks: {'generate', 'classify', 'score', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 02-11 03:39:10 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 02-11 03:39:19 model_runner.py:1115] Loading model weights took 14.2487 GB
INFO 02-11 03:39:19 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-11 03:39:24 worker.py:267] Memory profiling takes 4.31 seconds
INFO 02-11 03:39:24 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 02-11 03:39:24 worker.py:267] model weights take 14.25GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 4.35GiB; the rest of the memory reserved for KV Cache is 16.91GiB.
INFO 02-11 03:39:24 executor_base.py:110] # CUDA blocks: 19785, # CPU blocks: 4681
INFO 02-11 03:39:24 executor_base.py:115] Maximum concurrency for 32768 tokens per request: 9.66x
INFO 02-11 03:39:28 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

INFO 02-11 03:40:00 model_runner.py:1562] Graph capturing finished in 33 secs, took 0.35 GiB
INFO 02-11 03:40:00 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 41.35 seconds



Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 41.25 toks/s, output: 39.02 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.24it/s, est. speed input: 159.59 toks/s, output: 65.18 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.90it/s, est. speed input: 290.81 toks/s, output: 63.98 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 23.79it/s, est. speed input: 2886.75 toks/s, output: 23.85 toks/s]

With budget forcing:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
How many r in raspberry?<|im_end|>
<|im_start|>assistant
<|im_start|>think
Raspberry has two "r"s. The word is spelled "raspberry" with the first "r" and the second "r" right after it.Wait, let me double-check the spelling just to be sure.

Raspberry is indeed spelled with two "r"s: "raspberry."Wait, I see the spelling is correct. The word "raspberry" has two "r"s.



