# Post Training - Supervised Fine-Tuning (SFT)

In [1]:
# Warning control: install a global "ignore" filter so that no Python warning
# is shown in any later cell output.
# NOTE(review): this hides every warning category, including ones that may
# matter (e.g. deprecations) — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

## Import libraries

In [2]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig

## Setting up helper functions

In [3]:
def generate_responses(model, tokenizer, user_message, system_message=None, 
                       max_new_tokens=100):
    """Generate one assistant reply for a single-turn chat using greedy decoding.

    Args:
        model: Causal LM exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        user_message: Text of the user turn.
        system_message: Optional system prompt; skipped when falsy
            (``None`` or an empty string).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped), with leading/trailing whitespace stripped.
    """
    # Conversation = optional system turn followed by the single user turn.
    messages = (
        [{"role": "system", "content": system_message}] if system_message else []
    )
    messages.append({"role": "user", "content": user_message})

    # Render the conversation to a raw prompt string with the tokenizer's chat
    # template; add_generation_prompt appends the cue for the assistant turn.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # extra flag forwarded to the chat template
    )

    # Tokenize the rendered prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding -> deterministic output
        pad_token_id=tokenizer.eos_token_id,  # EOS doubles as the padding id
        eos_token_id=tokenizer.eos_token_id,
    )

    # Inference only: no gradients are needed, which saves memory.
    # (For faster inference, dedicated engines such as vLLM, SGLang or
    # TensorRT are recommended.)
    with torch.no_grad():
        # **inputs expands to input_ids=..., attention_mask=...
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; slice off the prompt so only
    # the newly generated tokens of the first (only) sequence remain.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = outputs[0][prompt_length:]

    decoded = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    return decoded.strip()

In [4]:
def test_model_with_questions(model, tokenizer, questions, 
                              system_message=None, title="Model Output"):
    """Print the model's answer to each question under a titled banner.

    Args:
        model: Model forwarded to ``generate_responses``.
        tokenizer: Tokenizer forwarded to ``generate_responses``.
        questions: Iterable of user prompts, answered one at a time.
        system_message: Optional system prompt applied to every question.
        title: Banner text printed before the results
            (e.g. "Base Model (Before SFT) Output").

    Returns:
        None. Results are written to stdout only.
    """
    # Banner so runs for different models are easy to tell apart in the output.
    print(f"\n=== {title} ===")

    # Human-friendly numbering: the first question is reported as 1, not 0.
    i = 0
    for question in questions:
        i += 1
        # Each question is an independent single-turn chat; generation length
        # is left at generate_responses' default.
        response = generate_responses(
            model,
            tokenizer,
            question,
            system_message,
        )
        # Show the prompt next to the answer it produced.
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

In [5]:
def load_model_and_tokenizer(model_name, use_gpu = False):
    """Load a causal language model and its tokenizer, ready for chat prompting.

    Args:
        model_name: Local path or Hugging Face Hub id handed to
            ``from_pretrained`` for both the tokenizer and the model.
        use_gpu: When True, move the model to CUDA if a CUDA device is
            available; otherwise the model stays on CPU and a notice is printed.

    Returns:
        Tuple ``(model, tokenizer)``. The tokenizer is guaranteed to have a
        chat template and a pad token after this call.
    """
    # Tokenizer and decoder-only (GPT-style) model come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Only move to CUDA when it is both requested AND actually available;
    # calling .to("cuda") without a CUDA device would raise.
    if use_gpu:
        if torch.cuda.is_available():
            model.to("cuda")
        else:
            print("Warning: use_gpu=True but CUDA is not available; "
                  "keeping the model on CPU.")

    # Base checkpoints may ship without a chat template; install a simple
    # System/User/Assistant one so apply_chat_template works.
    # The string is kept exactly as-is because prompt formatting depends on it.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [20]:
def display_dataset(dataset, num_examples=3):
    """Print and tabulate the first few chat examples of a dataset.

    Args:
        dataset: Sized, integer-indexable collection whose items contain a
            ``'messages'`` list of ``{'role': ..., 'content': ...}`` dicts.
        num_examples: How many leading examples to show (default 3). Clamped
            to ``len(dataset)`` so short datasets do not raise ``IndexError``.

    Returns:
        None. Each example is printed, then a DataFrame with the columns
        'User Prompt' and 'Assistant Response' is rendered with ``display``.

    Raises:
        StopIteration: If a shown example has no 'user' or no 'assistant'
            message.
    """
    rows = []
    # Never index past the end of a dataset shorter than num_examples.
    shown = min(num_examples, len(dataset))
    for i in range(shown):
        example = dataset[i]
        print("*"*20)
        print(f"example {i} : {example}\n")
        # First user turn and first assistant turn of the conversation.
        user_msg = next(m['content'] for m in example['messages']
                        if m['role'] == 'user')
        print(f"user_msg: {user_msg} \n")
        assistant_msg = next(m['content'] for m in example['messages']
                             if m['role'] == 'assistant')
        print(f"assistant_msg: {assistant_msg} \n")

        # One table row per example.
        rows.append({
            'User Prompt': user_msg,
            'Assistant Response': assistant_msg
        })
        print(f"rows: {rows} \n\n")

    # Build the table once from the collected records.
    df = pd.DataFrame(rows)

    # Show full cell text instead of truncating long strings.
    # Note: this changes the pandas display option globally, not just here.
    pd.set_option('display.max_colwidth', None)
    display(df)

## Load base model & test on simple questions

In [7]:
# Global device switch for the notebook: passed as `use_gpu` to
# load_model_and_tokenizer, and also decides whether the training set is
# cut down to a small subset further below.
USE_GPU = False

# Fixed probe prompts reused for every model in this notebook, so the outputs
# before and after SFT can be compared on identical inputs.
questions = [
    "Give me an 1-sentence introduction of LLM.",
    "Calculate 1+1-1",
    "What's the difference between thread and process?"
]

In [8]:
# Load the base (not fine-tuned) checkpoint from the local models directory.
model, tokenizer = load_model_and_tokenizer("./models/Qwen/Qwen3-0.6B-Base", USE_GPU)

# Baseline run: how the base model answers the probe questions before SFT.
test_model_with_questions(model, tokenizer, questions, 
                          title="Base Model (Before SFT) Output")

# Remove the global names; presumably so the model's memory can be reclaimed
# before the next checkpoint is loaded.
del model, tokenizer


=== Base Model (Before SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ �


Model Input 2:
Calculate 1+1-1
Model Output 2:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �


Model Input 3:
What's the difference between thread and process?
Model Output 3:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �



In [9]:
# Demo: enumerate() without a start argument numbers the items from 0.
for i, question in enumerate(questions):
    print(i)
    print(question)

0
Give me an 1-sentence introduction of LLM.
1
Calculate 1+1-1
2
What's the difference between thread and process?


In [10]:
# Demo: a start argument of 1 numbers the same items from 1 instead of 0.
for i, question in enumerate(questions,1):
    print(i)
    print(question)

1
Give me an 1-sentence introduction of LLM.
2
Calculate 1+1-1
3
What's the difference between thread and process?


### `enumerate(questions)` vs `enumerate(questions, 1)`

When using the `enumerate()` function in Python, you can control where the index starts from by providing a second argument.

---

#### Example 1: `enumerate(questions, 1)`

```python
for i, question in enumerate(questions, 1):
    print(i)
    print(question)
```

**Output:**

```
1
Give me an 1-sentence introduction of LLM.
2
Calculate 1+1-1
3
What's the difference between thread and process?
```

* The index `i` starts from **1**.
* Useful for **human-readable output**, like "Question 1", "Model Output 2", etc.

---

#### Example 2: `enumerate(questions)`

```python
for i, question in enumerate(questions):
    print(i)
    print(question)
```

**Output:**

```
0
Give me an 1-sentence introduction of LLM.
1
Calculate 1+1-1
2
What's the difference between thread and process?
```

* The index `i` starts from **0** (default behavior).
* Useful when zero-based indexing matters (e.g., list positions, array access).

---

### Key Points

| Expression                | Index Starts From | Skips Anything? | Use Case                       |
| ------------------------- | ----------------- | --------------- | ------------------------------ |
| `enumerate(questions)`    | 0                 | ❌ No            | Programming logic, loops       |
| `enumerate(questions, 1)` | 1                 | ❌ No            | Human-friendly display/logging |

> Changing the start index does **not** skip or change the list items — it only affects the index value you get during iteration.

## SFT results on Qwen3-0.6B model

This section reviews the results of an SFT run that was completed beforehand. Because of limited compute, we do not rerun the full training for a relatively large model such as Qwen3-0.6B here. 
Instead, the next section of this notebook walks through the complete training process using a smaller model and a lightweight dataset.

In [11]:
# Load the previously fine-tuned (SFT) checkpoint from the local models directory.
model, tokenizer = load_model_and_tokenizer("./models/banghua/Qwen3-0.6B-SFT", USE_GPU)

# Same probe questions as the baseline run, so the outputs are directly comparable.
test_model_with_questions(model, tokenizer, questions, 
                          title="Base Model (After SFT) Output")

# Remove the global names; presumably so the model's memory can be reclaimed
# before the small model is loaded in the next section.
del model, tokenizer


=== Base Model (After SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
LLM is a program that provides advanced legal knowledge and skills to professionals and individuals.


Model Input 2:
Calculate 1+1-1
Model Output 2:
1+1-1 = 2-1 = 1

So, the final answer is 1.


Model Input 3:
What's the difference between thread and process?
Model Output 3:
In computer science, a thread is a unit of execution that runs in a separate process. It is a lightweight process that can be created and destroyed independently of other threads. Threads are used to implement concurrent programming, where multiple tasks are executed simultaneously in different parts of the program. Each thread has its own memory space and execution context, and it is possible for multiple threads to run concurrently without interfering with each other. Threads are also known as lightweight processes.



## Doing SFT on a small model

In this section we run SFT ourselves on a small model, <code>HuggingFaceTB/SmolLM2-135M</code>, using a reduced training dataset (only the first 100 examples when no GPU is used).

In [12]:
# Local checkpoint path of the small model that is fine-tuned in the cells below.
model_name = "./models/HuggingFaceTB/SmolLM2-135M"
# `model` and `tokenizer` become notebook globals used by the trainer and the final evaluation.
model, tokenizer = load_model_and_tokenizer(model_name, USE_GPU)

In [21]:
# Load the "train" split of the SFT dataset from the Hugging Face Hub.
train_dataset = load_dataset("banghua/DL-SFT-Dataset")["train"]

# Without a GPU, keep only the first 100 examples (presumably to keep the
# CPU training run short).
if not USE_GPU:
    train_dataset=train_dataset.select(range(100))

# Print and tabulate the first few examples as a sanity check of the data format.
display_dataset(train_dataset)

********************
example 0 : {'messages': [{'content': "- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.", 'role': 'user'}, {'content': "This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree.", 'role': 'assistant'}]}

user_msg: - The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value. 

assistant_msg: This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficie

Unnamed: 0,User Prompt,Assistant Response
0,"- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.","This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree."
1,"To pass three levels must be the plan.\nThen tackle Two, when that is done.\nOf 100 that start, at the end will be 20.\nFinQuiz is a website that helps you prepare.\nUse it to be stress-free, and not lose your hair.\nThen, take the exam with a smile on your face.\nBe confident that you will gain your place.\nSo make this the goal to which you aspire. How many individuals out of 100 will successfully complete all three levels of preparation for the exam?","Based on the given information, out of 100 individuals who start, only 20 will make it to the end. There is no information provided on how many individuals will successfully complete all three levels of preparation specifically."
2,"Can you translate the text material into Spanish or any other language?: He really is, you know.\nThings a hero should show.\nHe loves me more than a zillion things.\nHe loves me when I sing my jolly folktale rhymes.\nHe's good, not just good, in fact he's great!\nBut because he's my best mate!\nWOW !!! I love it!!!!","¿Puede traducir el texto a español o a cualquier otro idioma?: \nRealmente lo es, ya sabes.\nCosas que un héroe debería demostrar.\nMe quiere más que un millón de cosas.\nMe quiere cuando canto mis alegres rimas de cuentos populares.\nEs bueno, no solo bueno, ¡de hecho es genial!\n¡Pero porque es mi mejor amigo!\n¡WOW! ¡Me encanta!"


In [22]:
# SFTTrainer config
# Defines how training is run: learning rate, epochs, batch size, logging, etc.

sft_config = SFTConfig(
    learning_rate=8e-5, # Learning rate used for training.
    num_train_epochs=1, # Number of passes over the training dataset.
    per_device_train_batch_size=1, # Examples per device (e.g., GPU) in each forward/backward pass.
    gradient_accumulation_steps=8, # Accumulate gradients over 8 forward/backward passes before each optimizer update (effective batch size per device: 1 x 8 = 8).
    gradient_checkpointing=False, # Gradient checkpointing is turned OFF here; turning it on reduces memory usage at the cost of slower training.
    logging_steps=2,  # Log training progress every 2 optimizer steps.

)

In [23]:
# Build the trainer from the small model, the config above and the chat dataset.
# No data collator is passed, so the trainer's default collation is used.
# NOTE(review): DataCollatorForCompletionOnlyLM is imported but not used here —
# confirm whether loss on assistant tokens only was intended.
sft_trainer = SFTTrainer(
    model=model, #The base model to be fine-tuned.
    args=sft_config, #Training arguments defined above.
    train_dataset=train_dataset, #Dataset to fine-tune the model on.
    processing_class=tokenizer, #Tokenizer used for processing input text.
)
# Run fine-tuning. As the last expression of the cell, the returned TrainOutput
# is displayed below the training log.
sft_trainer.train()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
2,2.6086
4,2.4234
6,2.3605
8,2.1261
10,2.191
12,2.0751


TrainOutput(global_step=12, training_loss=2.297439376513163, metrics={'train_runtime': 49.3189, 'train_samples_per_second': 2.028, 'train_steps_per_second': 0.243, 'total_flos': 10120976949888.0, 'train_loss': 2.297439376513163, 'epoch': 0.96})

- global_step: Total number of optimizer steps (i.e., weight updates) performed during training.
- training_loss: Training loss averaged over all optimizer steps of the run (not the loss of the last step).
- train_runtime: Total time taken to complete training, in seconds.
- train_samples_per_second: Number of training examples processed per second.
- train_steps_per_second: Number of optimizer steps (i.e., gradient updates) performed per second.
- total_flos: Estimated number of floating point operations used during training.
- epoch: Fraction of a training epoch completed (1.0 = full dataset seen once). Here it is 0.96 because 12 optimizer steps × 8 accumulated examples cover 96 of the 100 training examples.

## Testing training results on small model and small dataset

**Note:** The following results come from the small model and reduced dataset used for the SFT run above, chosen because of limited computational resources. To view the results of full-scale training on a larger model, see the **"SFT results on Qwen3-0.6B model"** section above.

In [24]:
# When no GPU was requested, make sure the trained model is on the CPU before
# generating (generate_responses moves the inputs to the model's device).
if not USE_GPU: #moving model to CPU when GPU isn’t requested:
    sft_trainer.model.to("cpu")

# Evaluate the fine-tuned model on the same probe questions used for the earlier models.
test_model_with_questions(sft_trainer.model, tokenizer, questions, 
                          title="Base Model (After SFT) Output")


=== Base Model (After SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
The course is designed to provide students with a solid foundation in the theory and practice of law. The course is designed to provide students with a solid foundation in the theory and practice of law. The course is designed to provide students with a solid foundation in the theory and practice of law. The course is designed to provide students with a solid foundation in the theory and practice of law. The course is designed to provide students with a solid foundation in the theory and practice of law. The course is designed


Model Input 2:
Calculate 1+1-1
Model Output 2:
1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+


Model Input 3:
What's the difference between thread and process?
Model Output 3:
Thread is a single process that is running in a single process space. A thread is a single process that is