In [None]:
# Install the TRL library for training language models
!pip install trl



In [None]:
# Import user credentials and Hugging Face login utilities
from google.colab import userdata
import os
from huggingface_hub import login as hf_login

In [None]:
# Authenticate with the Hugging Face Hub, reading the access token from the
# Colab secret named "hf_token"
hf_login(userdata.get("hf_token"))

In [None]:
# Import model, tokenizer, and training utilities
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch

# Pick the compute device: prefer CUDA, then Apple MPS, otherwise fall back
# to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the tokenizer and the base model, then move the model to the device
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Apply TRL's chat-format setup; it returns the (model, tokenizer) pair to
# use from here on
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Name of the fine-tuned model; reused below as the Hub model id and as the
# local directory the trained model is saved to
finetune_name = "SmolLM2-FT-SQL"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Sanity check: run a throwaway sentence through the tokenizer and the chat
# template to see exactly what the model will receive

dummy_sentence = "This is a dummy sentence to observe formatting and tokenization."
print(f"Original sentence: {dummy_sentence}")

# Show how the sentence is split into tokens
tokens = tokenizer.tokenize(dummy_sentence)
print(f"Tokens: {tokens}")

# Wrap the sentence as a single user turn and render it with the chat
# template as text (not token ids), with the generation prompt appended
chat_template_messages = [dict(role="user", content=dummy_sentence)]
chat_formatted_input = tokenizer.apply_chat_template(
    chat_template_messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"Chat formatted input: {chat_formatted_input}")

Original sentence: This is a dummy sentence to observe formatting and tokenization.
Tokens: ['This', 'Ġis', 'Ġa', 'Ġdummy', 'Ġsentence', 'Ġto', 'Ġobserve', 'Ġformatting', 'Ġand', 'Ġtoken', 'ization', '.']
Chat formatted input: <|im_start|>user
This is a dummy sentence to observe formatting and tokenization.<|im_end|>
<|im_start|>assistant



In [None]:
# Download the synthetic text-to-SQL dataset ("default" configuration)
ds = load_dataset("gretelai/synthetic_text_to_sql", "default")

In [None]:
# List the column names of the training split before any preprocessing, to
# see which fields are available for building the chat messages
ds["train"].column_names

['id',
 'domain',
 'domain_description',
 'sql_complexity',
 'sql_complexity_description',
 'sql_task_type',
 'sql_task_type_description',
 'sql_prompt',
 'sql_context',
 'sql',
 'sql_explanation']

In [None]:
# Turn one dataset row into a chat conversation for training
def format_conversation(example):
    """Build a two-turn chat from a single dataset example.

    Args:
        example: Mapping that provides the "sql_prompt" and "sql" fields.

    Returns:
        A list of two message dicts: a "user" turn whose content is the
        example's "sql_prompt", followed by an "assistant" turn whose
        content is the example's "sql".
    """
    # (role, source field) pairs, in conversation order
    turns = (("user", "sql_prompt"), ("assistant", "sql"))
    return [{"role": role, "content": example[field]} for role, field in turns]


# Add a "messages" column holding the formatted conversation to both splits
for split in ("train", "test"):
    ds[split] = ds[split].map(lambda x: {"messages": format_conversation(x)})

In [None]:
# View the first formatted conversation in the training set.
# Index the row first and then the column, so only that one example is read
# instead of selecting the whole "messages" column to look at one entry.
ds["train"][0]["messages"]

[{'content': 'What is the total volume of timber sold by each salesperson, sorted by salesperson?',
  'role': 'user'},
 {'content': 'SELECT salesperson_id, name, SUM(volume) as total_volume FROM timber_sales JOIN salesperson ON timber_sales.salesperson_id = salesperson.salesperson_id GROUP BY salesperson_id, name ORDER BY total_volume DESC;',
  'role': 'assistant'}]

In [None]:
# Drop the 'text' column from each split when present; it is not needed for
# training
for split in ("train", "test"):
    if "text" in ds[split].column_names:
        ds[split] = ds[split].remove_columns("text")

In [None]:
# Configure supervised fine-tuning (SFT) parameters.
# Logging integrations (including Weights & Biases) are turned off through
# `report_to="none"` below; the deprecated WANDB_DISABLED environment variable
# is no longer needed.
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=3000,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch per device: 2 * 8 = 16
    learning_rate=2e-5,
    logging_steps=10,
    save_steps=500,
    # eval_steps is only honoured when evaluation is scheduled by steps;
    # without this the eval dataset passed to the trainer is never used
    eval_strategy="steps",
    eval_steps=500,
    hub_model_id=finetune_name,
    # fp16 mixed precision needs a CUDA GPU; keep it off on MPS/CPU so the
    # device fallback chosen earlier in the notebook still works
    fp16=(device == "cuda"),
    # The string "none" disables all reporting integrations; passing None
    # does not
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Re-list the training split's columns after preprocessing; the "messages"
# column added by the formatting step should now be present
ds["train"].column_names

['id',
 'domain',
 'domain_description',
 'sql_complexity',
 'sql_complexity_description',
 'sql_task_type',
 'sql_task_type_description',
 'sql_prompt',
 'sql_context',
 'sql',
 'sql_explanation',
 'messages']

In [None]:
# Build the SFT trainer from the model, the tokenizer, both dataset splits
# and the training configuration
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    args=sft_config,
)

In [None]:
# Run the fine-tuning loop
trainer.train()

# Write the trained model to a local directory named after the fine-tune
trainer.save_model("./" + finetune_name)

Step,Training Loss
10,2.9956
20,1.989
30,1.5913
40,1.4069
50,1.3018
60,1.3108
70,1.2754
80,1.2679
90,1.1858
100,1.197


Step,Training Loss
10,2.9956
20,1.989
30,1.5913
40,1.4069
50,1.3018
60,1.3108
70,1.2754
80,1.2679
90,1.1858
100,1.197


In [None]:
# Load the fine-tuned model and tokenizer for inference.
# Read from the directory that trainer.save_model() wrote to
# (f"./{finetune_name}"), not from the trainer's output_dir ("./sft_output"),
# which save_model() was not pointed at.
model_path = f"./{finetune_name}"
new_tokenizer = AutoTokenizer.from_pretrained(model_path)
new_model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

In [None]:
# Test the fine-tuned model with a sample prompt

# Format the prompt with the chat template of the reloaded tokenizer.
# add_generation_prompt=True appends the assistant header, so generation
# starts directly with the answer instead of the model having to produce
# that header itself.
prompt = "I want the first 10 movies that have ever reached a grade of 9/10 on rotten tomatotes."
messages = [{"role": "user", "content": prompt}]
formatted_prompt = new_tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Tokenize the formatted prompt for model input and move it to the device
inputs = new_tokenizer(formatted_prompt, return_tensors="pt").to(device)

In [None]:
# Generate a response with the reloaded fine-tuned model and decode it.
# Using new_model/new_tokenizer (rather than the in-memory training objects)
# checks that the saved artifacts actually work for inference.
# Sampling is enabled, so the generated text can differ between runs.
with torch.no_grad():
    outputs = new_model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=new_tokenizer.eos_token_id,
    )

# outputs[0] holds the prompt followed by the generated tokens; special
# tokens are stripped when decoding
response = new_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

user
I want the first 10 movies that have ever reached a grade of 9/10 on rotten tomatotes.
assistant
SELECT movie_id, grade FROM rotten_tomatoes WHERE grade = 9 AND movie_id > 0;


In [None]:
# Push the fine-tuned model to the Hugging Face Hub.
# The repository name comes from `hub_model_id` in the SFT config
# (finetune_name); presumably this needs the Hub login from the start of the
# notebook to have succeeded with a write-enabled token.
trainer.push_to_hub()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

events.out.tfevents.1748092566.17d20bcb9491.16093.0:   0%|          | 0.00/105k [00:00<?, ?B/s]

events.out.tfevents.1748091696.17d20bcb9491.1072.0:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

events.out.tfevents.1748092251.17d20bcb9491.1072.1:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thiborose/SmolLM2-FT-SQL/commit/e27aad042ef7bd879f94a655d4555cfa464b6a90', commit_message='End of training', commit_description='', oid='e27aad042ef7bd879f94a655d4555cfa464b6a90', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thiborose/SmolLM2-FT-SQL', endpoint='https://huggingface.co', repo_type='model', repo_id='thiborose/SmolLM2-FT-SQL'), pr_revision=None, pr_num=None)