# Exploring Chat Templates with SmolLM2
 Chat templates help structure interactions between users and AI models, ensuring consistent and contextually appropriate responses.

In [None]:
# !pip install transformers datasets trl huggingface_hub python-dotenv

In [1]:
# Authenticate to Hugging Face
from huggingface_hub import login
from dotenv import load_dotenv
import os

# 加载.env文件
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
# 导入必要的包
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import setup_chat_format
import torch

# SmolLM2 Chat Template
定义一个简单的对话并应用聊天模板。

In [3]:
# Dynamically set the device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

cache_dir = "../../huggingface"
model_name = "HuggingFaceTB/SmolLM2-135M"

model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name, cache_dir=cache_dir)
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [4]:
# Define messages for SmolLM2
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {
        "role": "assistant",
        "content": "I'm doing well, thank you! How can I assist you today?",
    },
]

# Apply chat template without tokenization
分词器将对话表示为带有特殊标记的字符串，用于描述用户和助手的角色。

In [5]:
input_text = tokenizer.apply_chat_template(messages, tokenize=False)

print("Conversation with template:", input_text)

Conversation with template: <|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing well, thank you! How can I assist you today?<|im_end|>



## Decode the conversation

In [6]:
input_text = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True
)

print("Conversation decoded:", tokenizer.decode(token_ids=input_text))

Conversation decoded: <|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing well, thank you! How can I assist you today?<|im_end|>
<|im_start|>assistant



## Tokenize the conversation
分词器还会将对话和特殊分词作为与模型词汇相关的id。

In [7]:
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

print("Conversation tokenized:", input_text)

Conversation tokenized: [1, 4093, 198, 19556, 28, 638, 359, 346, 47, 2, 198, 1, 520, 9531, 198, 57, 5248, 2567, 876, 28, 9984, 346, 17, 1073, 416, 339, 4237, 346, 1834, 47, 2, 198, 1, 520, 9531, 198]


# Exercise: Process a dataset for SFT
从HuggingFace获取数据集，并针对 SFT 对其进行处理。

🐢 Convert the `HuggingFaceTB/smoltalk` dataset into chatml format.

🐕 Convert the `openai/gsm8k` dataset into chatml format.

In [8]:
from IPython.core.display import display, HTML

display(
    HTML(
        """<iframe
  src="https://huggingface.co/datasets/HuggingFaceTB/smoltalk/embed/viewer/all/train?row=0"
  frameborder="0"
  width="100%"
  height="360px"
></iframe>
"""
    )
)

  from IPython.core.display import display, HTML


In [36]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceTB/smoltalk", "everyday-conversations")

In [38]:
print(ds)
print("----------------------------")
print(ds['train'][0])

def process_dataset(sample):
    # TODO: 🐢 Convert the sample into a chat format
    # use the tokenizer's method to apply the chat template
    sample['messages'] = tokenizer.apply_chat_template(sample['messages'],add_generation_prompt=True, tokenize=True)
    return sample

ds_processed = ds.map(process_dataset)
print(ds_processed)
print("----------------------------")
print(ds_processed['train'][0])

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})
----------------------------
{'full_topic': 'Travel/Vacation destinations/Beach resorts', 'messages': [{'content': 'Hi there', 'role': 'user'}, {'content': 'Hello! How can I help you today?', 'role': 'assistant'}, {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?", 'role': 'user'}, {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.", 'role': 'assistant'}, {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?', 'role': 'user'}, {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of 

In [39]:
tokenizer.decode(ds_processed['train'][0]['messages'])

"<|im_start|>user\nHi there<|im_end|>\n<|im_start|>assistant\nHello! How can I help you today?<|im_end|>\n<|im_start|>user\nI'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>\n<|im_start|>assistant\nSome popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>\n<|im_start|>user\nThat sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>\n<|im_start|>assistant\nYes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>\n<|im_start|>user\nOkay, I'll look into those. Thanks for the recommendations!<|im_end|>\n<|im_start|>assistant\nYou're welcome. I hope you find the perfect resort for your vacation.<|im_end|>\n<|im_start|>assistant\n"

In [40]:
display(
    HTML(
        """<iframe
  src="https://huggingface.co/datasets/openai/gsm8k/embed/viewer/main/train"
  frameborder="0"
  width="100%"
  height="360px"
></iframe>
"""
    )
)

In [41]:
ds = load_dataset("openai/gsm8k", "main")
ds['train'][1]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
 'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'}

In [44]:
def process_dataset(sample):
    # TODO: 🐕 Convert the sample into a chat format
    # 1. create a message format with the role and content
    user_text = sample['question']
    assistant_text = sample['answer']
    # 2. apply the chat template to the samples using the tokenizer's method
    sample['messages'] = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ],
        add_generation_prompt=False,
        tokenize=True
    )
    return sample


ds_processed = ds.map(process_dataset)
print(ds_processed)
print("----------------------------")
print(ds_processed['train'][1])
tokenizer.decode(ds_processed['train'][1]['messages'])

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'messages'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer', 'messages'],
        num_rows: 1319
    })
})
----------------------------
{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?', 'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10', 'messages': [1, 4093, 198, 71, 1059, 38668, 1885, 33, 34, 354, 5353, 327, 3383, 672, 9584, 30, 718, 15955, 28, 1041, 915, 1250, 216, 37, 32, 3487, 282, 3383, 672, 9584, 30, 1073, 1083, 1250, 1041, 5301, 47, 2, 198, 1, 520, 9531, 198, 71, 1059, 38668, 216, 33, 34, 31, 38, 32, 446, 1885, 33691, 33, 34, 31, 38, 32, 45, 32, 30, 34, 7791, 32, 30, 34, 567, 8427, 30, 198, 23830, 216, 37, 32, 3487, 28, 1041, 11420, 216, 32, 30, 34, 1792, 216, 37, 32, 446, 1885, 33691, 32, 30, 34, 26, 37

'<|im_start|>user\nWeng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?<|im_end|>\n<|im_start|>assistant\nWeng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10<|im_end|>\n'