In [3]:
from transformers import AutoTokenizer

# Two instruct checkpoints; each tokenizer applies its own chat template.
MISTRAL_CHECKPOINT = "mistralai/Mistral-7B-Instruct-v0.1"
SMOL_CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"

mistral_tokenizer = AutoTokenizer.from_pretrained(MISTRAL_CHECKPOINT)
smol_tokenizer = AutoTokenizer.from_pretrained(SMOL_CHECKPOINT)

# A minimal conversation: one system instruction followed by one user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Render the same conversation as plain text (tokenize=False) with each
# tokenizer so the two template formats can be compared side by side.
mistral_chat, smol_chat = (
    tokenizer.apply_chat_template(messages, tokenize=False)
    for tokenizer in (mistral_tokenizer, smol_tokenizer)
)

In [4]:
# Inspect the conversation as rendered by the Mistral tokenizer's chat template.
mistral_chat

'<s> [INST] You are a helpful assistant.\n\nHello! [/INST]'

In [5]:
# Inspect the same conversation as rendered by the SmolLM2 tokenizer's chat template.
smol_chat

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n'

In [7]:
from datasets import load_dataset

# Load the "systemchats-30k" configuration of the smoltalk dataset from the Hub.
dataset = load_dataset(
    path="HuggingFaceTB/smoltalk",
    name="systemchats-30k",
)

data/systemchats-30k/train-00000-of-0000(…):   0%|          | 0.00/47.9M [00:00<?, ?B/s]

data/systemchats-30k/test-00000-of-00001(…):   0%|          | 0.00/2.54M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34133 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1797 [00:00<?, ? examples/s]

In [17]:
def process_function(examples):
    """Render a batch of conversations to text with the Mistral chat template.

    Written for ``dataset.map(..., batched=True)``: the value stored under
    ``examples["messages"]`` is handed to ``apply_chat_template`` as a batch
    of conversations.

    Args:
        examples: Batch mapping with a ``"messages"`` entry that holds the
            conversations to format.

    Returns:
        dict: ``{"mistral_messages": rendered}`` where ``rendered`` is the
        untokenized (``tokenize=False``) template output for the batch.
    """
    # These are recorded conversations being formatted as text, not prompts
    # awaiting a reply, so no assistant generation prompt must be appended.
    # (With a ChatML-style template, add_generation_prompt=True would leave a
    # dangling assistant header at the end of every formatted sample.)
    processed_messages = mistral_tokenizer.apply_chat_template(
        examples["messages"], tokenize=False, add_generation_prompt=False
    )
    return {"mistral_messages": processed_messages}

# Apply the formatter to every split in batches; the result carries the extra
# "mistral_messages" field returned by process_function.
processed_dataset = dataset.map(function=process_function, batched=True)

Map:   0%|          | 0/34133 [00:00<?, ? examples/s]

Map:   0%|          | 0/1797 [00:00<?, ? examples/s]

In [20]:
# Show the original message list of the first training example.
processed_dataset["train"][0]["messages"]

[{'content': 'Always respond with an emoji at the end', 'role': 'system'},
 {'content': 'Can you help me draft a tweet about the beautiful sunset I saw today at the beach? I want to include a sense of tranquility and inspiration in the message.',
  'role': 'user'},
 {'content': '"Embraced by the tranquil whisper of the waves, I watched as the sky turned into a canvas of warm hues. A beautiful sunset at the beach today, painting a picture of inspiration and serenity. 🌅"',
  'role': 'assistant'},
 {'content': "That's a beautiful tweet, thank you! Can you now show me how to post it on Twitter?",
  'role': 'user'},
 {'content': "Sure! Here are the steps:\n\n1. Open the Twitter app or website. \n2. Click on the 'Tweet' button (it's a blue circle with a feather in it).\n3. Copy and paste the tweet I crafted into the text box that appears.\n4. When you're ready, click the 'Tweet' button to post it.\n\nRemember, Twitter has a character limit of 280 characters per tweet. Enjoy tweeting! 😊",
  '

In [22]:
# Show the same example after rendering with the Mistral chat template.
processed_dataset["train"][0]["mistral_messages"]

'<s> [INST] Always respond with an emoji at the end\n\nCan you help me draft a tweet about the beautiful sunset I saw today at the beach? I want to include a sense of tranquility and inspiration in the message. [/INST] "Embraced by the tranquil whisper of the waves, I watched as the sky turned into a canvas of warm hues. A beautiful sunset at the beach today, painting a picture of inspiration and serenity. 🌅"</s> [INST] That\'s a beautiful tweet, thank you! Can you now show me how to post it on Twitter? [/INST] Sure! Here are the steps:\n\n1. Open the Twitter app or website. \n2. Click on the \'Tweet\' button (it\'s a blue circle with a feather in it).\n3. Copy and paste the tweet I crafted into the text box that appears.\n4. When you\'re ready, click the \'Tweet\' button to post it.\n\nRemember, Twitter has a character limit of 280 characters per tweet. Enjoy tweeting! 😊</s>'