14 changes: 8 additions & 6 deletions nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
 ],
 "source": [
 "messages = [\n",
-" {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+" {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
 "]\n",
-"text = tokenizer.apply_chat_template(\n",
+"inputs = tokenizer.apply_chat_template(\n",
 " messages,\n",
-" tokenize = False,\n",
+" tokenize = True,\n",
 " add_generation_prompt = True, # Must add for generation\n",
-")\n",
+" return_tensors = \"pt\",\n",
+" return_dict = True,\n",
+").to(\"cuda\")\n",
 "\n",
 "from transformers import TextStreamer\n",
 "_ = model.generate(\n",
-" **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+" **inputs,\n",
 " max_new_tokens = 1024, # Increase for longer outputs!\n",
 " temperature = 0.7, top_p = 0.95,\n",
 " streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
\ No newline at end of file
+}
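Taken together, the `+` lines above give the following updated inference cell. This is a minimal runnable sketch: it assumes `model` and `tokenizer` have already been loaded earlier in the notebook (e.g. via Unsloth's `FastLanguageModel.from_pretrained`) and that a CUDA GPU is available.

```python
from transformers import TextStreamer

# Content is now a list of typed parts, matching Magistral's chat template.
messages = [
    {"role": "user", "content": [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]},
]

# Tokenize in one pass: return_dict = True yields input_ids plus attention_mask,
# so the result unpacks directly into model.generate.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,  # Must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")

_ = model.generate(
    **inputs,
    max_new_tokens = 1024,  # Increase for longer outputs!
    temperature = 0.7, top_p = 0.95,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
```

The key difference from the old cell: the template is no longer rendered to a string (`tokenize = False`) and then re-tokenized with `tokenizer(text, ...)`, a pattern that can insert special tokens such as BOS a second time; the new form tokenizes once and returns ready-to-use tensors.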
14 changes: 8 additions & 6 deletions nb/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
 ],
 "source": [
 "messages = [\n",
-" {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+" {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
 "]\n",
-"text = tokenizer.apply_chat_template(\n",
+"inputs = tokenizer.apply_chat_template(\n",
 " messages,\n",
-" tokenize = False,\n",
+" tokenize = True,\n",
 " add_generation_prompt = True, # Must add for generation\n",
-")\n",
+" return_tensors = \"pt\",\n",
+" return_dict = True,\n",
+").to(\"cuda\")\n",
 "\n",
 "from transformers import TextStreamer\n",
 "_ = model.generate(\n",
-" **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+" **inputs,\n",
 " max_new_tokens = 1024, # Increase for longer outputs!\n",
 " temperature = 0.7, top_p = 0.95,\n",
 " streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
\ No newline at end of file
+}
12 changes: 7 additions & 5 deletions original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1033,17 +1033,19 @@
 ],
 "source": [
 "messages = [\n",
-" {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+" {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
 "]\n",
-"text = tokenizer.apply_chat_template(\n",
+"inputs = tokenizer.apply_chat_template(\n",
 " messages,\n",
-" tokenize = False,\n",
+" tokenize = True,\n",
 " add_generation_prompt = True, # Must add for generation\n",
-")\n",
+" return_tensors = \"pt\",\n",
+" return_dict = True,\n",
+").to(\"cuda\")\n",
 "\n",
 "from transformers import TextStreamer\n",
 "_ = model.generate(\n",
-" **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+" **inputs,\n",
 " max_new_tokens = 1024, # Increase for longer outputs!\n",
 " temperature = 0.7, top_p = 0.95,\n",
 " streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
60 changes: 31 additions & 29 deletions python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py
@@ -7,34 +7,34 @@
 # <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
 # <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐
 # </div>
-# 
+#
 # To install Unsloth on your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
 # You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)
-# 
+#
 
 # ### News
 
-# 
+#
 # Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker).
-# 
+#
 # [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels!
-# 
+#
 # Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM.
-# 
+#
 # Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).
-# 
+#
 # Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
-# 
+#
 
 # # ### Installation
-# 
+#
 # # In[ ]:
-# 
-# 
+#
+#
 # get_ipython().run_cell_magic('capture', '', 'import os\n\n!pip install pip3-autoremove\n!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128\n!pip install unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n')
-# 
-# 
+#
+#
 # # ### Unsloth
 
 # In[ ]:
@@ -240,17 +240,19 @@ def generate_conversation(example):
 
 
 messages = [
-    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
+    {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]}
 ]
-text = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     messages,
-    tokenize = False,
+    tokenize = True,
     add_generation_prompt = True, # Must add for generation
-)
+    return_tensors = "pt",
+    return_dict = True,
+).to("cuda")
 
 from transformers import TextStreamer
 _ = model.generate(
-    **tokenizer(text, return_tensors = "pt").to("cuda"),
+    **inputs,
     max_new_tokens = 1024, # Increase for longer outputs!
     temperature = 0.7, top_p = 0.95,
     streamer = TextStreamer(tokenizer, skip_prompt = True),
@@ -260,7 +262,7 @@ def generate_conversation(example):
 # <a name="Save"></a>
 # ### Saving, loading finetuned models
 # To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
-# 
+#
 # **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
 
 # In[21]:
@@ -287,7 +289,7 @@ def generate_conversation(example):
 
 
 # ### Saving to float16 for VLLM
-# 
+#
 # We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
 
 # In[ ]:
@@ -316,12 +318,12 @@ def generate_conversation(example):
 
 # ### GGUF / llama.cpp Conversion
 # To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-# 
+#
 # Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
 # * `q8_0` - Fast conversion. High resource use, but generally acceptable.
 # * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
 # * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-# 
+#
 # [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 
 # In[24]:
@@ -358,22 +360,22 @@ def generate_conversation(example):
 
 
 # Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-# 
+#
 # And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-# 
+#
 # Some other links:
 # 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
 # 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
 # 4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-# 
+#
 # <div class="align-center">
 # <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
 # <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
 # <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>
-# 
+#
 # Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
 # </div>
-# 
+#
 # This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
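The GGUF section above is unchanged context, but since it only names the methods, here is a short sketch of the export it describes, using the `save_pretrained_gguf` / `push_to_hub_gguf` methods from the notebook's own prose; the output directory, Hub repo name, and token are hypothetical placeholders:

```python
# Local GGUF export; the notebook defaults to q8_0, and recommends q4_k_m.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

# Or upload the GGUF straight to the Hugging Face Hub
# (hypothetical repo name and token).
model.push_to_hub_gguf(
    "your-username/Magistral-24B-gguf", tokenizer,
    quantization_method = "q4_k_m", token = "hf_...",
)
```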
60 changes: 31 additions & 29 deletions python_scripts/Magistral_(24B)-Reasoning-Conversational.py
@@ -7,34 +7,34 @@
 # <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
 # <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐
 # </div>
-# 
+#
 # To install Unsloth on your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
 # You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)
-# 
+#
 
 # ### News
 
-# 
+#
 # Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker).
-# 
+#
 # [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels!
-# 
+#
 # Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM.
-# 
+#
 # Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).
-# 
+#
 # Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
-# 
+#
 
 # # ### Installation
-# 
+#
 # # In[ ]:
-# 
-# 
+#
+#
 # get_ipython().run_cell_magic('capture', '', 'import os, re\nif "COLAB_" not in "".join(os.environ.keys()):\n    !pip install unsloth\nelse:\n    # Do this only in Colab notebooks! Otherwise use pip install unsloth\n    import torch; v = re.match(r"[0-9]{1,}\\.[0-9]{1,}", str(torch.__version__)).group(0)\n    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")\n    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo\n    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer\n    !pip install --no-deps unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n')
-# 
-# 
+#
+#
 # # ### Unsloth
 
 # In[ ]:
@@ -240,17 +240,19 @@ def generate_conversation(example):
 
 
 messages = [
-    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
+    {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]}
 ]
-text = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     messages,
-    tokenize = False,
+    tokenize = True,
    add_generation_prompt = True, # Must add for generation
-)
+    return_tensors = "pt",
+    return_dict = True,
+).to("cuda")
 
 from transformers import TextStreamer
 _ = model.generate(
-    **tokenizer(text, return_tensors = "pt").to("cuda"),
+    **inputs,
     max_new_tokens = 1024, # Increase for longer outputs!
     temperature = 0.7, top_p = 0.95,
     streamer = TextStreamer(tokenizer, skip_prompt = True),
@@ -260,7 +262,7 @@ def generate_conversation(example):
 # <a name="Save"></a>
 # ### Saving, loading finetuned models
 # To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
-# 
+#
 # **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
 
 # In[21]:
@@ -287,7 +289,7 @@ def generate_conversation(example):
 
 
 # ### Saving to float16 for VLLM
-# 
+#
 # We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
 
 # In[ ]:
@@ -316,12 +318,12 @@ def generate_conversation(example):
 
 # ### GGUF / llama.cpp Conversion
 # To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-# 
+#
 # Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
 # * `q8_0` - Fast conversion. High resource use, but generally acceptable.
 # * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
 # * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-# 
+#
 # [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 
 # In[24]:
@@ -358,22 +360,22 @@ def generate_conversation(example):
 
 
 # Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-# 
+#
 # And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-# 
+#
 # Some other links:
 # 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
 # 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
 # 4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-# 
+#
 # <div class="align-center">
 # <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
 # <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
 # <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>
-# 
+#
 # Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
 # </div>
-# 
+#
 # This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
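For completeness, the saving flows described in the unchanged context above (LoRA-only, and merged float16 for vLLM) look roughly as follows. A minimal sketch, assuming the finetuned `model` and `tokenizer` from earlier in the script; directory names, the Hub repo name, and the token are hypothetical placeholders, and `save_pretrained_merged` is assumed here as the local counterpart of the `push_to_hub_merged` method the notebook names:

```python
# Local save: writes ONLY the LoRA adapters, not the full model.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Online save to the Hugging Face Hub (hypothetical repo name; create a
# token at https://huggingface.co/settings/tokens).
model.push_to_hub("your-username/Magistral-24B-lora", token = "hf_...")
tokenizer.push_to_hub("your-username/Magistral-24B-lora", token = "hf_...")

# Merged float16 export for vLLM, per the "Saving to float16 for VLLM"
# section; select merged_4bit instead for an int4 merge.
model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
```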