diff --git a/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb b/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
index af07b7f9..a2746d82 100644
--- a/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
],
"source": [
"messages = [\n",
- " {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+ " {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
"]\n",
- "text = tokenizer.apply_chat_template(\n",
+ "inputs = tokenizer.apply_chat_template(\n",
" messages,\n",
- " tokenize = False,\n",
+ " tokenize = True,\n",
" add_generation_prompt = True, # Must add for generation\n",
- ")\n",
+ " return_tensors = \"pt\",\n",
+ " return_dict = True,\n",
+ ").to(\"cuda\")\n",
"\n",
"from transformers import TextStreamer\n",
"_ = model.generate(\n",
- " **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+ " **inputs,\n",
" max_new_tokens = 1024, # Increase for longer outputs!\n",
" temperature = 0.7, top_p = 0.95,\n",
" streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
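
The same inference-cell change recurs in every file below. For reference, a minimal standalone sketch of the new pattern, assuming `model` and `tokenizer` have already been loaded (e.g. via Unsloth's `FastLanguageModel.from_pretrained`) and a CUDA device is available:

# Minimal sketch of the updated inference pattern (assumes `model` and
# `tokenizer` were already loaded, e.g. via unsloth.FastLanguageModel.from_pretrained,
# and that a CUDA device is available).
from transformers import TextStreamer

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]},
]

# apply_chat_template now tokenizes in one step and returns a dict of tensors
# (input_ids and attention_mask), so the old tokenizer(text, ...) round-trip goes away.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,  # must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")

_ = model.generate(
    **inputs,
    max_new_tokens = 1024,
    temperature = 0.7, top_p = 0.95,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
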
diff --git a/nb/Magistral_(24B)-Reasoning-Conversational.ipynb b/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
index 1dc0682c..d8340bb3 100644
--- a/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
],
"source": [
"messages = [\n",
- " {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+ " {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
"]\n",
- "text = tokenizer.apply_chat_template(\n",
+ "inputs = tokenizer.apply_chat_template(\n",
" messages,\n",
- " tokenize = False,\n",
+ " tokenize = True,\n",
" add_generation_prompt = True, # Must add for generation\n",
- ")\n",
+ " return_tensors = \"pt\",\n",
+ " return_dict = True,\n",
+ ").to(\"cuda\")\n",
"\n",
"from transformers import TextStreamer\n",
"_ = model.generate(\n",
- " **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+ " **inputs,\n",
" max_new_tokens = 1024, # Increase for longer outputs!\n",
" temperature = 0.7, top_p = 0.95,\n",
" streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb b/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
index 2b2da4ca..f04bc592 100644
--- a/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1033,17 +1033,19 @@
],
"source": [
"messages = [\n",
- " {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+ " {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
"]\n",
- "text = tokenizer.apply_chat_template(\n",
+ "inputs = tokenizer.apply_chat_template(\n",
" messages,\n",
- " tokenize = False,\n",
+ " tokenize = True,\n",
" add_generation_prompt = True, # Must add for generation\n",
- ")\n",
+ " return_tensors = \"pt\",\n",
+ " return_dict = True,\n",
+ ").to(\"cuda\")\n",
"\n",
"from transformers import TextStreamer\n",
"_ = model.generate(\n",
- " **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+ " **inputs,\n",
" max_new_tokens = 1024, # Increase for longer outputs!\n",
" temperature = 0.7, top_p = 0.95,\n",
" streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
diff --git a/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py b/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py
index 74de3ec6..2daa00d5 100644
--- a/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py
+++ b/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py
@@ -7,34 +7,34 @@
#
#
# Join Discord if you need help + ⭐ Star us on Github ⭐
#
-#
+#
# To install Unsloth your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-#
+#
# You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)
-#
+#
# ### News
-#
+#
# Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker).
-#
+#
# [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels!
-#
+#
# Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM.
-#
+#
# Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).
-#
+#
# Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
-#
+#
# # ### Installation
-#
+#
# # In[ ]:
-#
-#
+#
+#
# get_ipython().run_cell_magic('capture', '', 'import os\n\n!pip install pip3-autoremove\n!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128\n!pip install unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n')
-#
-#
+#
+#
# # ### Unsloth
# In[ ]:
@@ -240,17 +240,19 @@ def generate_conversation(example):
messages = [
- {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
+ {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]}
]
-text = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
messages,
- tokenize = False,
+ tokenize = True,
add_generation_prompt = True, # Must add for generation
-)
+ return_tensors = "pt",
+ return_dict = True,
+).to("cuda")
from transformers import TextStreamer
_ = model.generate(
- **tokenizer(text, return_tensors = "pt").to("cuda"),
+ **inputs,
max_new_tokens = 1024, # Increase for longer outputs!
temperature = 0.7, top_p = 0.95,
streamer = TextStreamer(tokenizer, skip_prompt = True),
@@ -260,7 +262,7 @@ def generate_conversation(example):
#
# ### Saving, loading finetuned models
# To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
-#
+#
# **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
# In[21]:
@@ -287,7 +289,7 @@ def generate_conversation(example):
# ### Saving to float16 for VLLM
-#
+#
# We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
# In[ ]:
@@ -316,12 +318,12 @@ def generate_conversation(example):
# ### GGUF / llama.cpp Conversion
# To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-#
+#
# Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
# * `q8_0` - Fast conversion. High resource use, but generally acceptable.
# * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
# * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-#
+#
# [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# In[24]:
@@ -358,22 +360,22 @@ def generate_conversation(example):
# Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-#
+#
# And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-#
+#
# Some other links:
# 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
# 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
# 6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-#
+#
diff --git a/python_scripts/Magistral_(24B)-Reasoning-Conversational.py b/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
--- a/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
+++ b/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
@@ -7,34 +7,34 @@
#
#
# Join Discord if you need help + ⭐ Star us on Github ⭐
#
-#
+#
# To install Unsloth your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-#
+#
# You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)
-#
+#
# ### News
-#
+#
# Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker).
-#
+#
# [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels!
-#
+#
# Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM.
-#
+#
# Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).
-#
+#
# Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
-#
+#
# # ### Installation
-#
+#
# # In[ ]:
-#
-#
+#
+#
# get_ipython().run_cell_magic('capture', '', 'import os, re\nif "COLAB_" not in "".join(os.environ.keys()):\n !pip install unsloth\nelse:\n # Do this only in Colab notebooks! Otherwise use pip install unsloth\n import torch; v = re.match(r"[0-9]{1,}\\.[0-9]{1,}", str(torch.__version__)).group(0)\n xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")\n !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo\n !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer\n !pip install --no-deps unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n')
-#
-#
+#
+#
# # ### Unsloth
# In[ ]:
@@ -240,17 +240,19 @@ def generate_conversation(example):
messages = [
- {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
+ {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]}
]
-text = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
messages,
- tokenize = False,
+ tokenize = True,
add_generation_prompt = True, # Must add for generation
-)
+ return_tensors = "pt",
+ return_dict = True,
+).to("cuda")
from transformers import TextStreamer
_ = model.generate(
- **tokenizer(text, return_tensors = "pt").to("cuda"),
+ **inputs,
max_new_tokens = 1024, # Increase for longer outputs!
temperature = 0.7, top_p = 0.95,
streamer = TextStreamer(tokenizer, skip_prompt = True),
@@ -260,7 +262,7 @@ def generate_conversation(example):
#
# ### Saving, loading finetuned models
# To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
-#
+#
# **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
# In[21]:
@@ -287,7 +289,7 @@ def generate_conversation(example):
# ### Saving to float16 for VLLM
-#
+#
# We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
# In[ ]:
@@ -316,12 +318,12 @@ def generate_conversation(example):
# ### GGUF / llama.cpp Conversion
# To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-#
+#
# Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
# * `q8_0` - Fast conversion. High resource use, but generally acceptable.
# * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
# * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-#
+#
# [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# In[24]:
@@ -358,22 +360,22 @@ def generate_conversation(example):
# Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-#
+#
# And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-#
+#
# Some other links:
# 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
# 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
# 6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-#
+#
#
-#
+#
# This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-#
+#
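
For context on why `**inputs` can go straight into `model.generate`: with `tokenize = True` and `return_dict = True`, `apply_chat_template` returns a dict-like `BatchEncoding` holding both `input_ids` and `attention_mask`, which is exactly what `generate` expects as keyword arguments. A small sketch, assuming the same `tokenizer` object the notebooks already load:

# Sketch only: inspect what the new apply_chat_template call returns
# (assumes `tokenizer` is the chat tokenizer already loaded in the notebook).
messages = [
    {"role": "user", "content": [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    return_dict = True,
)
print(list(inputs.keys()))        # typically ['input_ids', 'attention_mask']
print(inputs["input_ids"].shape)  # torch.Size([1, prompt_length])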