diff --git a/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb b/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
index af07b7f9..a2746d82 100644
--- a/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/nb/Kaggle-Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
    ],
    "source": [
     "messages = [\n",
-    "    {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+    "    {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
     "]\n",
-    "text = tokenizer.apply_chat_template(\n",
+    "inputs = tokenizer.apply_chat_template(\n",
     "    messages,\n",
-    "    tokenize = False,\n",
+    "    tokenize = True,\n",
     "    add_generation_prompt = True, # Must add for generation\n",
-    ")\n",
+    "    return_tensors = \"pt\",\n",
+    "    return_dict = True,\n",
+    ").to(\"cuda\")\n",
     "\n",
     "from transformers import TextStreamer\n",
     "_ = model.generate(\n",
-    "    **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+    "    **inputs,\n",
     "    max_new_tokens = 1024, # Increase for longer outputs!\n",
     "    temperature = 0.7, top_p = 0.95,\n",
     "    streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/nb/Magistral_(24B)-Reasoning-Conversational.ipynb b/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
index 1dc0682c..d8340bb3 100644
--- a/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/nb/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1056,17 +1056,19 @@
    ],
    "source": [
     "messages = [\n",
-    "    {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+    "    {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
     "]\n",
-    "text = tokenizer.apply_chat_template(\n",
+    "inputs = tokenizer.apply_chat_template(\n",
     "    messages,\n",
-    "    tokenize = False,\n",
+    "    tokenize = True,\n",
     "    add_generation_prompt = True, # Must add for generation\n",
-    ")\n",
+    "    return_tensors = \"pt\",\n",
+    "    return_dict = True,\n",
+    ").to(\"cuda\")\n",
     "\n",
     "from transformers import TextStreamer\n",
     "_ = model.generate(\n",
-    "    **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+    "    **inputs,\n",
     "    max_new_tokens = 1024, # Increase for longer outputs!\n",
     "    temperature = 0.7, top_p = 0.95,\n",
     "    streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
@@ -6419,4 +6421,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb b/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
index 2b2da4ca..f04bc592 100644
--- a/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
+++ b/original_template/Magistral_(24B)-Reasoning-Conversational.ipynb
@@ -1033,17 +1033,19 @@
    ],
    "source": [
     "messages = [\n",
-    "    {\"role\" : \"user\", \"content\" : \"Solve (x + 2)^2 = 0.\"}\n",
+    "    {\"role\" : \"user\", \"content\" : [{\"type\": \"text\", \"text\": \"Solve (x + 2)^2 = 0.\"}]}\n",
     "]\n",
-    "text = tokenizer.apply_chat_template(\n",
+    "inputs = tokenizer.apply_chat_template(\n",
     "    messages,\n",
-    "    tokenize = False,\n",
+    "    tokenize = True,\n",
     "    add_generation_prompt = True, # Must add for generation\n",
-    ")\n",
+    "    return_tensors = \"pt\",\n",
+    "    return_dict = True,\n",
+    ").to(\"cuda\")\n",
     "\n",
     "from transformers import TextStreamer\n",
     "_ = model.generate(\n",
-    "    **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
+    "    **inputs,\n",
     "    max_new_tokens = 1024, # Increase for longer outputs!\n",
outputs!\n", " temperature = 0.7, top_p = 0.95,\n", " streamer = TextStreamer(tokenizer, skip_prompt = True),\n", diff --git a/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py b/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py index 74de3ec6..2daa00d5 100644 --- a/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py +++ b/python_scripts/Kaggle-Magistral_(24B)-Reasoning-Conversational.py @@ -7,34 +7,34 @@ # # Join Discord if you need help + ⭐ Star us on Github ⭐ # -# +# # To install Unsloth your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme). -# +# # You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save) -# +# # ### News -# +# # Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker). -# +# # [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels! -# +# # Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM. -# +# # Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning). -# +# # Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). -# +# # # ### Installation -# +# # # In[ ]: -# -# +# +# # get_ipython().run_cell_magic('capture', '', 'import os\n\n!pip install pip3-autoremove\n!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128\n!pip install unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n') -# -# +# +# # # ### Unsloth # In[ ]: @@ -240,17 +240,19 @@ def generate_conversation(example): messages = [ - {"role" : "user", "content" : "Solve (x + 2)^2 = 0."} + {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]} ] -text = tokenizer.apply_chat_template( +inputs = tokenizer.apply_chat_template( messages, - tokenize = False, + tokenize = True, add_generation_prompt = True, # Must add for generation -) + return_tensors = "pt", + return_dict = True, +).to("cuda") from transformers import TextStreamer _ = model.generate( - **tokenizer(text, return_tensors = "pt").to("cuda"), + **inputs, max_new_tokens = 1024, # Increase for longer outputs! temperature = 0.7, top_p = 0.95, streamer = TextStreamer(tokenizer, skip_prompt = True), @@ -260,7 +262,7 @@ def generate_conversation(example): # # ### Saving, loading finetuned models # To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save. -# +# # **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down! 
 # In[21]:
@@ -287,7 +289,7 @@
 # ### Saving to float16 for VLLM
-# 
+#
 # We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
 # In[ ]:
@@ -316,12 +318,12 @@
 # ### GGUF / llama.cpp Conversion
 # To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-# 
+#
 # Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
 # * `q8_0` - Fast conversion. High resource use, but generally acceptable.
 # * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
 # * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-# 
+#
 # [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # In[24]:
@@ -358,22 +360,22 @@
 # Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-# 
+#
 # And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-# 
+#
 # Some other links:
 # 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
 # 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
 # 6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-# 
+#
 #
 #
 #
 #
-# 
+#
 # Join Discord if you need help + ⭐️ Star us on Github ⭐️
 #
-# 
+#
 # This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
diff --git a/python_scripts/Magistral_(24B)-Reasoning-Conversational.py b/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
index ddab31ae..51a92d60 100644
--- a/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
+++ b/python_scripts/Magistral_(24B)-Reasoning-Conversational.py
@@ -7,34 +7,34 @@
 #
 # Join Discord if you need help + ⭐ Star us on Github ⭐
 #
-# 
+#
 # To install Unsloth your local device, follow [our guide](https://docs.unsloth.ai/get-started/install-and-update). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
 # You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)
-# 
+#
 # ### News
-# 
+#
 # Unsloth's [Docker image](https://hub.docker.com/r/unsloth/unsloth) is here! Start training with no setup & environment issues. [Read our Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker).
-# 
+#
 # [gpt-oss RL](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) is now supported with the fastest inference & lowest VRAM. Try our [new notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) which creates kernels!
-# 
+#
 # Introducing [Vision](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) and [Standby](https://docs.unsloth.ai/basics/memory-efficient-rl) for RL! Train Qwen, Gemma etc. VLMs with GSPO - even faster with less VRAM.
-# 
+#
 # Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).
-# 
+#
 # Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
-# 
+#
 #
 # ### Installation
-# 
+#
 #
 # In[ ]:
-# 
-# 
+#
+#
 # get_ipython().run_cell_magic('capture', '', 'import os, re\nif "COLAB_" not in "".join(os.environ.keys()):\n    !pip install unsloth\nelse:\n    # Do this only in Colab notebooks! Otherwise use pip install unsloth\n    import torch; v = re.match(r"[0-9]{1,}\\.[0-9]{1,}", str(torch.__version__)).group(0)\n    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")\n    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo\n    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer\n    !pip install --no-deps unsloth\n!pip install transformers==4.56.2\n!pip install --no-deps trl==0.22.2\n')
-# 
-# 
+#
+#
 #
 # ### Unsloth
 # In[ ]:
@@ -240,17 +240,19 @@ def generate_conversation(example):
 messages = [
-    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
+    {"role" : "user", "content" : [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]}
 ]
-text = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     messages,
-    tokenize = False,
+    tokenize = True,
     add_generation_prompt = True, # Must add for generation
-)
+    return_tensors = "pt",
+    return_dict = True,
+).to("cuda")
 
 from transformers import TextStreamer
 _ = model.generate(
-    **tokenizer(text, return_tensors = "pt").to("cuda"),
+    **inputs,
     max_new_tokens = 1024, # Increase for longer outputs!
     temperature = 0.7, top_p = 0.95,
     streamer = TextStreamer(tokenizer, skip_prompt = True),
@@ -260,7 +262,7 @@ def generate_conversation(example):
 #
 # ### Saving, loading finetuned models
 # To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
-# 
+#
 # **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
 # In[21]:
@@ -287,7 +289,7 @@
 # ### Saving to float16 for VLLM
-# 
+#
 # We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
 # In[ ]:
@@ -316,12 +318,12 @@
 # ### GGUF / llama.cpp Conversion
 # To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
-# 
+#
 # Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
 # * `q8_0` - Fast conversion. High resource use, but generally acceptable.
 # * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
 # * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
-# 
+#
 # [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # In[24]:
@@ -358,22 +360,22 @@
 # Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp.
-# 
+#
 # And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
-# 
+#
 # Some other links:
 # 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
 # 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
 # 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
 # 6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
-# 
+#
 #
 #
 #
 #
-# 
+#
 # Join Discord if you need help + ⭐️ Star us on Github ⭐️
 #
-# 
+#
 # This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
-# 
+#
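
For reference, the change repeated across all five files replaces the two-step flow (render the chat template to a string with `tokenize = False`, then tokenize that string separately) with a single `apply_chat_template` call that returns tensors directly. The sketch below is only an illustration of the patched inference cell, not part of the patch; it assumes `model` and `tokenizer` were already created earlier in the notebook (for example via Unsloth's `FastLanguageModel.from_pretrained`) and that a CUDA device is available.

```python
# Sketch of the inference pattern introduced by this patch.
# Assumes `model` and `tokenizer` already exist (loaded earlier in the
# notebook, e.g. with Unsloth's FastLanguageModel) and a CUDA GPU is present.
from transformers import TextStreamer

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Solve (x + 2)^2 = 0."}]},
]

# Tokenize the chat template in one step; return_dict = True yields a dict
# with input_ids and attention_mask that can be passed straight to generate().
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,  # Must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")

_ = model.generate(
    **inputs,
    max_new_tokens = 1024,  # Increase for longer outputs!
    temperature = 0.7, top_p = 0.95,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
```

Compared with the old flow, the dict form passes the attention mask to `generate` explicitly and avoids re-tokenizing the rendered prompt string, which can otherwise insert a second BOS token.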