In [2]:
# !nvidia-smi || true
%pip -q install --upgrade pip
!pip install "unsloth>=0.3.0" "torch>=2.0.0" "transformers>=4.36.0" "datasets>=2.14.0" "trl>=0.7.4" "accelerate>=0.24.0" "bitsandbytes>=0.41.0" "scipy>=1.11.0" "click>=8.0.0" "wandb>=0.15.0"



In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Load model & tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mistralai/Mistral-7B-v0.1",
    max_seq_length=2048,
    full_finetuning=True
)

# (Keeping your current PEFT call; remove this line if you truly want full finetuning)
# model = FastLanguageModel.get_peft_model(model)

# Use a chat template that expects a list of {"from": ..., "value": ...}
tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

# Load dataset with 'prompt' and 'response' columns
origdataset = load_dataset("alespalla/chatbot_instruction_prompts", split="train")
pr_dataset = origdataset.select_columns(["prompt", "response"])

# Convert each prompt/response pair into a 2-turn conversation, then to a single 'text' field
def to_chat_text(batch):
    conversations = []
    for p, r in zip(batch["prompt"], batch["response"]):
        conversations.append(
            [
                {"from": "human", "value": p},
                {"from": "gpt", "value": r},
            ]
        )
    texts = [
        tokenizer.apply_chat_template(
            conv, tokenize=False, add_generation_prompt=False
        )
        for conv in conversations
    ]
    return {"text": texts}

dataset = pr_dataset.map(
    to_chat_text,
    batched=True,
    batch_size=100,
    desc="Formatting prompt/response into chat template",
)

# Train
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=2048,
    packing=False,  # Can make training faster for short sequences if set True
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=500,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

trainer.train()


Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.
==((====))==  Unsloth 2025.11.1: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using bfloat16 full finetuning which cuts memory usage by 50%.
To enable float32 training, use `float32_mixed_precision = True` during FastLanguageModel.from_pretrained


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Formatting prompt/response into chat template:   0%|          | 0/258042 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/258042 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 258,042 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 7,241,732,096 of 7,241,732,096 (100.00% trained)


Step,Training Loss
1,1.9452
2,2.1471
3,1.9887
4,4.9957
5,4.7791
6,10.6484
7,12.6497
8,7.584
9,7.4969
10,15.236


TrainOutput(global_step=500, training_loss=4.95949262046814, metrics={'train_runtime': 706.0173, 'train_samples_per_second': 5.666, 'train_steps_per_second': 0.708, 'total_flos': 2.481156758465741e+16, 'train_loss': 4.95949262046814, 'epoch': 0.015501352493005015})

Now that the model is trained, you can use it to generate responses based on new prompts. The following cell shows an example of how to do this.

In [6]:
# Example inference
prompt = "Hi" # Updated prompt

# Apply the chat template to the prompt
# We set add_generation_prompt=True to add the assistant's turn start token
input_text = tokenizer.apply_chat_template([{"from": "human", "value": prompt}], tokenize=False, add_generation_prompt=True)

# Tokenize the input text and include attention mask
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device) # Added padding and truncation

# Generate a response, passing the attention mask
outputs = model.generate(inputs.input_ids, max_new_tokens=100, use_cache=True, attention_mask=inputs.attention_mask) # Added attention_mask

# Decode the generated tokens back to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the original prompt and the generated response
print("Prompt:")
print(prompt)
print("\nGenerated Response:")
print(response)

Prompt:
Hi

Generated Response:
<|im_start|>user
Hi
<|im_start|>assistant
The most way to be a best way to be a best way to be a new way to be a new way to the local way.
