In [4]:
!pip install unsloth



In [5]:
import json
from datasets import Dataset

with open('people_data.json', 'r') as f:
  data = json.load(f)

  tuning_examples = []

  for example in data:
    tuning_examples.append(
      f"<|user|>\n{example['prompt']}\n<|assistant|>\n{json.dumps(example['response'])}<|endoftext|>"
      )

dataset = Dataset.from_dict({'text': tuning_examples})

In [1]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Phi-3-mini-4k-instruct-bnb-4bit',
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.3: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=['q_proj', 'k_proj', "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=64*2,
    lora_dropout=0,
    bias='none',
    use_gradient_checkpointing='unsloth'
)

Unsloth 2025.11.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field='text',
    max_seq_length=2048,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        max_steps=60,
        num_train_epochs=3,
        logging_steps=1,
        output_dir='outputs',
        optim='adamw_8bit',
        report_to='none'  # avoid wandb
    )
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/300 [00:00<?, ? examples/s]

In [7]:
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 300 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Step,Training Loss
1,2.7847
2,2.7416
3,2.6257
4,2.6907
5,2.7871
6,2.4169
7,2.4766
8,2.281
9,2.1938
10,1.9435


TrainOutput(global_step=60, training_loss=1.1983629236618678, metrics={'train_runtime': 171.6755, 'train_samples_per_second': 2.796, 'train_steps_per_second': 0.349, 'total_flos': 822981377433600.0, 'train_loss': 1.1983629236618678, 'epoch': 1.5866666666666667})

In [8]:
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Mike is a 30 year old programmer. He loves hiking."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to('cuda')

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    do_sample=True,
    top_p=0.9
)

response = tokenizer.batch_decode(outputs)[0]

print(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|user|> Mike is a 30 year old programmer. He loves hiking.<|end|><|assistant|> {"name": "Mike", "age": "30", "job": "programmer", "gender": ""}<|end|>


In [None]:
model.save_pretrained_gguf(
    'finetuned_model',
    tokenizer,
    quantization_method='q4_k_m',
    maximum_memory_usage=0.3  # change this value based on the memory usage 
)