In [1]:
pip install unsloth transformers trl

Collecting unsloth
  Downloading unsloth-2026.1.3-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.27.0-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2026.1.3 (from unsloth)
  Downloading unsloth_zoo-2026.1.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.5-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.

In [16]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True
 )

==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model, r=16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

Unsloth 2026.1.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

In [6]:
# Load the full dataset first
dataset = load_dataset("mlabonne/FineTome-100k")

# Split into train and validation
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [7]:
# Apply standardization separately to train and validation
train_dataset = standardize_sharegpt(train_dataset)
val_dataset = standardize_sharegpt(val_dataset)


Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/90000 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 10000
    })
})

In [9]:
train_dataset[0]
val_dataset[0]


{'conversations': [{'content': 'Write Python code to solve the task:\nRecall that string a is a subsequence of a string b if a can be obtained from b by deletion of several (possibly zero or all) characters. For example, for the string a="wowwo", the following strings are subsequences: "wowwo", "wowo", "oo", "wow", "", and others, but the following are not subsequences: "owoo", "owwwo", "ooo".\n\nThe wow factor of a string is the number of its subsequences equal to the word "wow". Bob wants to write a string that has a large wow factor. However, the "w" key on his keyboard is broken, so he types two "v"s instead. \n\nLittle did he realise that he may have introduced more "w"s than he thought. Consider for instance the string "ww". Bob would type it as "vvvv", but this string actually contains three occurrences of "w": \n\n  * "vvvv" \n  * "vvvv" \n  * "vvvv" \n\n\n\nFor example, the wow factor of the word "vvvovvv" equals to four because there are four wows:\n\n  * "vvvovvv" \n  * "vvv

In [10]:
train_dataset = train_dataset.map(
    lambda examples: {
        "text": [
            tokenizer.apply_chat_template(convo, tokenize=False)
            for convo in examples["conversations"]
        ]
    },
    batched=True
)

val_dataset = val_dataset.map(
    lambda examples: {
        "text": [
            tokenizer.apply_chat_template(convo, tokenize=False)
            for convo in examples["conversations"]
        ]
    },
    batched=True
)


Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
train_dataset
val_dataset


Dataset({
    features: ['conversations', 'source', 'score', 'text'],
    num_rows: 10000
})

In [12]:
train_dataset[0]
val_dataset[0]


{'conversations': [{'content': 'Write Python code to solve the task:\nRecall that string a is a subsequence of a string b if a can be obtained from b by deletion of several (possibly zero or all) characters. For example, for the string a="wowwo", the following strings are subsequences: "wowwo", "wowo", "oo", "wow", "", and others, but the following are not subsequences: "owoo", "owwwo", "ooo".\n\nThe wow factor of a string is the number of its subsequences equal to the word "wow". Bob wants to write a string that has a large wow factor. However, the "w" key on his keyboard is broken, so he types two "v"s instead. \n\nLittle did he realise that he may have introduced more "w"s than he thought. Consider for instance the string "ww". Bob would type it as "vvvv", but this string actually contains three occurrences of "w": \n\n  * "vvvv" \n  * "vvvv" \n  * "vvvv" \n\n\n\nFor example, the wow factor of the word "vvvovvv" equals to four because there are four wows:\n\n  * "vvvovvv" \n  * "vvv

In [13]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [14]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4aad8f289e48e5d9c02df59c3e57e768d6a3c1d2749888c76a0c2025c7fca5af
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        output_dir="outputs"
    )
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/90000 [00:00<?, ? examples/s]

In [18]:
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 90,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,1.3026
2,1.4275
3,1.2132
4,1.4087
5,1.5081
6,1.2581
7,1.1865
8,1.351
9,0.9508
10,1.0967




0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/grad_norm,██▇▇▆▇▄▄▃▄▃▄▃▂▂▄▂▂▂▃▂▁▂▁▂▃▂▂▂▁▂▃▁▁▂▃▂▃▃▂
train/learning_rate,▁▂▄▇███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
train/loss,▆▇▅▇█▅▇▃▄▄▃▄▄▃▃▂▄▃▃▄▃▃▃▄▄▂▄▄▂▃▅▂▂▁▂▅▁▄▃▄

0,1
total_flos,5483719279632384.0
train/epoch,0.00533
train/global_step,60.0
train/grad_norm,0.20544
train/learning_rate,0.0
train/loss,1.0099
train_loss,1.03061
train_runtime,334.6011
train_samples_per_second,1.435
train_steps_per_second,0.179


TrainOutput(global_step=60, training_loss=1.0306070178747178, metrics={'train_runtime': 334.6011, 'train_samples_per_second': 1.435, 'train_steps_per_second': 0.179, 'total_flos': 5483719279632384.0, 'train_loss': 1.0306070178747178, 'epoch': 0.005333333333333333})

In [19]:
model.save_pretrained("finetuned_model1")

In [20]:
inference_model, inference_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./finetuned_model1",
    max_seq_length=2048,
    load_in_4bit=True
)

==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [21]:
import torch
import evaluate

small_val = val_dataset.select(range(50))

rouge_metric = evaluate.load("rouge")

all_preds = []
all_labels = []

for example in small_val:
    formatted_prompt = inference_tokenizer.apply_chat_template([{
        "role": "user",
        "content": example["text"]
    }], tokenize=False)

    model_inputs = inference_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        generated_ids = inference_model.generate(
            **model_inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            pad_token_id=inference_tokenizer.pad_token_id
        )

    response = inference_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Save prediction and reference
    all_preds.append(response)
    all_labels.append(example["text"])

rouge_results = rouge_metric.compute(predictions=all_preds, references=all_labels)

print("\nRouge Scores (small validation set):")
for key, value in rouge_results.items():
    print(f"{key}: {value*100:.2f}%")


Downloading builder script: 0.00B [00:00, ?B/s]

Unsloth: Input IDs of shape torch.Size([1, 2743]) with length 2743 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.



Rouge Scores (small validation set):
rouge1: 84.73%
rouge2: 82.99%
rougeL: 84.50%
rougeLsum: 84.70%


In [22]:
text_prompts = [
    "What are the key principles of investment?"
]

for prompt in text_prompts:
  formatted_prompt = inference_tokenizer.apply_chat_template([{
      "role": "user",
      "content": prompt
      }], tokenize=False)

  model_inputs = inference_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
  generated_ids = inference_model.generate(
      **model_inputs,
      max_new_tokens=512,
      temperature=0.7,
      do_sample=True,
      pad_token_id=inference_tokenizer.pad_token_id
  )
  response = inference_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
  print(response)

system

Cutting Knowledge Date: December 2023
Today Date: 18 Jan 2026

user

What are the key principles of investment?assistant

The key principles of investment are:

1. Diversification: This principle involves spreading investments across different asset classes, sectors, and geographic regions to minimize risk. Diversification can help investors reduce their exposure to any single investment and potentially increase their overall returns.
2. Long-term perspective: Investing for the long-term is essential, as it allows investors to ride out market fluctuations and capture the power of compounding. A long-term perspective can help investors avoid making impulsive decisions based on short-term market trends.
3. Risk management: Investors should understand and manage the risks associated with their investments. This can involve setting stop-loss orders, diversifying portfolios, and regularly reviewing investment performance.
4. Dollar-cost averaging: This principle involves investing a