[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/timz815/360-NLP-Project/blob/main/optuna.ipynb)

In [1]:
import os, shutil, tempfile
# Remove compiled artefacts left behind by earlier sessions; a missing
# directory is not an error.
shutil.rmtree(os.path.expanduser("~/unsloth_compiled_cache"), ignore_errors=True)

# Point both the process temp dir and the Inductor cache at one freshly
# created scratch directory.
os.environ["TMPDIR"] = os.environ["TORCHINDUCTOR_CACHE_DIR"] = tempfile.mkdtemp(
    prefix="torchinductor_"
)

os.environ.update(
    {
        # Turn off the dynamo compile step, which crashes on missing tmp files.
        "TORCH_COMPILE_DISABLE": "1",
        # Unsloth / tokenizer settings
        "TOKENIZERS_PARALLELISM": "false",
        "UNSLOTH_DISABLE_FUSED_LOSS": "1",  # normal CE loss
        "UNSLOTH_FREE_GB": "4",
        "UNSLOTH_DISABLE_COMPILED_CACHE": "1",
    }
)

In [2]:
import torch, gc, optuna
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W1205 02:41:06.630000 38708 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.



🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# JSON-Lines corpus: each non-empty line is parsed as one JSON record.
jsonl_path = r"C:\Users\timot\Downloads\nlp training\movie_dialogue.jsonl"

# Read inside a context manager so the file handle is closed deterministically,
# and skip blank lines (e.g. a trailing newline) that would make json.loads raise.
with open(jsonl_path, encoding="utf-8") as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]
ds = Dataset.from_list(data)

# First cut: hold out 20% of all records as the test set.
splits = ds.train_test_split(test_size=0.2, seed=42)
test_ds = splits["test"]
train_val = splits["train"]

# Second cut: 12.5% of the remaining 80% (i.e. 10% of the full corpus)
# becomes the validation set; the rest is used for training.
val_splits = train_val.train_test_split(test_size=0.125, seed=42)
train_ds = val_splits["train"]
val_ds   = val_splits["test"]

print("Train:", len(train_ds))
print("Val:",   len(val_ds))
print("Test:",  len(test_ds))

Train: 3679
Val: 526
Test: 1052


In [4]:
# Checkpoint identifier passed to Unsloth's loader.
model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit"

# Load the 4-bit model and its tokenizer once; `base_model` and `tokenizer`
# are reused by the cells below.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    device_map={"": 0},  # place the whole model on GPU 0
    load_in_4bit=True,
    max_seq_length=2048,
)

# Report how much GPU memory torch has allocated after loading, in GiB.
print("Base model in VRAM:", torch.cuda.memory_allocated() / 2**30, "GB")

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE_TORCH}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.11.4: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model in VRAM: 3.3413352966308594 GB


In [5]:
def format_example(ex, tokenizer):
    """Render one record as a two-turn chat string.

    Args:
        ex: Mapping with a "chinese" entry (used as the user turn) and an
            "english" entry (used as the assistant turn).
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        ``{"text": <rendered conversation>}``; the template is applied with
        ``tokenize=False`` and ``add_generation_prompt=False``.
    """
    user_turn = {"role": "user", "content": ex["chinese"]}
    assistant_turn = {"role": "assistant", "content": ex["english"]}
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

def tokenize_example(ex, tokenizer, max_length=2048):
    """Tokenize the "text" field of a record.

    Args:
        ex: Mapping containing a "text" entry.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=max_length)``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        A dict holding only the "input_ids" and "attention_mask" entries of
        the tokenizer's result.
    """
    encoded = tokenizer(ex["text"], truncation=True, max_length=max_length)
    return {field: encoded[field] for field in ("input_ids", "attention_mask")}

# Apply the same two-stage pipeline to both splits: render the chat text,
# then tokenize it.
train_ds = (
    train_ds
    .map(format_example, fn_kwargs={"tokenizer": tokenizer})
    .map(tokenize_example, fn_kwargs={"tokenizer": tokenizer})
)
val_ds = (
    val_ds
    .map(format_example, fn_kwargs={"tokenizer": tokenizer})
    .map(tokenize_example, fn_kwargs={"tokenizer": tokenizer})
)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3679/3679 [00:00<00:00, 8852.71 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3679/3679 [00:00<00:00, 8334.10 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 526/526 [00:00<00:00, 9275.45 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 526/526 [00:00<00:00, 9121.07 examples/s]


In [6]:
def objective(trial):
    """Optuna objective: train a LoRA adapter with sampled settings and score it.

    Sampled hyperparameters:
        lr                           log-uniform in [1e-5, 5e-4]
        per_device_train_batch_size  1 or 2
        gradient_accumulation_steps  2 or 4
        lora_r                       8 or 16
        lora_alpha                   16 or 32
        warmup_steps                 10, 20 or 30
        max_steps                    100, 200 or 300

    Note that ``max_steps`` is itself tuned, so trials are not compared at an
    equal training budget.

    Uses the module-level ``base_model``, ``tokenizer``, ``train_ds`` and
    ``val_ds``.

    Args:
        trial: The ``optuna`` trial used to draw hyperparameters.

    Returns:
        The ``eval_loss`` reported by a final ``trainer.evaluate()`` call, or
        the sentinel ``9999.0`` if that key is absent from the metrics.

    Raises:
        Any exception raised while building or training the model propagates
        to the caller; GPU memory is still released first.
    """
    lr        = trial.suggest_float("lr", 1e-5, 5e-4, log=True)
    bs        = trial.suggest_categorical("per_device_train_batch_size", [1, 2])
    ga        = trial.suggest_categorical("gradient_accumulation_steps", [2, 4])
    r         = trial.suggest_categorical("lora_r", [8, 16])
    alpha     = trial.suggest_categorical("lora_alpha", [16, 32])
    warmup    = trial.suggest_int("warmup_steps", 10, 30, step=10)
    max_steps = trial.suggest_int("max_steps", 100, 300, step=100)

    print(f"\nTrial {trial.number}  lr={lr:.2e}  r={r}  bs={bs}  ga={ga}")

    # Query the hardware capability once; exactly one of fp16/bf16 is enabled.
    bf16_ok = torch.cuda.is_bf16_supported()

    # Pre-bind so the `finally` clause can always drop both references, even
    # when construction fails part-way through.
    model = None
    trainer = None
    try:
        # `base_model` is the single module-level instance shared by every
        # trial, and get_peft_model is invoked on it again each time.
        # NOTE(review): confirm each call yields freshly initialised adapters
        # rather than continuing from the previous trial's weights.
        model = FastLanguageModel.get_peft_model(
            base_model,
            r=r,
            lora_alpha=alpha,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_dropout=0.0,
            bias="none",
            use_gradient_checkpointing=True,
        )

        args = SFTConfig(
            per_device_train_batch_size=bs,
            gradient_accumulation_steps=ga,
            warmup_steps=warmup,
            max_steps=max_steps,
            learning_rate=lr,
            logging_steps=10,
            output_dir=f"./optuna_trial_{trial.number}",
            optim="adamw_8bit",
            fp16=not bf16_ok,
            bf16=bf16_ok,
            eval_strategy="steps",
            eval_steps=20,
            # No checkpoints are written during the search.
            save_strategy="no",
            load_best_model_at_end=False,
            report_to="none",
        )

        trainer = SFTTrainer(
            model=model,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=tokenizer,
            args=args,
        )
        trainer.train()
        # Score the trial with one more evaluation pass after training ends.
        eval_loss = trainer.evaluate().get("eval_loss", 9999.0)
        return eval_loss
    finally:
        # Runs on success AND on failure (e.g. CUDA out-of-memory), so a
        # crashed trial cannot keep its trainer/adapter alive and starve the
        # following trials of VRAM.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

In [7]:
# Seeded TPE sampler, built separately and handed to the study.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=tpe_sampler, direction="minimize")

# Ten trials of `objective`, minimising its return value; gc_after_trial=True
# is passed through to Optuna.
study.optimize(objective, n_trials=10, gc_after_trial=True)

print("\n===== BEST =====")
print(f"value : {study.best_value}")
print(f"params: {study.best_params}")

[I 2025-12-05 02:41:38,385] A new study created in memory with name: no-name-fd7eba12-bf83-46e0-a483-5e94d6e2c388



Trial 0  lr=4.33e-05  r=8  bs=1  ga=2


Unsloth 2025.11.4 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,7.6636,7.212988
40,3.8129,3.463493
60,2.6587,2.771253
80,2.653,2.488064
100,2.5978,2.446882


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


[I 2025-12-05 02:45:33,653] Trial 0 finished with value: 2.4468817710876465 and parameters: {'lr': 4.3284502212938785e-05, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 16, 'warmup_steps': 30, 'max_steps': 100}. Best is trial 0 with value: 2.4468817710876465.



Trial 1  lr=4.44e-04  r=16  bs=1  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,2.946,2.370627
40,1.8413,1.841933
60,1.7459,1.805347
80,2.4615,2.176434
100,1.7233,1.769049


[I 2025-12-05 02:52:44,510] Trial 1 finished with value: 1.769048810005188 and parameters: {'lr': 0.00044447541666908124, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 4, 'lora_r': 16, 'lora_alpha': 16, 'warmup_steps': 20, 'max_steps': 100}. Best is trial 1 with value: 1.769048810005188.



Trial 2  lr=3.14e-05  r=16  bs=2  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,5.6769,4.629837
40,3.1107,2.957372
60,2.4154,2.48415
80,2.2418,2.385406
100,2.2933,2.357493


[I 2025-12-05 02:57:48,683] Trial 2 finished with value: 2.3574929237365723 and parameters: {'lr': 3.135775732257744e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 2, 'lora_r': 16, 'lora_alpha': 32, 'warmup_steps': 10, 'max_steps': 100}. Best is trial 1 with value: 1.769048810005188.



Trial 3  lr=4.09e-04  r=8  bs=1  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,2.5309,2.026032
40,1.7958,1.931374
60,1.6712,1.863159
80,2.0292,1.830683
100,1.9257,1.850263
120,1.4823,1.823584
140,1.7418,1.780307
160,1.5466,1.788646
180,1.5411,1.771007
200,1.6177,1.746769


[I 2025-12-05 03:11:40,319] Trial 3 finished with value: 1.703627347946167 and parameters: {'lr': 0.0004093813608598784, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 32, 'warmup_steps': 10, 'max_steps': 300}. Best is trial 3 with value: 1.703627347946167.



Trial 4  lr=2.75e-05  r=16  bs=1  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,7.4465,6.905418
40,3.829,3.352838
60,2.5693,2.536664
80,2.219,2.333265
100,2.1933,2.225766
120,2.1088,2.121711
140,2.2281,2.019534
160,1.8933,1.947654
180,1.9298,1.903644
200,2.0027,1.888188


[I 2025-12-05 03:25:31,592] Trial 4 finished with value: 1.8881875276565552 and parameters: {'lr': 2.7520696850790512e-05, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 4, 'lora_r': 16, 'lora_alpha': 32, 'warmup_steps': 30, 'max_steps': 200}. Best is trial 3 with value: 1.703627347946167.



Trial 5  lr=3.68e-04  r=8  bs=2  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,2.6004,2.185316
40,1.7209,1.814971
60,1.7529,1.778296
80,1.6445,1.733266
100,1.744,1.720058
120,1.779,1.698331
140,1.5646,1.719919
160,1.6709,1.676437
180,1.6394,1.66717
200,1.7073,1.662485


[I 2025-12-05 03:39:15,122] Trial 5 finished with value: 1.6624853610992432 and parameters: {'lr': 0.00036832964384234194, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4, 'lora_r': 8, 'lora_alpha': 16, 'warmup_steps': 10, 'max_steps': 200}. Best is trial 5 with value: 1.6624853610992432.



Trial 6  lr=1.74e-05  r=8  bs=1  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,7.9967,8.231638
40,6.1118,5.979332
60,3.9976,3.930169
80,3.495,3.269776
100,3.0751,2.901547
120,2.2462,2.608244
140,2.3646,2.492792
160,2.2295,2.433844
180,2.1257,2.396983
200,2.224,2.369366


[I 2025-12-05 03:51:47,948] Trial 6 finished with value: 2.3113327026367188 and parameters: {'lr': 1.7355056469855084e-05, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 16, 'warmup_steps': 30, 'max_steps': 300}. Best is trial 5 with value: 1.6624853610992432.



Trial 7  lr=1.34e-05  r=8  bs=1  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,7.9165,8.011992
40,5.5471,5.407283
60,3.4794,3.524139
80,3.3138,3.101362
100,2.9456,2.762188
120,2.1883,2.572223
140,2.356,2.492131
160,2.2339,2.450807
180,2.1441,2.428765
200,2.2638,2.423023


[I 2025-12-05 04:00:03,772] Trial 7 finished with value: 2.423023223876953 and parameters: {'lr': 1.3359790328445548e-05, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 32, 'warmup_steps': 30, 'max_steps': 200}. Best is trial 5 with value: 1.6624853610992432.



Trial 8  lr=3.22e-04  r=16  bs=1  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,2.8337,2.327387
40,1.8377,1.841142
60,1.7461,1.801525
80,1.6124,1.77213
100,1.6994,1.753417


[I 2025-12-05 04:06:37,355] Trial 8 finished with value: 1.7534172534942627 and parameters: {'lr': 0.0003216235469207422, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 4, 'lora_r': 16, 'lora_alpha': 32, 'warmup_steps': 20, 'max_steps': 100}. Best is trial 5 with value: 1.6624853610992432.



Trial 9  lr=1.53e-05  r=8  bs=2  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,679 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,6.971,6.243304
40,4.2664,3.924175
60,3.2751,3.237764
80,2.8248,2.973485
100,2.9041,2.874983


[I 2025-12-05 04:13:14,681] Trial 9 finished with value: 2.8749828338623047 and parameters: {'lr': 1.5251209898002918e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4, 'lora_r': 8, 'lora_alpha': 32, 'warmup_steps': 10, 'max_steps': 100}. Best is trial 5 with value: 1.6624853610992432.



===== BEST =====
value : 1.6624853610992432
params: {'lr': 0.00036832964384234194, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4, 'lora_r': 8, 'lora_alpha': 16, 'warmup_steps': 10, 'max_steps': 200}
