[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/timz815/360-NLP-Project/blob/main/optuna.ipynb)

In [1]:
import os, shutil, tempfile

# Remove compiled artifacts left behind by earlier runs.
stale_cache_dir = os.path.expanduser("~/unsloth_compiled_cache")
shutil.rmtree(stale_cache_dir, ignore_errors=True)

# Fresh scratch directory shared by TMPDIR and the Inductor cache.
scratch_dir = tempfile.mkdtemp(prefix="torchinductor_")

os.environ.update(
    {
        # give Inductor a brand-new temp directory
        "TMPDIR": scratch_dir,
        "TORCHINDUCTOR_CACHE_DIR": scratch_dir,
        # disable the dynamo compile that crashes on missing tmp files
        "TORCH_COMPILE_DISABLE": "1",
        # Unsloth settings
        "TOKENIZERS_PARALLELISM": "false",
        "UNSLOTH_DISABLE_FUSED_LOSS": "1",
        "UNSLOTH_FREE_GB": "4",
        "UNSLOTH_DISABLE_COMPILED_CACHE": "1",
    }
)

In [2]:
import torch, gc, optuna
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W1205 16:06:41.258000 6264 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.



🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere (e.g. via the Colab badge) until this is pointed at a local copy.
jsonl_path = r"C:\Users\timot\Downloads\nlp training\movie_dialogue.jsonl"

# Read inside a context manager so the file handle is closed deterministically,
# and skip blank lines (e.g. a trailing newline) that json.loads would reject.
with open(jsonl_path, encoding="utf-8") as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]
ds = Dataset.from_list(data)

# First split: hold out 20% of all rows as the test set.
splits = ds.train_test_split(test_size=0.2, seed=42)
test_ds = splits["test"]
train_val = splits["train"]

# Second split: 12.5% of the remaining 80% (i.e. 10% of all rows) becomes validation.
val_splits = train_val.train_test_split(test_size=0.125, seed=42)
train_ds = val_splits["train"]
val_ds   = val_splits["test"]

print("Train:", len(train_ds))
print("Val:",   len(val_ds))
print("Test:",  len(test_ds))

Train: 6608
Val: 944
Test: 1889


In [4]:
# 4-bit checkpoint that every Optuna trial below fine-tunes.
model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit"

# Loader options gathered in one place; device_map {"": 0} maps the whole model to device 0.
load_options = dict(
    max_seq_length=2048,
    load_in_4bit=True,
    device_map={"": 0},
)
base_model, tokenizer = FastLanguageModel.from_pretrained(model_name, **load_options)

# Report how much VRAM is allocated right after loading, in GiB.
vram_gib = torch.cuda.memory_allocated() / 1024**3
print("Base model in VRAM:", vram_gib, "GB")

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE_TORCH}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.11.4: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model in VRAM: 3.3413352966308594 GB


In [5]:
def format_example(ex, tokenizer):
    """Render one example as a single chat-formatted string.

    The value under ``ex["chinese"]`` becomes the user turn and the value
    under ``ex["english"]`` becomes the assistant turn. The two turns are
    passed to ``tokenizer.apply_chat_template`` with tokenization disabled
    and no generation prompt appended.

    Args:
        ex: mapping with ``"chinese"`` and ``"english"`` entries.
        tokenizer: object exposing ``apply_chat_template``.

    Returns:
        ``{"text": <rendered string>}``.
    """
    user_turn = {"role": "user", "content": ex["chinese"]}
    assistant_turn = {"role": "assistant", "content": ex["english"]}
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

def tokenize_example(ex, tokenizer, max_length=2048):
    """Tokenize ``ex["text"]`` with truncation and keep only the model inputs.

    Args:
        ex: mapping with a ``"text"`` entry.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True, max_length=...)``.
        max_length: truncation limit forwarded to the tokenizer.

    Returns:
        Dict containing the tokenizer's ``"input_ids"`` and ``"attention_mask"``.
    """
    encoded = tokenizer(ex["text"], truncation=True, max_length=max_length)
    kept_fields = ("input_ids", "attention_mask")
    return {field: encoded[field] for field in kept_fields}

# Both mapping functions take the tokenizer as a keyword argument.
tokenizer_kwargs = {"tokenizer": tokenizer}

# Train split: build the chat text first, then tokenize it.
train_ds = train_ds.map(format_example, fn_kwargs=tokenizer_kwargs)
train_ds = train_ds.map(tokenize_example, fn_kwargs=tokenizer_kwargs)

# Validation split: same two steps, in the same order.
val_ds = val_ds.map(format_example, fn_kwargs=tokenizer_kwargs)
val_ds = val_ds.map(tokenize_example, fn_kwargs=tokenizer_kwargs)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6608/6608 [00:01<00:00, 3745.01 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6608/6608 [00:01<00:00, 3983.72 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 944/944 [00:00<00:00, 3037.80 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 944/944 [00:00<00:00, 4765.45 examples/s]


In [6]:
def objective(trial):
    """Optuna objective: run one short LoRA fine-tune and return its validation loss.

    Samples the learning rate, per-device batch size, gradient-accumulation
    steps, LoRA rank and LoRA alpha from ``trial``, trains for a fixed 100
    steps on ``train_ds`` and evaluates on ``val_ds``.

    Relies on the notebook-level globals ``base_model``, ``tokenizer``,
    ``train_ds`` and ``val_ds``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The ``"eval_loss"`` entry of ``trainer.evaluate()``, or ``9999.0`` when
        that key is absent from the returned metrics.
    """
    lr   = trial.suggest_float("lr", 3e-4, 5e-4, log=True)          # focus on best range
    bs   = trial.suggest_categorical("per_device_train_batch_size", [1, 2])
    ga   = trial.suggest_categorical("gradient_accumulation_steps", [2, 4])
    r    = trial.suggest_categorical("lora_r", [8, 16])
    alpha = trial.suggest_categorical("lora_alpha", [16, 32])
    # fixed small warm-up / steps
    warmup = 10
    max_steps = 100

    print(f"\nTrial {trial.number}  lr={lr:.2e}  r={r}  bs={bs}  ga={ga}")

    # Bind the names up front so the cleanup in `finally` is valid even when
    # adapter creation or trainer construction raises.
    model = None
    trainer = None
    try:
        # NOTE(review): every trial attaches its adapter to the same shared
        # `base_model` object; this presumably relies on the adapter being
        # re-created on each call -- confirm against the PEFT/Unsloth docs.
        model = FastLanguageModel.get_peft_model(
            base_model,
            r=r, lora_alpha=alpha,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_dropout=0.0, bias="none",
            use_gradient_checkpointing=True,
        )

        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        use_bf16 = torch.cuda.is_bf16_supported()
        args = SFTConfig(
            per_device_train_batch_size=bs,
            gradient_accumulation_steps=ga,
            warmup_steps=warmup,
            max_steps=max_steps,
            learning_rate=lr,
            logging_steps=20,
            output_dir=f"./fast_trial_{trial.number}",
            optim="adamw_8bit",
            fp16=not use_bf16,
            bf16=use_bf16,
            eval_strategy="steps",
            eval_steps=20,
            save_strategy="no",
            load_best_model_at_end=False,
            report_to="none",
        )

        trainer = SFTTrainer(
            model=model,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=tokenizer,
            args=args,
        )
        trainer.train()
        loss = trainer.evaluate().get("eval_loss", 9999.0)
        return loss
    finally:
        # Always drop the trial's references and release cached GPU memory,
        # including when training fails (e.g. out of memory), so that a
        # failed trial does not keep VRAM pinned for the following ones.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

In [7]:
# Seeded TPE sampler so the sequence of suggested hyperparameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=42)

# Lower validation loss is better, hence "minimize".
study = optuna.create_study(direction="minimize", sampler=tpe_sampler)
study.optimize(objective, n_trials=6, gc_after_trial=True)

# Summarise the winning trial.
print("\n===== BEST =====")
print("value :", study.best_value)
print("params:", study.best_params)

[I 2025-12-05 16:07:39,059] A new study created in memory with name: no-name-0f155d7c-420b-4df8-ac2a-b8ee915a6865



Trial 0  lr=3.63e-04  r=8  bs=1  ga=2


Unsloth 2025.11.4 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,4.6537,2.22031
40,1.9148,1.93526
60,1.6227,1.743312
80,1.6066,1.73535
100,1.9093,1.708796


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


[I 2025-12-05 16:14:39,871] Trial 0 finished with value: 1.7087962627410889 and parameters: {'lr': 0.00036325576193223686, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 16}. Best is trial 0 with value: 1.7087962627410889.



Trial 1  lr=4.31e-04  r=16  bs=2  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,3.984,1.829086
40,1.7054,1.765684
60,1.8022,1.714928
80,1.6444,1.685344
100,1.7314,1.656623


[I 2025-12-05 16:21:21,998] Trial 1 finished with value: 1.656623363494873 and parameters: {'lr': 0.00043073114022763095, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 2, 'lora_r': 16, 'lora_alpha': 32}. Best is trial 1 with value: 1.656623363494873.



Trial 2  lr=3.74e-04  r=16  bs=2  ga=4


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,4.3793,2.142414
40,1.8355,1.726653
60,1.7197,1.667916
80,1.6834,1.645233
100,1.6854,1.636911


[I 2025-12-05 16:30:25,211] Trial 2 finished with value: 1.6369105577468872 and parameters: {'lr': 0.00037406555329177457, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4, 'lora_r': 16, 'lora_alpha': 16}. Best is trial 2 with value: 1.6369105577468872.



Trial 3  lr=3.90e-04  r=16  bs=1  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Step,Training Loss,Validation Loss
20,4.6164,2.196567
40,1.9037,1.797203
60,1.6058,1.743973
80,1.6059,1.737537
100,1.8917,1.702291


[I 2025-12-05 16:37:09,690] Trial 3 finished with value: 1.7022908926010132 and parameters: {'lr': 0.0003901247666389379, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 2, 'lora_r': 16, 'lora_alpha': 16}. Best is trial 2 with value: 1.6369105577468872.



Trial 4  lr=3.51e-04  r=8  bs=2  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,4.5939,2.200164
40,1.7994,1.760895
60,1.7863,1.71644
80,1.6389,1.693129
100,1.7387,1.673455


[I 2025-12-05 16:43:39,696] Trial 4 finished with value: 1.6734552383422852 and parameters: {'lr': 0.00035050921365405055, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 16}. Best is trial 2 with value: 1.6369105577468872.



Trial 5  lr=4.21e-04  r=8  bs=2  ga=2


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,608 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 16,515,072 of 4,038,983,168 (0.41% trained)


Step,Training Loss,Validation Loss
20,4.4098,2.115174
40,1.7453,1.755527
60,1.7836,1.707507
80,1.6338,1.683467
100,1.7336,1.663672


[I 2025-12-05 16:50:11,547] Trial 5 finished with value: 1.6636719703674316 and parameters: {'lr': 0.0004208244796565418, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 2, 'lora_r': 8, 'lora_alpha': 16}. Best is trial 2 with value: 1.6369105577468872.



===== BEST =====
value : 1.6369105577468872
params: {'lr': 0.00037406555329177457, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4, 'lora_r': 16, 'lora_alpha': 16}
