Use FastLanguageModel and incremental clean URL etc.

In [1]:
from unsloth import FastLanguageModel
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_value_
import time
from collections import defaultdict
from utility import TTTDataset_iter
from torch.utils.data import DataLoader
from utility import load_grouped_data
from peft import PeftModel
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    # model_name = "unsloth/Qwen3-8B-Base-unsloth-bnb-4bit",
    model_name = "unsloth/Qwen3-4B-Base-unsloth-bnb-4bit",
    # model_name = "unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit",
    # model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    # model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    # model_name="unsloth/gemma-3-12b-pt",
    # model_name="unsloth/gemma-3-4b-pt",
    max_seq_length = 8192, # Choose any for long context!
    # resize_model_vocab = 80999, 
    load_in_4bit = True,
)
# model.model.embed_tokens.load_state_dict({'weight':torch.load('Model/reduced_embedding.pt')})

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-16 10:16:53 [__init__.py:235] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.11: Fast Qwen3 patching. Transformers: 4.54.1. vLLM: 0.10.0.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.635 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
# save the lm_head for yes and no
index = [tokenizer.encode(" No")[0], tokenizer.encode(" Yes")[0]]
print(index)
torch.save(model.lm_head.weight[index], './Model/Gwen_4B_lm_head.pth')
# lm_head_weight = nn.Parameter(torch.load('./Model/Gwen1_7B_lm_head.pth').T)
lm_head_weight = nn.Parameter(torch.load('Model/Gwen4B_lm_head.pth').T)
# lm_head_weight = nn.Parameter(torch.load('Model/lm_head_weight.pth'))

# # lm_head_weight = nn.Parameter(torch.load('./Model/Gwen8B_lm_head.pth').T)
lm_head_weight.requires_grad_(True);

[2308, 7414]


In [3]:
train_data, holdout_data = load_grouped_data()
# old_to_new = torch.load("Model/vocab_mapping.pt")
dataloader = DataLoader(
    TTTDataset_iter(train_data, holdout_data, tokenizer, None, samples_per_epoch=2000),
    batch_size=1,
    collate_fn=lambda x: x[0]
)

# for test_idx_info, input_ids, vi_index, labels in dataloader:
#     break

#### Fine-tune lm_head

In [4]:
epochs = 10
accumulation_steps = 64
lr = 1e-4
clip = 1e-2
label_smoothing = 0.1

In [5]:
trainable_params = [lm_head_weight]
optimizer = torch.optim.Adam(trainable_params,lr = lr)
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing, reduction='none')
print(len(trainable_params))

1


In [None]:
start_time = time.time()
train_loss_accum = 0
val_loss_accum = 0
prob_list = defaultdict(list)
for epoch in range(epochs):
    for i, (test_idx_info, input_ids, vi_index, labels) in enumerate(dataloader):
        with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
            input_ids, vi_index, labels = input_ids.to('cuda'), vi_index.to('cuda'), labels.to('cuda')
            with torch.no_grad(): # as we are training the lm_head only.
                output = model.model(input_ids)
            logits = output.last_hidden_state[0, vi_index] @ lm_head_weight # (# of Violation, 4096) @ (4096, 2) -> (# of Violation, 2)
            loss = loss_fn(logits, labels) # first token is used for training
            train_loss = loss[0] / accumulation_steps
            train_loss.backward()

            # tracking the loss
            train_loss_accum += train_loss.item()
            val_loss_accum += loss[1].item()
            # TODO: track the probability of the test example in nested list
            if (i + 1) % accumulation_steps == 0:
                clip_grad_value_(trainable_params,clip)
                optimizer.step()
                optimizer.zero_grad()
    print(f"Epoch {epoch} train_loss: {train_loss_accum * accumulation_steps / (i+1)}, val_loss: {val_loss_accum / (i+1)}")
    train_loss_accum = 0
    val_loss_accum = 0
print(f"Time taken: {(time.time() - start_time)/60} minutes")

In [7]:
torch.save(lm_head_weight, 'Model/lm_head_weight.pth')

#### LORA

In [7]:
epochs = 10
accumulation_steps = 64
lr = 2e-5
clip = 2e-3
label_smoothing = 0.1

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
lm_head_weight.requires_grad_(True);
trainable_params = [param for param in model.parameters() if param.requires_grad]
trainable_params.append(lm_head_weight)
optimizer = torch.optim.Adam(trainable_params, lr = lr) 
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing, reduction='none')
print(len(trainable_params))

Unsloth: Making `model.base_model.model.model` require gradients
505


In [10]:
start_time = time.time()
train_loss_accum = 0
val_loss_accum = 0
prob_list = defaultdict(list)
for epoch in range(epochs):
    for i, (test_idx_info, input_ids, vi_index, labels) in enumerate(dataloader):
        with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
            input_ids, vi_index, labels = input_ids.to('cuda'), vi_index.to('cuda'), labels.to('cuda')
            output = model.base_model.model.model(input_ids)
            logits = output.last_hidden_state[0, vi_index] @ lm_head_weight # (# of Violation, 4096) @ (4096, 2) -> (# of Violation, 2)
            loss = loss_fn(logits, labels) # first token is used for training
            train_loss = loss[0] / accumulation_steps
            train_loss.backward()

            # tracking the loss
            train_loss_accum += train_loss.item()
            val_loss_accum += loss[1].item()
            # TODO: track the probability of the test example in nested list
            if (i + 1) % accumulation_steps == 0:
                clip_grad_value_(trainable_params,clip)
                optimizer.step()
                optimizer.zero_grad()
    print(f"Epoch {epoch} train_loss: {train_loss_accum * accumulation_steps / (i+1)}, val_loss: {val_loss_accum / (i+1)}")
    train_loss_accum = 0
    val_loss_accum = 0
print(f"Time taken: {(time.time() - start_time)/60} minutes")

Epoch 0 train_loss: 0.50803857421875, val_loss: 0.54365478515625
Epoch 1 train_loss: 0.4759853515625, val_loss: 0.49885546875
Epoch 2 train_loss: 0.44575244140625, val_loss: 0.46722314453125
Epoch 3 train_loss: 0.41596630859375, val_loss: 0.46640234375
Epoch 4 train_loss: 0.39271630859375, val_loss: 0.4431669921875
Epoch 5 train_loss: 0.37871630859375, val_loss: 0.4225458984375
Epoch 6 train_loss: 0.3790576171875, val_loss: 0.4259345703125
Epoch 7 train_loss: 0.35674755859375, val_loss: 0.40379736328125
Epoch 8 train_loss: 0.34932080078125, val_loss: 0.4596162109375
Epoch 9 train_loss: 0.3409609375, val_loss: 0.41560888671875
Time taken: 44.176959780852 minutes


In [13]:
start_time = time.time()
train_loss_accum = 0
val_loss_accum = 0
prob_list = defaultdict(list)
for epoch in range(epochs):
    for i, (test_idx_info, input_ids, vi_index, labels) in enumerate(dataloader):
        with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
            input_ids, vi_index, labels = input_ids.to('cuda'), vi_index.to('cuda'), labels.to('cuda')
            output = model.base_model.model.model(input_ids)
            logits = output.last_hidden_state[0, vi_index] @ lm_head_weight # (# of Violation, 4096) @ (4096, 2) -> (# of Violation, 2)
            loss = loss_fn(logits, labels) # first token is used for training
            train_loss = loss[0] / accumulation_steps
            train_loss.backward()

            # tracking the loss
            train_loss_accum += train_loss.item()
            val_loss_accum += loss[1].item()
            # TODO: track the probability of the test example in nested list
            if (i + 1) % accumulation_steps == 0:
                clip_grad_value_(trainable_params,clip)
                optimizer.step()
                optimizer.zero_grad()
    print(f"Epoch {epoch} train_loss: {train_loss_accum * accumulation_steps / (i+1)}, val_loss: {val_loss_accum / (i+1)}")
    train_loss_accum = 0
    val_loss_accum = 0
print(f"Time taken: {(time.time() - start_time)/60} minutes")

Epoch 0 train_loss: 0.34118310546875, val_loss: 0.38859033203125
Epoch 1 train_loss: 0.33535400390625, val_loss: 0.39411572265625
Epoch 2 train_loss: 0.3332578125, val_loss: 0.3985576171875
Epoch 3 train_loss: 0.31943701171875, val_loss: 0.37522705078125
Epoch 4 train_loss: 0.31520947265625, val_loss: 0.38720166015625
Epoch 5 train_loss: 0.3197294921875, val_loss: 0.403220703125
Epoch 6 train_loss: 0.327759765625, val_loss: 0.42085986328125
Epoch 7 train_loss: 0.3104892578125, val_loss: 0.393880859375
Epoch 8 train_loss: 0.31201025390625, val_loss: 0.41192041015625
Epoch 9 train_loss: 0.3086748046875, val_loss: 0.38175048828125
Time taken: 47.241845679283145 minutes


Reduce learning rate by 4 and use torch.float16 to be consistent with Kaggle.

In [None]:
for param_group in optimizer.param_groups:
    param_group['lr'] = lr / 4

In [None]:
start_time = time.time()
train_loss_accum = 0
val_loss_accum = 0
prob_list = defaultdict(list)
for epoch in range(5):
    for i, (test_idx_info, input_ids, vi_index, labels) in enumerate(dataloader):
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            input_ids, vi_index, labels = input_ids.to('cuda'), vi_index.to('cuda'), labels.to('cuda')
            output = model.base_model.model.model(input_ids)
            logits = output.last_hidden_state[0, vi_index] @ lm_head_weight # (# of Violation, 4096) @ (4096, 2) -> (# of Violation, 2)
            loss = loss_fn(logits, labels) # first token is used for training
            train_loss = loss[0] / accumulation_steps
            train_loss.backward()

            # tracking the loss
            train_loss_accum += train_loss.item()
            val_loss_accum += loss[1].item()
            # TODO: track the probability of the test example in nested list
            if (i + 1) % accumulation_steps == 0:
                clip_grad_value_(trainable_params,clip)
                optimizer.step()
                optimizer.zero_grad()
    print(f"Epoch {epoch} train_loss: {train_loss_accum * accumulation_steps / (i+1)}, val_loss: {val_loss_accum / (i+1)}")
    train_loss_accum = 0
    val_loss_accum = 0
print(f"Time taken: {(time.time() - start_time)/60} minutes")

In [11]:
model.save_pretrained("Model/merged_model4b")
torch.save(lm_head_weight, 'Model/lm_head_weight.pth')

In [5]:
# continue training
model = PeftModel.from_pretrained(model, "Model/merged_model4b", is_trainable=True)