In [1]:
# torch is needed first: the next line reports how many CUDA devices are visible.
import torch
print(torch.cuda.device_count())
# NOTE(review): unsloth is imported ahead of transformers / peft. Unsloth patches
# those libraries when it is imported (see the "will patch" messages this cell
# prints), so this ordering looks intentional - confirm before reordering imports.
from unsloth import FastLanguageModel

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import pprint
import json
from pathlib import Path
import transformers
import os


1
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Training variant for this run, picked by index from the available choices:
#   index 0 -> "woall"
#   index 1 -> "ours"
_TRAIN_TYPE_CHOICES = ("woall", "ours")
train_type = _TRAIN_TYPE_CHOICES[1]
print(f"train_type: {train_type}")

train_type: ours


In [3]:
# Root directory of the dataset revision used by this notebook.
# Alternative revision (disabled): Path("../dataset/v5-250228-multimetadata")
BASE_DATASET_DIR = Path("../dataset/v6-250306-optimizetoken")
print(f"BASE_DATASET_DIR: {BASE_DATASET_DIR}")
# List the directory entries as a quick check that the path resolves.
print([*BASE_DATASET_DIR.iterdir()])

BASE_DATASET_DIR: ../dataset/v6-250306-optimizetoken
[PosixPath('../dataset/v6-250306-optimizetoken/prompt.txt'), PosixPath('../dataset/v6-250306-optimizetoken/process.ipynb'), PosixPath('../dataset/v6-250306-optimizetoken/scenario1'), PosixPath('../dataset/v6-250306-optimizetoken/scenario2'), PosixPath('../dataset/v6-250306-optimizetoken/scenario3')]


In [4]:
# ---- Model loading options ----
device = "cuda"
load_in_4bit = False      # 4-bit quantization would reduce memory usage; disabled here
dtype = None              # None -> let the loader auto-detect the dtype
max_seq_length = 0        # placeholder; recomputed later from the tokenized training set

# Choose the attention backend and half-precision dtype from the major CUDA
# compute capability of the current device: 8 and above gets flash attention
# with bfloat16, anything older falls back to eager attention with float16.
if torch.cuda.get_device_capability()[0] < 8:
    attn_implementation, torch_dtype = "eager", torch.float16
else:
    attn_implementation, torch_dtype = "flash_attention_2", torch.bfloat16
# attn_implementation = "eager"   # manual override, disabled
print(f"Using {attn_implementation} for attention computation.")
# Open question left by the author: QLoRA?

Using flash_attention_2 for attention computation.


In [5]:
import re

# current_metadata = json.load(open("metadata.json", "r"))


# Load the shared prompt text. Path.read_text closes the file handle right away,
# and the encoding is pinned to UTF-8 because the prompt contains non-ASCII text
# that must not depend on the machine's locale default.
common_prompt = (BASE_DATASET_DIR / "prompt.txt").read_text(encoding="utf-8")

if train_type in ["woall"]:
    # Ablation run: remove every section enclosed by a pair of <|Ours|> markers
    # (non-greedy, may span several lines), together with one preceding newline.
    common_prompt = re.sub(r"\n?<\|Ours\|>(.|\n)*?<\|Ours\|>", "", common_prompt)

# Strip all remaining <|...|> marker tags so only the plain prompt text is left.
common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)
print(common_prompt)

ÎÑàÎäî Ïú†Ï†ÄÏùò HVAC Í¥ÄÎ†® ÏßàÎ¨∏Ïóê ÎãµÎ≥ÄÌïòÎäî AgentÏùò Í≥ÑÌöçÏùÑ ÏÑ§Í≥ÑÌïòÎäî Ï†ïÌôïÌïòÍ≥† ÌõåÎ£°Ìïú Ïù∏Í≥µÏßÄÎä•Ïù¥Îã§. 
ÏÇ¨Ïö©ÏûêÏùò ÏßàÎ¨∏(Input)Í≥º MetadataÎ•º Í∏∞Î∞òÏúºÎ°ú
ÏÉùÍ∞ÅÏùÑ Í±∞Ïπú ÌõÑ
AgentÏùò ÏßÄÏãúÎ•º Ï∂úÎ†•Ìï¥ Ï£ºÏñ¥Ïïº ÌïúÎã§.

AgentÎäî ÏïÑÎûò DDL statementÎ°ú Íµ¨ÏÑ±Îêú databaseÏóê Ï†ëÍ∑ºÌïòÏó¨ ÏøºÎ¶¨Ìï† Ïàò ÏûàÏúºÎ©∞, 
ÎÑàÎäî "type"="q"Ïùò instructionÏúºÎ°ú ÏøºÎ¶¨Î•º Í≤∞Ï†ïÌï† Ïàò ÏûàÎã§.

<DDL statement>
CREATE TABLE IF NOT EXISTS data_t
(
    id integer NOT NULL DEFAULT nextval('data_t_id_seq'::regclass),
    idu_id integer,
    roomtemp double precision,
    settemp double precision,
    oper boolean,
    "timestamp" timestamp without time zone NOT NULL
)
    
CREATE TABLE IF NOT EXISTS idu_t
(
    id integer NOT NULL DEFAULT nextval('idu_t_id_seq'::regclass),
    name character varying(50) COLLATE pg_catalog."default",
    CONSTRAINT idu_t_pkey PRIMARY KEY (id)
)

Ï∂úÎ†• ÌòïÏãùÏùÄ jsonÌòïÏãùÏúºÎ°ú Ï∂úÎ†•ÌïòÎ©∞, eval() Ìï®ÏàòÎ•º ÏÇ¨Ïö©Ìï† Ïàò ÏûàÎèÑÎ°ù Í¥ÑÌò

In [6]:
# Identifier of the base checkpoint to fine-tune.
# Alternative checkpoints (disabled):
#   'Bllossom/llama-3.2-Korean-Bllossom-3B'
#   'Saxo/Linkbricks-Horizon-AI-Korean-Gemma-2-sft-dpo-27B'
model_id = 'sh2orc/Llama-3.1-Korean-8B-Instruct'

# Per-model working directory; every "/" of the id is replaced by "-" so the id
# becomes a single path component.
model_dir = "/workspace/model/" + model_id.replace('/', '-')

## Load tokenizer and dataset

In [7]:
# Load the base model together with its tokenizer through Unsloth.
#
# Options that are currently switched off (kept here as notes):
#   - max_seq_length is not passed to the loader
#   - no 4-bit loading; for the "27B" checkpoint an 8-bit BitsAndBytesConfig with
#     llm_int8_enable_fp32_cpu_offload was used instead
#   - device_map=device
#   - local_files_only=True
#   - pretrained_model.save_pretrained(model_dir) after loading
pretrained_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    dtype=dtype,
    attn_implementation=attn_implementation,
    cache_dir=f"{model_dir}/cache",
)

==((====))==  Unsloth 2025.3.5: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

sh2orc/Llama-3.1-Korean-8B-Instruct does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [8]:
# Pad sequences on the left-hand side.
tokenizer.padding_side = "left"
# tokenizer.truncation_side = "left"   # disabled

# Show which special tokens the tokenizer uses for padding and end-of-sequence.
print(
    f"Pad Token id: {tokenizer.pad_token_id} and Pad Token: {tokenizer.pad_token}"
)
print(
    f"EOS Token id: {tokenizer.eos_token_id} and EOS Token: {tokenizer.eos_token}"
)

Pad Token id: 128004 and Pad Token: <|finetune_right_pad_id|>
EOS Token id: 128009 and EOS Token: <|eot_id|>


In [9]:
# Scenario folders: sub-directories of the dataset root whose name contains
# "scenario" and which directly contain a file or entry named metadata.json.
scenario_dirs = [
    candidate
    for candidate in BASE_DATASET_DIR.iterdir()
    if candidate.is_dir()
    and "scenario" in candidate.name
    and any(child.name == "metadata.json" for child in candidate.iterdir())
]
print(scenario_dirs)

[PosixPath('../dataset/v6-250306-optimizetoken/scenario1'), PosixPath('../dataset/v6-250306-optimizetoken/scenario2'), PosixPath('../dataset/v6-250306-optimizetoken/scenario3')]


In [10]:

def read_dataset(dir, path):
    """Load one split of a scenario and attach the scenario metadata to each example.

    The split file is a JSON list of objects. Every element is turned into a
    flat record whose response is a JSON *string*, e.g.
    ``[{"Input": ..., "Response": {...}}]`` becomes
    ``[{"Metadata": ..., "Input": ..., "Response": "{...}"}]``.

    Args:
        dir: Scenario directory (path-like supporting ``/``); it must contain
            a ``metadata.json`` file.
        path: File name of the split, relative to ``dir``.

    Returns:
        A list of dicts with the keys ``"Metadata"`` (the parsed metadata.json,
        shared by all records), ``"Input"`` (copied unchanged) and ``"Response"``
        (the response object serialised with ``json.dumps(..., ensure_ascii=False)``).

    Notes:
        Reads the module-level ``train_type``. For ``"woall"`` one field is
        dropped from every response before serialising (presumably the model's
        reasoning text - confirm against the dataset). A response without that
        field is left unchanged instead of raising ``KeyError``.
    """
    # Both files are opened with an explicit encoding and inside ``with`` blocks
    # so the handles are closed deterministically.
    with open(dir / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)

    with open(dir / path, "r", encoding="utf-8") as f:
        data = json.load(f)

    drop_field = train_type in ["woall"]

    result = []
    for d in data:
        if drop_field:
            d["Response"].pop("ÏÉùÍ∞Å", None)

        result.append({
            "Metadata": metadata,
            "Input": d["Input"],
            "Response": json.dumps(d["Response"], ensure_ascii=False),
        })
    return result

# Collect the train / test records of every scenario into two flat lists.
dataset_trs, dataset_tss = [], []
for scenario_dir in scenario_dirs:
    dataset_trs += read_dataset(scenario_dir, "onlyq_tr.json")
    dataset_tss += read_dataset(scenario_dir, "onlyq_ts.json")

# Author's note (translated): scenarios with different metadata are merged here,
# and metadata entries that are mutually exclusive between them become None.
dataset_tr = Dataset.from_list(dataset_trs)
dataset_ts = Dataset.from_list(dataset_tss)

# Reset the sequence length; it is recomputed once the prompts are formatted.
max_seq_length = 0
def formatting_prompts_func(examples):
    """Render a batch of examples into chat-template text for supervised fine-tuning.

    Reads the module-level ``model_id``, ``common_prompt`` and ``tokenizer``.

    Args:
        examples: Batched mapping with the parallel columns ``"Metadata"``,
            ``"Input"`` and ``"Response"`` (one list entry per example).

    Returns:
        ``{"text": [...]}`` holding one rendered conversation per example, in
        input order.

    Raises:
        ValueError: If ``model_id`` contains neither "llama" nor "gemma".
            Previously such a model silently produced no conversations at all,
            so the returned column was shorter than the batch.
    """
    model_name = model_id.lower()
    is_llama = "llama" in model_name
    if not is_llama and "gemma" not in model_name:
        raise ValueError(
            f"Unsupported model family for model_id={model_id!r}: "
            "expected 'llama' or 'gemma' in the name."
        )

    convos = []
    # Each column is a list of values; walk the three columns in lockstep.
    for metadata, user_text, response in zip(examples['Metadata'], examples['Input'], examples['Response']):
        # str.replace returns a new string - the result has to be kept, otherwise
        # the 4-space indentation runs are never actually stripped.
        response = response.replace("    ", "")

        answer = {
            "content": f"{response}",
            "role": "assistant"
        }
        if is_llama:
            # Llama chat format: the shared prompt goes into a system turn.
            prompt = {
                "content": common_prompt,
                "role": "system"
            }
            user_input = {
                "content": f"Metadata:{metadata};Input:{user_text};",
                "role": "user"
            }
            convos.append([prompt, user_input, answer])
        else:
            # Gemma branch: no system turn is used; the shared prompt is
            # folded into the user turn instead.
            user_input = {
                "content": f"{common_prompt};{metadata};{user_text}",
                "role": "user"
            }
            convos.append([user_input, answer])

    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos]

    # Drop the "Cutting Knowledge Date: ... / Today Date: ..." preamble when the
    # rendered text contains one.
    texts = [re.sub(r'(\nCutting Knowledge Date:.*?\nToday Date:.*?\n\n)', '', text) for text in texts]

    return {"text": texts}

# Add the rendered "text" column to both splits.
dataset_tr = dataset_tr.map(formatting_prompts_func, batched=True)
dataset_ts = dataset_ts.map(formatting_prompts_func, batched=True)

# Sequence budget: the longest tokenized *training* text plus a margin of 10.
# NOTE(review): the test split is not measured here - confirm that none of its
# examples is longer than this budget.
max_seq_length = max(len(tokenizer.encode(row['text'])) for row in dataset_tr) + 10
print(max_seq_length)

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

958


In [11]:
# LoRA rank and alpha used for this run.
lora_r, lora_alpha = 128, 256
# Short run tag built from the dataset version prefix, the LoRA settings and the
# training variant.
lora_repr = f"v6_r{lora_r}_a{lora_alpha}_{train_type}"
print(lora_repr)

v6_r128_a256_ours


In [12]:


# Wrap the base model with LoRA adapters on the attention and MLP projections.
# "embed_tokens" and "lm_head" are deliberately not targeted.
peft_model = FastLanguageModel.get_peft_model(
    pretrained_model,
    r=lora_r,                    # LoRA rank (suggested values: 8, 16, 32, 64, 128)
    lora_alpha=lora_alpha,
    lora_dropout=0.05,           # any value works, but only 0 gets Unsloth's fast patching
    bias="none",                 # any value works, but "none" is the optimized setting
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",  # recommended for long-context tuning
    use_rslora=False,            # rank-stabilized LoRA switched off
    loftq_config=None,           # no LoftQ; plain fine-tuning
    random_state=3407,
    max_seq_length=max_seq_length,
)
# Remove the now-redundant name for the base model; use peft_model from here on.
del pretrained_model


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Training config

## Train

In [13]:
# Free cached GPU memory before the trainer is built.
torch.cuda.empty_cache()
# Size of the training split; the batch settings below are derived from it.
print(len(dataset_tr))


143


In [14]:
import numpy as np
import shutil

from trl import SFTTrainer


# Micro-batch size per device and the intended number of passes over the data.
per_device_train_batch_size, epochs = 75, 70
# Accumulate over all micro-batches of one epoch, so that a single optimizer
# update sees the whole training set.
# Author's note (translated): resource limits sometimes force a compromise on the
# batch size -> shrink the micro batch and raise the accumulation steps so the
# gradient is still computed over an adequate batch before the weight update.
# https://www.youtube.com/watch?v=ptlmj9Y9iwE
gradient_accumulation_steps = int(np.ceil(len(dataset_tr) / per_device_train_batch_size))
print(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")

# The Trainer counts max_steps / warmup / logging / eval / save intervals in
# optimizer updates, not in micro-batches. With the accumulation chosen above
# there is exactly one update per epoch, so these values must NOT be multiplied
# by gradient_accumulation_steps (doing so trained and evaluated at the wrong
# cadence whenever the accumulation was larger than 1).
steps_per_epoch = 1

# WARNING: destructive - removes every checkpoint of a previous run that used the
# same run tag, so the output directory starts empty.
shutil.rmtree(f"{model_dir}/chkpts/{lora_repr}", ignore_errors=True)

args = TrainingArguments(
    per_device_train_batch_size = per_device_train_batch_size,  # micro-batch size per device
    per_device_eval_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,  # simulates a larger batch
    max_steps = epochs * steps_per_epoch,       # total optimizer updates
    warmup_steps = steps_per_epoch,             # one epoch of warm-up
    learning_rate = 1e-4,
    # Use bf16 where the GPU supports it, otherwise fall back to fp16.
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,                        # regularization against overfitting
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir = f"{model_dir}/chkpts/{lora_repr}",  # checkpoints and predictions
    report_to = "none",                         # disables external loggers such as W&B
    logging_strategy = "steps",
    logging_steps = steps_per_epoch,            # log once per epoch
    eval_strategy = "steps",                    # evaluate during training (replaces the deprecated evaluation_strategy)
    eval_steps = steps_per_epoch,
    # eval_accumulation_steps=1,  # lower values reduce the memory used during eval
    save_strategy = "steps",
    save_steps = steps_per_epoch,               # must line up with eval_steps for load_best_model_at_end
    load_best_model_at_end = True,              # restore the best checkpoint when training ends
    save_only_model = False                     # checkpoints also keep optimizer / scheduler / RNG state
)

trainer = SFTTrainer(
    model = peft_model,
    processing_class = tokenizer,
    train_dataset = dataset_tr,
    eval_dataset = dataset_ts,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,        # packing could speed up training on short sequences; disabled here
    args = args,
    # compute_metrics = compute_metrics
)

Gradient Accumulation Steps: 1




Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["text"] (num_proc=2):   0%|          | 0/143 [00:00<?, ? examples/s]

Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["text"] (num_proc=2):   0%|          | 0/35 [00:00<?, ? examples/s]

In [15]:
from unsloth import unsloth_train

# Start training through Unsloth's training entry point and keep the statistics
# object it returns so it can be inspected afterwards.
trainer_stats = unsloth_train(trainer)
print(trainer_stats)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 143 | Num Epochs = 70 | Total steps = 70
O^O/ \_/ \    Batch size per device = 150 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (150 x 1 x 1) = 150
 "-____-"     Trainable parameters = 335,544,320/4,876,144,640 (6.88% trained)


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.62 GiB. GPU 0 has a total capacity of 79.10 GiB of which 2.94 GiB is free. Process 1693471 has 76.15 GiB memory in use. Of the allocated memory 75.30 GiB is allocated by PyTorch, and 139.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)