In [4]:
import torch
print(torch.cuda.device_count())
from unsloth import FastLanguageModel, unsloth_train

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import pprint
import json
from pathlib import Path
import transformers
import os


1
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [5]:
max_seq_length = 0     # Unsloth auto supports RoPE Scaling internally!
# dtype = None              # None for auto detection
load_in_4bit = False      # Use 4bit quantization to reduce memory usage. Can be False.
device = f"cuda"



if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16
# attn_implementation = "eager"
print(f"Using {attn_implementation} for attention computation.")
# QLora?

Using flash_attention_2 for attention computation.


## Load model

In [9]:
# model_id = 'sh2orc/Llama-3.1-Korean-8B-Instruct'
# model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'
# model_id = 'Saxo/Linkbricks-Horizon-AI-Korean-Gemma-2-sft-dpo-27B'
model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'

model_dir = f"/workspace/LGHVAC_2ndyear/model/{model_id.replace('/', '-')}"

In [2]:
# model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id=model_id, local_dir="70b")

In [None]:
# # Tokenizer initialization
# pretrained_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch_dtype,
#     cache_dir=f"{model_dir}/cache",
#     # attn_implementation=attn_implementation,
#     local_files_only=True,
#     device_map="cuda"
# )

# # tokenizer = AutoTokenizer.from_pretrained(
# #     model_id,
# #     cache_dir=f"{model_dir}/cache",
# #     local_files_only=True
# # )
# # if not os.path.exists(f"{model_dir}/config.json"):
# pretrained_model.save_pretrained(model_dir)

In [3]:
# Tokenizer initialization
pretrained_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,  
    # max_seq_length = max_seq_length,
    dtype = torch_dtype,
    load_in_4bit = False,
    load_in_8bit=False,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=False,
        # bnb_4bit_use_double_quant=True,
        # bnb_4bit_quant_type="nf4",
        # bnb_4bit_compute_dtype=torch_dtype
        load_in_8bit=False,
        # llm_int8_enable_fp32_cpu_offload=False if not "27B" in model_id else True,
    ),
    # device_map=device,
    cache_dir=f"{model_dir}/cache",
    attn_implementation=attn_implementation,
    # local_files_only=True
)

tokenizer.padding_side = "left"
# tokenizer.truncation_side = "left"
print(f"Pad Token id: {tokenizer.pad_token_id} and Pad Token: {tokenizer.pad_token}")
print(f"EOS Token id: {tokenizer.eos_token_id} and EOS Token: {tokenizer.eos_token}")

# if not os.path.exists(f"{model_dir}/config.json"):
pretrained_model.save_pretrained(model_dir)

NameError: name 'FastLanguageModel' is not defined

# Load dataset

In [None]:
train_type = [
    "woall", # 0
    "ours" # 1
][1]
print(f"train_type: {train_type}")

In [None]:
# BASE_DATASET_DIR = Path("../dataset/v5-250228-multimetadata")
# dataset_name = "v6-250306-optimizetoken"
dataset_name = "v7-250309-reduceinputanddatefunctioncall"
BASE_DATASET_DIR = Path(f"../dataset/{dataset_name}")
print(f"BASE_DATASET_DIR: {BASE_DATASET_DIR}")
print(list(BASE_DATASET_DIR.iterdir()))

In [None]:
import re

# current_metadata = json.load(open("metadata.json", "r"))


common_prompt = open(BASE_DATASET_DIR / f"prompt.txt", "r").read()

if train_type in ["woall"]:
    # search <|FI|>~~<|FI|> and remove between them
    common_prompt = re.sub(r"\n?<\|Ours\|>(.|\n)*?<\|Ours\|>", "", common_prompt)

# remove all <||>
common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)
print(common_prompt)

# print(common_prompt)

In [None]:
scenario_dirs = [d for d in BASE_DATASET_DIR.iterdir() if d.is_dir() and "scenario" in d.name and "metadata.json" in [f.name for f in d.iterdir()]]
print(scenario_dirs)

In [None]:

def read_dataset(dir, path):
    # the file is originally json-list format
    # we want every first-level elements to be a string itself
    # for example, [{"Hi": "a'b'"}, {"Hi": "c'd'"}] -> ["""{"Hi": "a'b'"}""", """{"Hi": "c'd'"}"""]
    
    metadata = json.load(open(dir / "metadata.json", "r"))

    path = dir / path
    with open(path, "r", encoding="utf-8") as f:
        data = json.loads(f.read())
    
    result = []
    for d in data:
        if "v6" in dataset_name:
            if train_type in ["woall"]:
                del d["Response"]["ÏÉùÍ∞Å"]
        
        
        result.append({"Metadata": metadata, "Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)})
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in data]
    # print(f"Read {len(result)} examples from {path}")
    # print(f"Type of result: {type(result)}")
    # print(f"Type of result[0]: {type(result[0])}")
    # print(f"Type of result[0]['Input']: {type(result[0]['Input'])}")
    # print(f"Type of result[0]['Response']: {type(result[0]['Response'])}")
    return result

dataset_trs = []
dataset_tss = []
for scenario_dir in scenario_dirs:
    dataset_trs.extend(read_dataset(scenario_dir, "onlyq_tr.json"))
    dataset_tss.extend(read_dataset(scenario_dir, "onlyq_ts.json"))

dataset_tr = Dataset.from_list(dataset_trs) # ÏÑúÎ°ú Îã§Î•∏ Î©îÌÉÄÎç∞Ïù¥ÌÑ∞ Ìï©ÏπòÎ©¥ÏÑú
dataset_ts = Dataset.from_list(dataset_tss) # Mutually exclusiveÌïú Ïï†Îì§ÏùÄ None Îê®

max_seq_length = 0
def formatting_prompts_func(examples):
    convos = []
    # Iterate through each item in the batch (examples are structured as lists of values)
    for metadata, input, response in zip(examples['Metadata'], examples['Input'], examples['Response']):
        # global max_seq_length
        response.replace("    ", "")

        # print(metadata['current_datetime'])
        # print(metadata['idu_mapping'])

        answer = {
            "content": f"{response}",
            "role": "assistant"
        }
        if "llama" in model_id.lower():
            prompt = {
                "content": common_prompt,
                "role": "system"
            }
            user_input = {
                "content": f"Metadata:{metadata};Input:{input};",
                "role": "user"
            }
            convos.append([prompt, user_input, answer])
        elif "gemma" in model_id.lower():
            user_input = {
                "content": f"{common_prompt};{metadata};{input}",
                "role": "user"
            }
            convos.append([user_input, answer])
        
        
        # print("Answer length: ", len(response))
        # convos.append([prompt, user_input, answer])
        
        # if len(response) + 50 > max_seq_length:
        #     max_seq_length = len(response) + len(metadata) + len(input) + 50
            # print(response)
    
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos]
    
    # remove \n\nCutting Knowledge Date: BLAH BLAH \nToday Date: BLAH BLAH\n\n using regex
    texts = [re.sub(r'(\nCutting Knowledge Date:.*?\nToday Date:.*?\n\n)', '', text) for text in texts]


    return {"text": texts}

dataset_tr = dataset_tr.map(formatting_prompts_func, batched=True)
dataset_ts = dataset_ts.map(formatting_prompts_func, batched=True)

max_seq_length = max([len(tokenizer.encode(dataset_tr[i]['text'])) for i in range(len(dataset_tr))]) + 10
# max_seq_length += len(common_prompt)
print(max_seq_length)
# print(f"seq length: {len(tokenizer.encode(dataset_tr[0]['text']))}")

In [None]:
lora_r = 16
lora_alpha = 32
lora_repr = f"v7_r{lora_r}_a{lora_alpha}_{train_type}"
print(lora_repr)

In [None]:


peft_model = FastLanguageModel.get_peft_model(
    pretrained_model,
    r=lora_r,   # LoRA rank - suggested values: 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    "gate_proj", "up_proj", "down_proj",
                    # "embed_tokens", 
                    # "lm_head"
                    ],
    lora_alpha=lora_alpha,
    lora_dropout=0.05,   # Supports any, but = 0 is optimized
    bias="none",      # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Ideal for long context tuning
    random_state=3407,
    use_rslora=False,   # Disable rank-sensitive LoRA for simpler tasks
    loftq_config=None,   # No LoftQ, for standard fine-tuning
    max_seq_length=max_seq_length,
)
del pretrained_model


## Training config

## Train

In [None]:
torch.cuda.empty_cache()
print(len(dataset_tr))


In [None]:
import numpy as np


per_device_train_batch_size, epochs = 50, 100 # 8
gradient_accumulation_steps = int(np.ceil(len(dataset_tr) / per_device_train_batch_size))
print(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")

# clear all checkpoints
import shutil
shutil.rmtree(f"{model_dir}/chkpts/{lora_repr}", ignore_errors=True)

args = TrainingArguments(
    # num_train_epochs = 1,
    per_device_train_batch_size = per_device_train_batch_size,  # Controls the batch size per device
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,  # Accumulates gradients to simulate a larger batch
    max_steps=gradient_accumulation_steps * epochs,
    # Î¶¨ÏÜåÏä§ Ï†úÏïΩÎïåÎ¨∏Ïóê batch sizeÎ•º ÌÉÄÌòëÌï¥ÏïºÌïòÎäî Í≤ΩÏö∞Í∞Ä Î∞úÏÉù -> micro batch sizeÎ•º Ï§ÑÏù¥Í≥†,
 	# accumulated stepÏùÑ ÎäòÎ†§, Ï†ÅÏ†àÌïú sizeÎ°ú gradientÎ•º Íµ¨Ìï¥ weight update
    # https://www.youtube.com/watch?v=ptlmj9Y9iwE
    warmup_steps = gradient_accumulation_steps,
    learning_rate = 1e-4,             # Sets the learning rate for optimization
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,              # Regularization term for preventing overfitting
    lr_scheduler_type = "cosine",  # Sets the learning rate scheduler
    seed = 3407,                        
    output_dir = f"{model_dir}/chkpts/{lora_repr}",  # Output directory for checkpoints and predictions     
    report_to = "none",              # Enables Weights & Biases (W&B) logging
    logging_steps = gradient_accumulation_steps,                # Sets frequency of logging to W&B
    logging_strategy = "steps",       # Logs metrics at each specified step
    evaluation_strategy="steps",  # enable evaluation during training
    eval_steps=gradient_accumulation_steps,
    # eval_accumulation_steps=1, # ÎÇÆÏùÑÏàòÎ°ù evalÏãú ÏÇ¨Ïö©ÌïòÎäî Î©îÎ™®Î¶¨ Ï§ÑÏñ¥Îì¶
    save_steps=gradient_accumulation_steps,
    save_strategy = "steps",               
    load_best_model_at_end = True,    # Loads the best model at the end
    save_only_model = False,           # Saves entire model, not only weights
    resume_from_checkpoint = f"{model_dir}/chkpts/{lora_repr}",  # Resumes training from a checkpoint
)

from trl import SFTTrainer

trainer = SFTTrainer(
    model = peft_model,
    processing_class = tokenizer,
    train_dataset = dataset_tr,
    eval_dataset = dataset_ts,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,        # Can make training 5x faster for short sequences.
    args = args,
    # compute_metrics = compute_metrics
)

In [None]:


trainer_stats = unsloth_train(trainer)
print(trainer_stats)


# Convert gguf

In [11]:
step = 100

lora_repr = "v7_r8_a16_ours_70B"
checkpoint_dir = f"{model_dir}/chkpts/{lora_repr}/checkpoint-{step}"
output_path = f"{model_dir}/gguf/{lora_repr}-checkpoint-{step}.gguf"
lora_output_dir = f"{model_dir}/lora_output/"

if not os.path.exists(f"{model_dir}/gguf"):
    os.makedirs(f"{model_dir}/gguf")
print(checkpoint_dir)

if not os.path.exists(lora_output_dir):
    os.makedirs(lora_output_dir)
print(checkpoint_dir)



/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/chkpts/v7_r8_a16_ours_70B/checkpoint-100
/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/chkpts/v7_r8_a16_ours_70B/checkpoint-100


In [12]:
if True:
    # model = AutoModelForCausalLM.from_pretrained(
    #     checkpoint_dir,
    #     torch_dtype=torch_dtype,
    #     cache_dir=f"{model_dir}/cache",
    #     # attn_implementation=attn_implementation,
    #     local_files_only=True,
    #     device_map="cuda"
    # )

    # tokenizer = AutoTokenizer.from_pretrained(
    #     checkpoint_dir,
    #     cache_dir=f"{model_dir}/cache",
    #     local_files_only=True
    # )
    
    model, tokenizer = FastLanguageModel.from_pretrained(
        checkpoint_dir,
        dtype = torch_dtype,
        attn_implementation=attn_implementation,
        load_in_4bit = False,
        load_in_8bit=True,
        cache_dir=f"{model_dir}/cache",
        local_files_only=True,
        device_map="cuda",
    )
    # FastLanguageModel.for_inference(model)

    tokenizer.padding_side = "left"

    # merge lora model and base pretrained model
    # model = model.merge_and_unload()


==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.216 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Unsloth 2025.3.9 patched 80 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [16]:

# model.save_pretrained_gguf(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}", tokenizer, quantization_method = "q8_0")
model.save_pretrained_merged(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}-8bit", tokenizer, save_method="merged_8bit")
# tokenizer.save_pretrained(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}")


Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...




KeyboardInterrupt: 

In [None]:
import os

# command = (
#     f"python ../../llama.cpp/convert_lora_to_gguf.py "
#     f"--base {model_dir} "              # Provide the base model config if needed
#     f"--outfile {output_path} "              # Output file for the GGUF model
#     f"--outtype f16 "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
#     f"--verbose "                            # Optional: increase logging output
#     f"{checkpoint_dir}"                      # Positional argument: path to the LoRA adapter files
# )

command = (
    f"python ../../llama.cpp/convert_hf_to_gguf.py "
    # f"--base {model_dir} "              # Provide the base model config if needed
    f"--outfile {output_path} "              # Output file for the GGUF model
    f"--outtype auto "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
    f"--verbose "                            # Optional: increase logging output
    f"{lora_output_dir}/{lora_repr}-checkpoint-{step}"                      # Positional argument: path to the LoRA adapter files
)


print("Running command:", command)
os.system(command)

Running command: python ../../llama.cpp/convert_hf_to_gguf.py --outfile /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100.gguf --outtype auto --verbose /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/lora_output//v7_r8_a16_ours_70B-checkpoint-100


INFO:hf-to-gguf:Loading model: v7_r8_a16_ours_70B-checkpoint-100
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00015.safetensors'
INFO:hf-to-gguf:choosing --outtype bf16 from first tensor type (torch.bfloat16)
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00015.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> BF16, shape = {8192, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {8192}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.int8 --> BF16, shape = {28672, 8192}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.int8 --> BF16, shape = {8192, 28672}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.int8 --> BF16, shape = {8192,

9

In [20]:
command = (
    f"../../llama.cpp/build/bin/llama-quantize "
    f"{output_path} "
    f"{output_path.replace('.gguf', '-Q4_K_M.gguf')} "                      # Positional argument: path to the LoRA adapter files
    f"Q4_K_M "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
)


print("Running command:", command)
os.system(command)

Running command: ../../llama.cpp/build/bin/llama-quantize /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100.gguf /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100-Q4_K_M.gguf Q4_K_M 


main: build = 4869 (2c9f833d)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100.gguf' to '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100-Q4_K_M.gguf' as Q4_K_M
llama_model_quantize: failed to quantize: tensor 'blk.14.ffn_up.weight' data is not within the file bounds, model is corrupted or incomplete
main: failed to quantize model from '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours_70B-checkpoint-100.gguf'


256