In [3]:
import torch
print(torch.cuda.device_count())
from unsloth import FastLanguageModel, unsloth_train

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import pprint
import json
from pathlib import Path
import transformers
import os
import numpy as np

1
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [4]:
max_seq_length = 0     # Unsloth auto supports RoPE Scaling internally!
# dtype = None              # None for auto detection
load_in_4bit = False      # Use 4bit quantization to reduce memory usage. Can be False.
device = f"cuda"



if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16
# attn_implementation = "eager"
print(f"Using {attn_implementation} for attention computation.")
# QLora?

Using flash_attention_2 for attention computation.


## Load model

In [5]:
# model_id = 'sh2orc/Llama-3.1-Korean-8B-Instruct'
# model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'
# model_id = 'Saxo/Linkbricks-Horizon-AI-Korean-Gemma-2-sft-dpo-27B'
model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'

model_dir = f"/workspace/LGHVAC_2ndyear/model/{model_id.replace('/', '-')}"

In [1]:
# model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id=model_id, local_dir="70b")

In [None]:
# # Tokenizer initialization
# pretrained_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch_dtype,
#     cache_dir=f"{model_dir}/cache",
#     # attn_implementation=attn_implementation,
#     local_files_only=True,
#     device_map="cuda"
# )

# # tokenizer = AutoTokenizer.from_pretrained(
# #     model_id,
# #     cache_dir=f"{model_dir}/cache",
# #     local_files_only=True
# # )
# # if not os.path.exists(f"{model_dir}/config.json"):
# pretrained_model.save_pretrained(model_dir)

In [6]:
# Tokenizer initialization
pretrained_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,  
    # max_seq_length = max_seq_length,
    dtype = torch_dtype,
    load_in_4bit = False,
    load_in_8bit = True,
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=False,
    #     # bnb_4bit_use_double_quant=True,
    #     # bnb_4bit_quant_type="nf4",
    #     # bnb_4bit_compute_dtype=torch_dtype
    #     load_in_8bit=True,
    #     # llm_int8_enable_fp32_cpu_offload=False if not "27B" in model_id else True,
    # ),
    # device_map=device,
    cache_dir=f"{model_dir}/cache",
    attn_implementation=attn_implementation,
    # local_files_only=True
)

tokenizer.padding_side = "left"
# tokenizer.truncation_side = "left"
print(f"Pad Token id: {tokenizer.pad_token_id} and Pad Token: {tokenizer.pad_token}")
print(f"EOS Token id: {tokenizer.eos_token_id} and EOS Token: {tokenizer.eos_token}")

if not os.path.exists(f"{model_dir}/config.json"):
    pretrained_model.save_pretrained(model_dir)

==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.216 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Bllossom/llama-3-Korean-Bllossom-70B does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.
Pad Token id: 128255 and Pad Token: <|reserved_special_token_250|>
EOS Token id: 128009 and EOS Token: <|eot_id|>


# Load dataset

In [7]:
train_type = [
    "woall", # 0
    "ours" # 1
][1]
print(f"train_type: {train_type}")

train_type: ours


In [8]:
# BASE_DATASET_DIR = Path("../dataset/v5-250228-multimetadata")
# dataset_name = "v6-250306-optimizetoken"
dataset_name = "v7-250309-reduceinputanddatefunctioncall"
BASE_DATASET_DIR = Path(f"../dataset/{dataset_name}")
print(f"BASE_DATASET_DIR: {BASE_DATASET_DIR}")
print(list(BASE_DATASET_DIR.iterdir()))

BASE_DATASET_DIR: ../dataset/v7-250309-reduceinputanddatefunctioncall
[PosixPath('../dataset/v7-250309-reduceinputanddatefunctioncall/process.ipynb'), PosixPath('../dataset/v7-250309-reduceinputanddatefunctioncall/prompt.txt'), PosixPath('../dataset/v7-250309-reduceinputanddatefunctioncall/scenario1')]


In [9]:
import re

# current_metadata = json.load(open("metadata.json", "r"))


common_prompt = open(BASE_DATASET_DIR / f"prompt.txt", "r").read()

if train_type in ["woall"]:
    # search <|FI|>~~<|FI|> and remove between them
    common_prompt = re.sub(r"\n?<\|Ours\|>(.|\n)*?<\|Ours\|>", "", common_prompt)

# remove all <||>
common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)
print(common_prompt)

# print(common_prompt)

ÎÑàÎäî Ïú†Ï†ÄÏùò HVAC Í¥ÄÎ†® ÏßàÎ¨∏Ïóê ÎãµÎ≥ÄÌïòÎäî AgentÏùò Í≥ÑÌöçÏùÑ ÏÑ§Í≥ÑÌïòÎäî Ï†ïÌôïÌïòÍ≥† ÌõåÎ£°Ìïú Ïù∏Í≥µÏßÄÎä•Ïù¥Îã§. 
ÏÇ¨Ïö©ÏûêÏùò ÏßàÎ¨∏(Input)ÏùÑ Î∞õÏïÑ AgentÏùò InstructionsÎ•º Ï∂úÎ†•Ìï¥ Ï£ºÏñ¥Ïïº ÌïúÎã§.
ThinkingÏóêÏÑúÎäî MetadataÎ•º Î∞îÌÉïÏúºÎ°ú InputÏùÑ specificÌïú termÏúºÎ°ú ÏàòÏ†ïÌïòÍ≥†, Ïú†Ï†ÄÏùò ÏùòÎèÑÎ•º Ï∂îÏ∏°ÌïòÎ©∞, ÎãµÎ≥Ä Í≥ÑÌöçÏùÑ ÏÑ∏ÏõåÏïºÌï®.
ExpectationÏóêÏÑúÎäî Ïú†Ï†ÄÍ∞Ä Í∏∞ÎåÄÌï†ÎßåÌïú ÎãµÎ≥ÄÏùÑ Ï∂îÏ∏°Ìï¥ÏïºÌï®.

ÎÑàÎäî 'type'='q'Ïùò instructionÏúºÎ°ú ÏøºÎ¶¨Î•º Í≤∞Ï†ïÌï† Ïàò ÏûàÎã§.

Ï∂úÎ†• ÌòïÏãùÏùÄ jsonÌòïÏãùÏúºÎ°ú Ï∂úÎ†•ÌïòÎ©∞, eval() Ìï®ÏàòÎ•º ÏÇ¨Ïö©Ìï† Ïàò ÏûàÎèÑÎ°ù Í¥ÑÌò∏Îì§Í≥º Îî∞Ïò¥ÌëúÎì§Ïùò ÏàúÏÑúÏôÄ Îã´ÌûòÏùÑ Îß§Ïö∞ Ïã†Í≤ΩÏç®ÏÑú Ï∂úÎ†•Ìï¥ÏïºÌïúÎã§.


In [10]:
scenario_dirs = [d for d in BASE_DATASET_DIR.iterdir() if d.is_dir() and "scenario" in d.name and "metadata.json" in [f.name for f in d.iterdir()]]
print(scenario_dirs)

[PosixPath('../dataset/v7-250309-reduceinputanddatefunctioncall/scenario1')]


In [11]:

def read_dataset(dir, path):
    # the file is originally json-list format
    # we want every first-level elements to be a string itself
    # for example, [{"Hi": "a'b'"}, {"Hi": "c'd'"}] -> ["""{"Hi": "a'b'"}""", """{"Hi": "c'd'"}"""]
    
    metadata = json.load(open(dir / "metadata.json", "r"))

    path = dir / path
    with open(path, "r", encoding="utf-8") as f:
        data = json.loads(f.read())
    
    result = []
    for d in data:
        if "v6" in dataset_name:
            if train_type in ["woall"]:
                del d["Response"]["ÏÉùÍ∞Å"]
        
        
        result.append({"Metadata": metadata, "Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)})
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in data]
    # print(f"Read {len(result)} examples from {path}")
    # print(f"Type of result: {type(result)}")
    # print(f"Type of result[0]: {type(result[0])}")
    # print(f"Type of result[0]['Input']: {type(result[0]['Input'])}")
    # print(f"Type of result[0]['Response']: {type(result[0]['Response'])}")
    return result

dataset_trs = []
dataset_tss = []
for scenario_dir in scenario_dirs:
    dataset_trs.extend(read_dataset(scenario_dir, "onlyq_tr.json"))
    dataset_tss.extend(read_dataset(scenario_dir, "onlyq_ts.json"))

dataset_tr = Dataset.from_list(dataset_trs) # ÏÑúÎ°ú Îã§Î•∏ Î©îÌÉÄÎç∞Ïù¥ÌÑ∞ Ìï©ÏπòÎ©¥ÏÑú
dataset_ts = Dataset.from_list(dataset_tss) # Mutually exclusiveÌïú Ïï†Îì§ÏùÄ None Îê®

max_seq_length = 0
def formatting_prompts_func(examples):
    convos = []
    # Iterate through each item in the batch (examples are structured as lists of values)
    for metadata, input, response in zip(examples['Metadata'], examples['Input'], examples['Response']):
        # global max_seq_length
        response.replace("    ", "")

        # print(metadata['current_datetime'])
        # print(metadata['idu_mapping'])

        answer = {
            "content": f"{response}",
            "role": "assistant"
        }
        if "llama" in model_id.lower():
            prompt = {
                "content": common_prompt,
                "role": "system"
            }
            user_input = {
                "content": f"Metadata:{metadata};Input:{input};",
                "role": "user"
            }
            convos.append([prompt, user_input, answer])
        elif "gemma" in model_id.lower():
            user_input = {
                "content": f"{common_prompt};{metadata};{input}",
                "role": "user"
            }
            convos.append([user_input, answer])
        
        
        # print("Answer length: ", len(response))
        # convos.append([prompt, user_input, answer])
        
        # if len(response) + 50 > max_seq_length:
        #     max_seq_length = len(response) + len(metadata) + len(input) + 50
            # print(response)
    
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos]
    
    # remove \n\nCutting Knowledge Date: BLAH BLAH \nToday Date: BLAH BLAH\n\n using regex
    texts = [re.sub(r'(\nCutting Knowledge Date:.*?\nToday Date:.*?\n\n)', '', text) for text in texts]


    return {"text": texts}

dataset_tr = dataset_tr.map(formatting_prompts_func, batched=True)
dataset_ts = dataset_ts.map(formatting_prompts_func, batched=True)

max_seq_length = max([len(tokenizer.encode(dataset_tr[i]['text'])) for i in range(len(dataset_tr))]) + 10
# max_seq_length += len(common_prompt)
print(max_seq_length)
# print(f"seq length: {len(tokenizer.encode(dataset_tr[0]['text']))}")

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

615


In [12]:
lora_r = 8
lora_alpha = 16
lora_repr = f"v7_r{lora_r}_a{lora_alpha}_{train_type}"
print(lora_repr)

v7_r8_a16_ours


In [13]:


peft_model = FastLanguageModel.get_peft_model(
    pretrained_model,
    r=lora_r,   # LoRA rank - suggested values: 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    "gate_proj", "up_proj", "down_proj",
                    # "embed_tokens", 
                    # "lm_head"
                    ],
    lora_alpha=lora_alpha,
    lora_dropout=0.05,   # Supports any, but = 0 is optimized
    bias="none",      # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Ideal for long context tuning
    random_state=3407,
    use_rslora=False,   # Disable rank-sensitive LoRA for simpler tasks
    loftq_config=None,   # No LoftQ, for standard fine-tuning
    max_seq_length=max_seq_length,
)
# del pretrained_model


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth 2025.3.9 patched 80 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Training config

## Train

In [14]:
torch.cuda.empty_cache()
print(len(dataset_tr))


30


In [30]:
import shutil
shutil.rmtree(f"{model_dir}/chkpts/{lora_repr}", ignore_errors=True)

In [31]:



per_device_train_batch_size, epochs = 15, 100 # 8
gradient_accumulation_steps = int(np.ceil(len(dataset_tr) / per_device_train_batch_size))
print(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")

# clear all checkpoints

args = TrainingArguments(
    # num_train_epochs = 1,
    per_device_train_batch_size = per_device_train_batch_size,  # Controls the batch size per device
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,  # Accumulates gradients to simulate a larger batch
    max_steps=gradient_accumulation_steps * epochs,
    # Î¶¨ÏÜåÏä§ Ï†úÏïΩÎïåÎ¨∏Ïóê batch sizeÎ•º ÌÉÄÌòëÌï¥ÏïºÌïòÎäî Í≤ΩÏö∞Í∞Ä Î∞úÏÉù -> micro batch sizeÎ•º Ï§ÑÏù¥Í≥†,
 	# accumulated stepÏùÑ ÎäòÎ†§, Ï†ÅÏ†àÌïú sizeÎ°ú gradientÎ•º Íµ¨Ìï¥ weight update
    # https://www.youtube.com/watch?v=ptlmj9Y9iwE
    warmup_steps = gradient_accumulation_steps,
    learning_rate = 1e-4,             # Sets the learning rate for optimization
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,              # Regularization term for preventing overfitting
    lr_scheduler_type = "cosine",  # Sets the learning rate scheduler
    seed = 3407,                        
    output_dir = f"{model_dir}/chkpts/{lora_repr}",  # Output directory for checkpoints and predictions     
    report_to = "none",              # Enables Weights & Biases (W&B) logging
    logging_steps = gradient_accumulation_steps,                # Sets frequency of logging to W&B
    logging_strategy = "steps",       # Logs metrics at each specified step
    evaluation_strategy="steps",  # enable evaluation during training
    eval_steps=gradient_accumulation_steps,
    # eval_accumulation_steps=1, # ÎÇÆÏùÑÏàòÎ°ù evalÏãú ÏÇ¨Ïö©ÌïòÎäî Î©îÎ™®Î¶¨ Ï§ÑÏñ¥Îì¶
    save_steps=gradient_accumulation_steps,
    save_strategy = "steps",               
    load_best_model_at_end = True,    # Loads the best model at the end
    save_only_model = False,           # Saves entire model, not only weights
    resume_from_checkpoint = f"{model_dir}/chkpts/{lora_repr}",  # Resumes training from a checkpoint
)

from trl import SFTTrainer

trainer = SFTTrainer(
    model = peft_model,
    processing_class = tokenizer,
    train_dataset = dataset_tr,
    eval_dataset = dataset_ts,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,        # Can make training 5x faster for short sequences.
    args = args,
    # compute_metrics = compute_metrics
)

Gradient Accumulation Steps: 2
Unsloth: We found double BOS tokens - we shall remove one automatically.




Tokenizing to ["text"] (num_proc=2):   0%|          | 0/30 [00:00<?, ? examples/s]

Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["text"] (num_proc=2):   0%|          | 0/12 [00:00<?, ? examples/s]

In [32]:
trainer_stats = unsloth_train(trainer)
print(trainer_stats)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 30 | Num Epochs = 200 | Total steps = 200
O^O/ \_/ \    Batch size per device = 15 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (15 x 2 x 1) = 30
 "-____-"     Trainable parameters = 103,546,880/70,657,253,376 (0.15% trained)


Step,Training Loss,Validation Loss
2,0.0086,0.528251
4,0.0394,0.457205
6,0.0243,0.412436
8,0.0155,0.384914
10,0.0142,0.3794
12,0.0115,0.392494
14,0.0112,0.396975
16,0.0101,0.421498
18,0.0096,0.430184
20,0.0092,0.446504




KeyboardInterrupt: 

# Convert gguf

In [None]:
step = 37

# lora_repr = "v7_r8_a16_ours_70B"
checkpoint_dir = f"{model_dir}/chkpts/{lora_repr}/checkpoint-{step}"
output_path = f"{model_dir}/gguf/{lora_repr}-checkpoint-{step}.gguf"
lora_output_dir = f"{model_dir}/lora_output/"

if not os.path.exists(f"{model_dir}/gguf"):
    os.makedirs(f"{model_dir}/gguf")
print(checkpoint_dir)

if not os.path.exists(lora_output_dir):
    os.makedirs(lora_output_dir)
print(checkpoint_dir)



/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/chkpts/v7_r8_a16_ours/checkpoint-37
/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/chkpts/v7_r8_a16_ours/checkpoint-37


In [29]:
if True:
    # model = AutoModelForCausalLM.from_pretrained(
    #     checkpoint_dir,
    #     torch_dtype=torch_dtype,
    #     cache_dir=f"{model_dir}/cache",
    #     # attn_implementation=attn_implementation,
    #     local_files_only=True,
    #     device_map="cuda"
    # )

    # tokenizer = AutoTokenizer.from_pretrained(
    #     checkpoint_dir,
    #     cache_dir=f"{model_dir}/cache",
    #     local_files_only=True
    # )
    
    peft_model, tokenizer = FastLanguageModel.from_pretrained(
        checkpoint_dir,
        dtype = torch_dtype,
        attn_implementation=attn_implementation,
        load_in_4bit = False,
        load_in_8bit=False,
        cache_dir=f"{model_dir}/cache",
        local_files_only=True,
        device_map="cuda",
    )
    # FastLanguageModel.for_inference(model)

    tokenizer.padding_side = "left"

    # merge lora model and base pretrained model
    # model = model.merge_and_unload()


FileNotFoundError: /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/chkpts/v7_r8_a16_ours/checkpoint-37/*.json (invalid repository id)

In [None]:

# peft_model.save_pretrained_gguf(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}", tokenizer, quantization_method = "q8_0")
peft_model.save_pretrained_merged(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}", tokenizer, save_method="merged_16bit")
# tokenizer.save_pretrained(f"{lora_output_dir}/{lora_repr}-checkpoint-{step}")


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1437.08 out of 2003.84 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:00<00:00, 68.80it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
import os

# command = (
#     f"python ../../llama.cpp/convert_lora_to_gguf.py "
#     f"--base {model_dir} "              # Provide the base model config if needed
#     f"--outfile {output_path} "              # Output file for the GGUF model
#     f"--outtype f16 "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
#     f"--verbose "                            # Optional: increase logging output
#     f"{checkpoint_dir}"                      # Positional argument: path to the LoRA adapter files
# )

command = (
    f"python ../../llama.cpp/convert_hf_to_gguf.py "
    # f"--base {model_dir} "              # Provide the base model config if needed
    f"--outfile {output_path} "              # Output file for the GGUF model
    f"--outtype auto "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
    f"--verbose "                            # Optional: increase logging output
    f"{lora_output_dir}/{lora_repr}-checkpoint-{step}"                      # Positional argument: path to the LoRA adapter files
)


print("Running command:", command)
os.system(command)

Running command: python ../../llama.cpp/convert_hf_to_gguf.py --outfile /workspace/LGHVAC_2ndyear/model/sh2orc-Llama-3.1-Korean-8B-Instruct/gguf/v7_r256_a512_ours-checkpoint-37.gguf --outtype auto --verbose /workspace/LGHVAC_2ndyear/model/sh2orc-Llama-3.1-Korean-8B-Instruct/lora_output//v7_r256_a512_ours-checkpoint-37


INFO:hf-to-gguf:Loading model: v7_r256_a512_ours-checkpoint-37
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:choosing --outtype bf16 from first tensor type (torch.bfloat16)
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> BF16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> BF16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> BF16, shape = {4096, 

0

In [None]:
command = (
    f"../../llama.cpp/build/bin/llama-quantize "
    f"{output_path} "
    f"{output_path.replace('.gguf', '-Q4_K_M.gguf')} "                      # Positional argument: path to the LoRA adapter files
    f"Q4_K_M "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
)


print("Running command:", command)
os.system(command)

Running command: ../../llama.cpp/build/bin/llama-quantize /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37.gguf /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37-Q4_K_M.gguf Q4_K_M 


main: build = 4869 (2c9f833d)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37.gguf' to '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37-Q4_K_M.gguf' as Q4_K_M
gguf_init_from_file: failed to open GGUF file '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37.gguf'
llama_model_quantize: failed to quantize: llama_model_loader: failed to load model from /workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37.gguf

main: failed to quantize model from '/workspace/LGHVAC_2ndyear/model/Bllossom-llama-3-Korean-Bllossom-70B/gguf/v7_r8_a16_ours-checkpoint-37.gguf'


256