In [None]:
import torch
print(torch.cuda.device_count())
from unsloth import FastLanguageModel, unsloth_train

from transformers import TrainerCallback, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import pprint
import json
from pathlib import Path
import transformers
import os
import numpy as np

In [None]:
# Notebook-wide runtime settings.
max_seq_length = 0  # placeholder; reassigned further down once the dataset has been tokenized
load_in_4bit = False  # 4-bit quantization switch (lower memory use when True)
device = "cuda"

# Compute capability major version >= 8 selects FlashAttention-2 with bfloat16;
# anything older falls back to eager attention with float16.
_cc_major, _cc_minor = torch.cuda.get_device_capability()
attn_implementation, torch_dtype = (
    ("flash_attention_2", torch.bfloat16)
    if _cc_major >= 8
    else ("eager", torch.float16)
)
print(f"Using {attn_implementation} for attention computation.")
# QLora?

## Load model

In [None]:
# Hugging Face repo id of the base model. The commented lines are alternatives.
model_id = 'sh2orc/Llama-3.1-Korean-8B-Instruct'
# model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'
# model_id = 'Saxo/Linkbricks-Horizon-AI-Korean-Gemma-2-sft-dpo-27B'
# model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'

# Per-model working directory (cache, checkpoints and exports are placed under it):
# the repo id with every "/" turned into "-".
model_dir = "/model/" + "-".join(model_id.split("/"))

In [None]:
# model_id = 'Bllossom/llama-3-Korean-Bllossom-70B'
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id=model_id, local_dir="70b")

In [None]:
# # Model initialization with plain transformers (disabled; kept for reference)
# pretrained_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch_dtype,
#     cache_dir=f"{model_dir}/cache",
#     # attn_implementation=attn_implementation,
#     local_files_only=True,
#     device_map="cuda"
# )

# # tokenizer = AutoTokenizer.from_pretrained(
# #     model_id,
# #     cache_dir=f"{model_dir}/cache",
# #     local_files_only=True
# # )
# # if not os.path.exists(f"{model_dir}/config.json"):
# pretrained_model.save_pretrained(model_dir)

In [None]:
# Load the base model together with its tokenizer through Unsloth.

pretrained_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    # max_seq_length = max_seq_length,
    dtype = torch_dtype,
    # Honour the notebook-level switch instead of a second hard-coded False,
    # so flipping `load_in_4bit` in the config cell actually takes effect.
    load_in_4bit = load_in_4bit,
    load_in_8bit = False,
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch_dtype
    #     # load_in_8bit=True,
    #     # llm_int8_enable_fp32_cpu_offload=False if not "27B" in model_id else True,
    # ),
    # device_map=device,
    cache_dir=f"{model_dir}/cache",
    attn_implementation=attn_implementation,
    # local_files_only=True
)
tokenizer.padding_side = "left"
# tokenizer.truncation_side = "left"

# Show the special tokens so a missing pad token is noticed before training.
print(f"Pad Token id: {tokenizer.pad_token_id} and Pad Token: {tokenizer.pad_token}")
print(f"EOS Token id: {tokenizer.eos_token_id} and EOS Token: {tokenizer.eos_token}")

# if not os.path.exists(f"{model_dir}/config.json"):
#     pretrained_model.save_pretrained(model_dir)
#     tokenizer.save_pretrained(model_dir)

# Load dataset

In [None]:
# Available training variants; the index on the next line picks the active one.
_TRAIN_TYPE_CHOICES = ("woall", "ours")  # index 0, index 1
train_type = _TRAIN_TYPE_CHOICES[1]
print(f"train_type: {train_type}")

In [None]:
# Earlier dataset versions, kept for reference:
#   v5-250228-multimetadata
#   v6-250306-optimizetoken
dataset_name = "v7-250309-reduceinputanddatefunctioncall"
BASE_DATASET_DIR = Path("../dataset") / dataset_name
print(f"BASE_DATASET_DIR: {BASE_DATASET_DIR}")
# List the directory contents as a quick check that the path is right.
print([entry for entry in BASE_DATASET_DIR.iterdir()])

In [None]:
import re

# current_metadata = json.load(open("metadata.json", "r"))

# Shared prompt template for every example. Read explicitly as UTF-8 (the
# dataset JSON files are read the same way); read_text() also closes the file
# handle, which the previous bare open(...).read() never did.
common_prompt = (BASE_DATASET_DIR / "prompt.txt").read_text(encoding="utf-8")

if train_type == "woall":
    # Drop every <|Ours|> ... <|Ours|> section, including one newline directly
    # in front of it. DOTALL lets "." span line breaks inside the section.
    common_prompt = re.sub(r"\n?<\|Ours\|>.*?<\|Ours\|>", "", common_prompt, flags=re.DOTALL)

# Remove whatever <|...|> markers are still left in the prompt.
common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)
print(common_prompt)

# print(common_prompt)

In [None]:
# Scenario folders: sub-directories of BASE_DATASET_DIR whose name contains
# "scenario" and which directly contain an entry named metadata.json.
scenario_dirs = []
for candidate in BASE_DATASET_DIR.iterdir():
    if not candidate.is_dir() or "scenario" not in candidate.name:
        continue
    if any(entry.name == "metadata.json" for entry in candidate.iterdir()):
        scenario_dirs.append(candidate)
print(scenario_dirs)

In [None]:

def read_dataset(dir, path):
    """Load one scenario's examples and attach the scenario metadata to each.

    The dataset file is a JSON list. Each element's ``Response`` object is
    serialized back into a JSON string, so the result carries responses as
    text rather than as nested objects.

    Reads the module-level ``train_type`` and ``dataset_name``: for the
    "woall" train type the reasoning field is removed from every response
    ("생각" for v6 datasets, "Thinking" for v7 datasets).

    Args:
        dir: Scenario directory (a path object supporting ``/``) that contains
            ``metadata.json``.
        path: File name of the dataset JSON inside ``dir``.

    Returns:
        A list of dicts with the keys "Metadata" (parsed metadata.json),
        "Input" and "Response" (JSON string). Examples whose
        ``Tags -> Style`` contains "Reason" are left out.
    """
    # Open both files as UTF-8 and close them deterministically; metadata.json
    # used to be opened with the locale default encoding and never closed.
    with open(dir / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)
    with open(dir / path, "r", encoding="utf-8") as f:
        data = json.load(f)

    skip_tags = ["Reason"]
    strip_reasoning = train_type in ["woall"]

    result = []
    for d in data:
        if strip_reasoning:
            if "v6" in dataset_name:
                del d["Response"]["생각"]
            elif "v7" in dataset_name:
                del d["Response"]["Thinking"]
                # Pop + re-insert moves "Expectations" to the end of the dict,
                # so it is also the last key in the serialized response.
                d["Response"]["Expectations"] = d["Response"].pop("Expectations")

        tags = d["Tags"]["Style"]
        if any(skip_tag in tags for skip_tag in skip_tags):
            continue

        result.append({
            "Metadata": metadata,
            "Input": d["Input"],
            "Response": json.dumps(d["Response"], ensure_ascii=False),
        })
    return result

# Collect the train / test examples of every scenario folder into flat lists.
dataset_trs = [
    example
    for scenario_dir in scenario_dirs
    for example in read_dataset(scenario_dir, "onlyq_tr.json")
]
dataset_tss = [
    example
    for scenario_dir in scenario_dirs
    for example in read_dataset(scenario_dir, "onlyq_ts.json")
]

# Author's note (translated): scenarios with differing metadata are merged here,
# and metadata fields that are mutually exclusive between them end up as None.
dataset_tr = Dataset.from_list(dataset_trs)
dataset_ts = Dataset.from_list(dataset_tss)

max_seq_length = 0
def formatting_prompts_func(examples):
    """Render a batch of examples into chat-template text.

    Reads the module-level ``model_id``, ``common_prompt`` and ``tokenizer``.
    Llama models get a system / user / assistant conversation; Gemma models
    get a user / assistant conversation with the prompt folded into the user
    turn.

    Args:
        examples: Batched mapping with parallel lists under the keys
            "Metadata", "Input" and "Response".

    Returns:
        A dict with the single key "text": one chat-formatted string per
        example, in input order.

    Raises:
        ValueError: If ``model_id`` names neither a Llama nor a Gemma model.
            Previously such a model silently produced an empty batch.
    """
    model_name = model_id.lower()
    if "llama" in model_name:
        family = "llama"
    elif "gemma" in model_name:
        family = "gemma"
    else:
        raise ValueError(
            f"Unsupported model for prompt formatting: {model_id!r} "
            "(expected a Llama or Gemma model id)"
        )

    convos = []
    # Each batch column is a list; walk the rows in parallel.
    for metadata, user_text, response in zip(examples['Metadata'], examples['Input'], examples['Response']):
        # str.replace returns a new string; the result used to be discarded,
        # which made this indentation stripping a no-op.
        response = response.replace("    ", "")

        answer = {
            "content": f"{response}",
            "role": "assistant"
        }
        if family == "llama":
            prompt = {
                "content": common_prompt,
                "role": "system"
            }
            user_input = {
                "content": f"Metadata:{metadata};Input:{user_text};",
                "role": "user"
            }
            convos.append([prompt, user_input, answer])
        else:
            # Gemma: no separate system turn, the prompt goes into the user turn.
            user_input = {
                "content": f"{common_prompt};{metadata};{user_text}",
                "role": "user"
            }
            convos.append([user_input, answer])

    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos]

    # Disabled: strip the "Cutting Knowledge Date / Today Date" header from the rendered text.
    # texts = [re.sub(r'(\nCutting Knowledge Date:.*?\nToday Date:.*?\n\n)', '', text) for text in texts]

    return {"text": texts}

# Add the chat-formatted "text" column to both splits.
dataset_tr = dataset_tr.map(formatting_prompts_func, batched=True)
dataset_ts = dataset_ts.map(formatting_prompts_func, batched=True)

# Longest tokenized example plus 100 tokens of headroom. Both splits are
# measured so that eval examples longer than every training example are not
# truncated. Reading the "text" column directly avoids decoding each full row.
all_texts = list(dataset_tr["text"]) + list(dataset_ts["text"])
max_seq_length = max(len(tokenizer.encode(text)) for text in all_texts) + 100
# max_seq_length += len(common_prompt)
print(max_seq_length)
print(dataset_tr[0])
print(len(dataset_tr), len(dataset_ts))
# print(f"seq length: {len(tokenizer.encode(dataset_tr[0]['text']))}")

In [None]:
# LoRA rank, and alpha fixed at twice the rank.
lora_r = 256
lora_alpha = 2 * lora_r
# Run label; it is used below in the checkpoint and export paths.
lora_repr = "v7_r{}_a{}_{}_16bit_adamw16bit_0322".format(lora_r, lora_alpha, train_type)
print(lora_repr)

In [None]:
# Layers that receive LoRA adapters. "embed_tokens" and "lm_head" are left out.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters via Unsloth.
peft_model = FastLanguageModel.get_peft_model(
    pretrained_model,
    r=lora_r,                              # LoRA rank
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=0.05,                     # author's note: any value works, 0 is the optimized path
    bias="none",                           # author's note: any value works, "none" is the optimized path
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA switched off
    loftq_config=None,                     # no LoftQ
    max_seq_length=max_seq_length,
)
# del pretrained_model


## Training config

## Train

In [None]:
# Free PyTorch's cached CUDA memory before training, then show the train-set size.
torch.cuda.empty_cache()
print(dataset_tr.num_rows)


In [None]:
# import shutil
# shutil.rmtree(f"{model_dir}/chkpts/{lora_repr}", ignore_errors=True)

In [None]:



from trl import SFTTrainer

per_device_train_batch_size, epochs = 25, 200 # 8
# Accumulate enough micro-batches that one optimizer update covers the whole training set.
gradient_accumulation_steps = int(np.ceil(len(dataset_tr) / per_device_train_batch_size))
print(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")

# Precision flags follow the dtype the model was actually loaded with.
# torch.cuda.is_bf16_supported() can disagree with the capability check that
# chose torch_dtype, which would train an fp16-loaded model with bf16=True.
use_bf16 = torch_dtype == torch.bfloat16

# NOTE(review): Trainer appears to count max_steps / warmup / logging / eval /
# save steps in optimizer updates, not micro-batches. With full-dataset
# accumulation one update is already about one epoch, so this schedule may run
# gradient_accumulation_steps times longer than `epochs` suggests. Confirm intent.
args = TrainingArguments(
    # num_train_epochs = 1,
    per_device_train_batch_size = per_device_train_batch_size,  # micro-batch size per device
    per_device_eval_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,  # accumulate gradients to simulate a larger batch
    max_steps = gradient_accumulation_steps * epochs,
    # When resource limits force a smaller batch size: shrink the micro-batch and
    # raise the accumulation steps, so each weight update still uses a gradient
    # computed over an adequately sized batch.
    # https://www.youtube.com/watch?v=ptlmj9Y9iwE
    warmup_steps = gradient_accumulation_steps,
    learning_rate = 1e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    optim = "adamw_torch", # adamw_torch, adafactor, prodigy
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir = f"{model_dir}/chkpts/{lora_repr}",  # checkpoints are written here
    report_to = "none",               # no external experiment tracker
    logging_steps = gradient_accumulation_steps,
    logging_strategy = "steps",
    eval_strategy = "steps",          # current name of the deprecated `evaluation_strategy`
    eval_steps = gradient_accumulation_steps,
    eval_accumulation_steps = 1,      # lower values reduce memory used during evaluation
    save_steps = gradient_accumulation_steps,
    save_strategy = "steps",
    # load_best_model_at_end = True,
    save_only_model = False,          # keep optimizer/scheduler state in checkpoints, not just weights
    # NOTE(review): per the Transformers docs this field is not used by Trainer
    # itself; resuming needs trainer.train(resume_from_checkpoint=...). Confirm.
    resume_from_checkpoint = f"{model_dir}/chkpts/{lora_repr}",
)

# def compute_metrics(eval_pred):
#     pred_ids = eval_pred.predictions
#     print(pred_ids, type(pred_ids), pred_ids.shape)
#     predictions = tokenizer.batch_decode(
#         pred_ids,
#         skip_special_tokens=True
#     )
#     print(predictions)
#     return {"accuracy": 0}

# class CustomCallback(TrainerCallback):
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         print(f"Step {state.global_step}: {logs}")

trainer = SFTTrainer(
    model = peft_model,
    processing_class = tokenizer,
    train_dataset = dataset_tr,
    eval_dataset = dataset_ts,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,        # one example per sequence, no packing
    args = args,
    # compute_metrics = compute_metrics,
    # callbacks = [CustomCallback()]
)

In [None]:

# Start fine-tuning through Unsloth's `unsloth_train` helper and keep what it returns.
trainer_stats = unsloth_train(trainer)

# Convert gguf

In [None]:
# # base model

# save_dir = f"{model_dir}/gguf"
# if not os.path.exists(save_dir):
#     os.makedirs(save_dir)
# command = (
#     f"python ../../llama.cpp/convert_hf_to_gguf.py "
#     # f"--base {model_dir} "              # Provide the base model config if needed
#     f"--outfile {save_dir}/base.gguf "              # Output file for the GGUF model
#     f"--outtype f16 "                      # Use f16 (or choose f32, bf16, q8_0, auto as needed)
#     f"--verbose "                            # Optional: increase logging output
#     f"{model_dir}"                      # Positional argument: path to the LoRA adapter files
# )

# print(command)

# os.system(command)

In [None]:
step = 56  # number of the checkpoint to export (selects the checkpoint-{step} folder)

# lora_repr = "v7_r8_a16_ours_70B"
checkpoint_dir = f"{model_dir}/chkpts/{lora_repr}/checkpoint-{step}"
output_path = f"{model_dir}/gguf/{lora_repr}-checkpoint-{step}.gguf"
lora_output_dir = f"{model_dir}/lora_output/"

# exist_ok=True replaces the check-then-create pattern and makes re-runs safe.
os.makedirs(f"{model_dir}/gguf", exist_ok=True)
os.makedirs(lora_output_dir, exist_ok=True)

# Show every path used by the export cells (checkpoint_dir used to be printed twice).
print(checkpoint_dir)
print(output_path)
print(lora_output_dir)



In [None]:
# Reload the chosen checkpoint and its tokenizer from local files for export.
# Disabled alternatives from earlier experiments: plain
# AutoModelForCausalLM / AutoTokenizer loading from checkpoint_dir,
# FastLanguageModel.for_inference(model), and model.merge_and_unload().
checkpoint_load_kwargs = dict(
    dtype=torch_dtype,
    attn_implementation=attn_implementation,
    load_in_4bit=False,
    load_in_8bit=False,
    cache_dir=f"{model_dir}/cache",
    local_files_only=True,
    device_map="cuda",
)
peft_model, tokenizer = FastLanguageModel.from_pretrained(
    checkpoint_dir,
    **checkpoint_load_kwargs,
)

tokenizer.padding_side = "left"


In [None]:

# Save the model with save_method="merged_16bit", together with the tokenizer.
merged_model_dir = f"{lora_output_dir}/{lora_repr}-checkpoint-{step}"
peft_model.save_pretrained_merged(merged_model_dir, tokenizer, save_method="merged_16bit")
# Disabled alternatives:
#   peft_model.save_pretrained_gguf(merged_model_dir, tokenizer, quantization_method = "q8_0")
#   tokenizer.save_pretrained(merged_model_dir)


In [None]:
import os
import subprocess

# Disabled variant that converts only the LoRA adapter:
#   python ../../llama.cpp/convert_lora_to_gguf.py --base {model_dir}
#       --outfile {output_path} --outtype f16 --verbose {checkpoint_dir}

# Convert the merged Hugging Face model directory to a single GGUF file.
# An argv list is used instead of a shell string, so paths need no quoting.
command = [
    "python", "../../llama.cpp/convert_hf_to_gguf.py",
    "--outfile", output_path,       # output file for the GGUF model
    "--outtype", "f16",             # f16 (other choices: f32, bf16, q8_0, auto)
    "--verbose",                    # more logging output
    f"{lora_output_dir}/{lora_repr}-checkpoint-{step}",  # positional: merged model directory
]

print("Running command:", " ".join(command))
# check=True raises CalledProcessError when the conversion fails, so later
# cells cannot carry on with a missing or stale GGUF file.
subprocess.run(command, check=True)

In [None]:
# command = (
#     f"../../llama.cpp/build/bin/llama-quantize "
#     f"{output_path} "
#     f"{output_path.replace('.gguf', '-Q4_K_M.gguf')} "                      # Positional argument: path to the LoRA adapter files
#     f"Q4_K_M "                        # Use f16 (or choose f32, bf16, q8_0, auto as needed)
# )


# print("Running command:", command)
# os.system(command)

In [None]:
import shutil

# Copy the converted GGUF file to ../../src/i2i.gguf.
shutil.copy(src=output_path, dst="../../src/i2i.gguf")



In [None]:
import subprocess

# Pack the copied GGUF with tar and compress it with pigz (128 threads).
command = (
    "tar -cvf - ../../src/i2i.gguf | pigz -p 128 > ../../src/i2i.tar.gz"
)

print("Running command:", command)
# shell=True because the command is a pipeline with redirection. check=True
# raises CalledProcessError on a non-zero exit status instead of ignoring it.
# The shell reports the status of the last pipeline stage (pigz).
subprocess.run(command, shell=True, check=True)
