### !중요! GPU 초기화

In [1]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

###

In [None]:
import argparse

from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

from trl import (
    ModelConfig,
    ScriptArguments,
    SFTConfig,
    SFTTrainer,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config
)

In [3]:
model_args = ModelConfig(
    model_name_or_path='./models/EXAONE-3.5-7.8B-Instruct/',
    torch_dtype="bfloat16",
    trust_remote_code=True,
    
    
    ####################
    # lora 
    ####################
    use_peft= True,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    lora_target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head"
    ],
    lora_task_type="CAUSAL_LM",
    
    ####################
    # quantization
    ####################
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    use_bnb_nested_quant= False    
)

In [4]:
training_args = SFTConfig(
    output_dir = './models/EXAONE-3.5-7.8B-Instruct-SFT/',
    eval_strategy='no',
    push_to_hub=False,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=100,
    # max_steps=,
    fp16=True,
    packing = False,
    max_seq_length = 1024
    # packing=True
)

In [5]:
script_args = ScriptArguments(
    dataset_name='./datasets/base_data2',
    dataset_config = None,
    dataset_train_split='train'
)

In [6]:
def train_llm(script_args, training_args, model_args):
    quantization_config = get_quantization_config(model_args)
    
    model_kwargs = dict(
        revision = model_args.model_revision,
        trust_remote_code = model_args.trust_remote_code,
        attn_implementation = model_args.attn_implementation,
        torch_dtype = model_args.torch_dtype,
        use_cache=False if training_args.gradient_checkpointing else True,
        device_map = get_kbit_device_map() if quantization_config is not None else None,
        quantization_config = quantization_config,
    )
    
    training_args.model_init_kwargs = model_kwargs
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
    )
    tokenizer.pad_token = tokenizer.eos_token
    
    dataset = load_from_disk(script_args.dataset_name)
    
    trainer = SFTTrainer(
        model = model_args.model_name_or_path,
        args = training_args,
        train_dataset = dataset[script_args.dataset_train_split],
        processing_class = tokenizer,
        peft_config = get_peft_config(model_args),
    )
    
    trainer.train()
    
    trainer.save_model(training_args.output_dir)
    if training_args.push_to_hub:
        trainer.push_to_hub(dataset_name=script_args.dataset_name)

In [7]:
train_llm(script_args, training_args, model_args)

##if train continue
# traner.train(resume_from_checkpoint = True)

Loading checkpoint shards: 100%|██████████| 7/7 [00:33<00:00,  4.73s/it]


{'loss': 11.245, 'grad_norm': 5.821979522705078, 'learning_rate': 0.00019510565162951537, 'epoch': 10.0}




{'loss': 6.4042, 'grad_norm': 4.1300129890441895, 'learning_rate': 0.00018090169943749476, 'epoch': 20.0}




{'loss': 4.3542, 'grad_norm': 4.039931297302246, 'learning_rate': 0.00015877852522924732, 'epoch': 30.0}


