### RoLA 실습 

In [2]:
import transformers
import datasets
import accelerate
import peft
import bitsandbytes
import warnings
warnings.filterwarnings('ignore')

#### 보조함수 

In [3]:
import torch

def print_gpu_utilization():
    if torch.cuda.is_available():
        used_memory = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU 메모리 사용량: {used_memory:.3f} GB")
    else:
        print("런타임 유형을 GPU로 변경하세요")

print_gpu_utilization()
# 출력 결과
# GPU 메모리 사용량: 0.000 GB

import gc

def cleanup():
    if 'model' in globals():
        del globals()['model']
    if 'dataset' in globals():
        del globals()['dataset']
    gc.collect()
    torch.cuda.empty_cache()
    
from transformers import AdamW
from torch.utils.data import DataLoader

def estimate_memory_of_gradients(model):
    total_memory = 0
    for param in model.parameters():
        if param.grad is not None:
            total_memory += param.grad.nelement() * param.grad.element_size()
    return total_memory

def estimate_memory_of_optimizer(optimizer):
    total_memory = 0
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                total_memory += v.nelement() * v.element_size()
    return total_memory

import numpy as np
from datasets import Dataset

def make_dummy_dataset():
  seq_len, dataset_size = 256, 64
  dummy_data = {
      "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
      "labels": np.random.randint(100, 30000, (dataset_size, seq_len)),
  }
  dataset = Dataset.from_dict(dummy_data)
  dataset.set_format("pt")
  return dataset


GPU 메모리 사용량: 0.000 GB


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

def load_model_and_tokenizer(model_id, peft=None):
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if peft is None:
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"":0})

    elif peft == 'lora':
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"":0})
        lora_config = LoraConfig(
                    r=8,
                    lora_alpha=32,
                    target_modules=["query_key_value"],
                    lora_dropout=0.05,
                    bias="none",
                    task_type="CAUSAL_LM"
                )

        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    print_gpu_utilization()
    return model, tokenizer

In [6]:
def train_model(model, dataset, training_args):
    if training_args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    train_dataloader = DataLoader(dataset, batch_size=training_args.per_device_train_batch_size)
    optimizer = AdamW(model.parameters())
    model.train()
    gpu_utilization_printed = False
    for step, batch in enumerate(train_dataloader, start=1):
        batch = {k: v.to(model.device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / training_args.gradient_accumulation_steps
        loss.backward()

        if step % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            gradients_memory = estimate_memory_of_gradients(model)
            optimizer_memory = estimate_memory_of_optimizer(optimizer)
            if not gpu_utilization_printed:
                print_gpu_utilization()
                gpu_utilization_printed = True
            optimizer.zero_grad()

    print(f"옵티마이저 상태의 메모리 사용량: {optimizer_memory / (1024 ** 3):.3f} GB")
    print(f"그레디언트 메모리 사용량: {gradients_memory / (1024 ** 3):.3f} GB")

In [9]:
from transformers import TrainingArguments, Trainer

def gpu_memory_experiment(batch_size,
                          gradient_accumulation_steps=1,
                          gradient_checkpointing=False,
                          model_id="EleutherAI/polyglot-ko-1.3b",
                          peft=None):

    print(f"배치 사이즈: {batch_size}")
    model, tokenizer = load_model_and_tokenizer(model_id, peft=peft)
    if gradient_checkpointing == True or peft == 'qlora':
        model.config.use_cache = False

    dataset = make_dummy_dataset()

    training_args = TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=gradient_checkpointing,
        output_dir="./result",
        num_train_epochs=1
      )

    try:
        train_model(model, dataset, training_args)
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print(e)
        else:
            raise e
    finally:
        del model, dataset
        gc.collect()
        torch.cuda.empty_cache()
        print_gpu_utilization()

In [10]:
cleanup()
print_gpu_utilization()

gpu_memory_experiment(batch_size=16, peft='lora')

torch.cuda.empty_cache()

GPU 메모리 사용량: 0.000 GB
배치 사이즈: 16


tokenizer_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/748M [00:00<?, ?B/s]

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

trainable params: 1,572,864 || all params: 1,333,383,168 || trainable%: 0.1180
GPU 메모리 사용량: 2.581 GB
GPU 메모리 사용량: 4.345 GB
옵티마이저 상태의 메모리 사용량: 0.012 GB
그레디언트 메모리 사용량: 0.006 GB
GPU 메모리 사용량: 0.016 GB


### QLoRA 실습 

In [13]:
from transformers import BitsAndBytesConfig

model_id="EleutherAI/polyglot-ko-1.3b"

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def load_model_and_tokenizer(model_id, peft=None):
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if peft is None:
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"":0})

    elif peft == 'lora':
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"":0})
        lora_config = LoraConfig(
                    r=8,
                    lora_alpha=32,
                    target_modules=["query_key_value"],
                    lora_dropout=0.05,
                    bias="none",
                    task_type="CAUSAL_LM"
                )

        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
    elif peft == 'qlora':
        lora_config = LoraConfig(
                    r=8,
                    lora_alpha=32,
                    target_modules=["query_key_value"],
                    lora_dropout=0.05,
                    bias="none",
                    task_type="CAUSAL_LM"
                )
        bnb_config = BitsAndBytesConfig(
                  load_in_4bit=True,
                  bnb_4bit_use_double_quant=True,
                  bnb_4bit_quant_type="nf4",
                  bnb_4bit_compute_dtype=torch.float16
              )
        model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    print_gpu_utilization()
    return model, tokenizer

In [15]:
cleanup()
print_gpu_utilization()

gpu_memory_experiment(batch_size=16, peft='qlora')

torch.cuda.empty_cache()

GPU 메모리 사용량: 0.922 GB
배치 사이즈: 16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 1,572,864 || all params: 1,333,383,168 || trainable%: 0.1180
GPU 메모리 사용량: 2.065 GB
GPU 메모리 사용량: 2.604 GB
옵티마이저 상태의 메모리 사용량: 0.012 GB
그레디언트 메모리 사용량: 0.006 GB
GPU 메모리 사용량: 0.922 GB


### Trainer를 이용한 QLoRA 실습 (강의 부분) 


In [4]:
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login

# # 모델 레포지토리
# model_path = "meta-llama/Llama-3.1-8B-Instruct" #"beomi/Llama-3-Open-Ko-8B"
model_path = "EleutherAI/polyglot-ko-1.3b"


# # 데이터 path
data_path = 'DopeorNope/Ko-Optimize_Dataset'
data = load_dataset(data_path)


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import bitsandbytes as bnb
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training)


#### 토크나이저 및 데이터 준비 

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 토크나이저 세팅: QLoRA시 pad 토큰을 eos로 설정해주기
bos = tokenizer.bos_token_id # begin of sentence
eos = tokenizer.eos_token_id # end of sentence

# tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
# tokenizer.padding_side = 'right' # padding to right (otherwise SFTTrainer shows warning)

# pad = tokenizer.pad_token_id
tokenizer.pad_token_id = eos

cut_off_len = 4098
val_size = 0.005
train_on_inputs = True
add_eos_token = False


In [7]:
template = {
    "prompt_input": "아래는 문제를 설명하는 지시사항과, 구체적인 답변을 방식을 요구하는 입력이 함께 있는 문장입니다. 이 요청에 대해 적절하게 답변해주세요.\n###입력:{input}\n###지시사항:{instruction}\n###답변:\n",
    "prompt_no_input": "아래는 문제를 설명하는 지시사항입니다. 이 요청에 대해 적절하게 답변해주세요.\n###지시사항:{instruction}\n###답변:\n"
}

In [8]:
from typing import Union

def generate_prompt(
    instruction: str,
    input: Union[None, str] = None,
    label: Union[None, str] = None,
    verbose: bool = False
) -> str:
    """
    주어진 instruction, input, label을 사용하여 프롬프트를 생성하는 함수.

    Parameters:
    - instruction (str): 문제 설명 또는 지시사항.
    - template (dict): 입력이 있는 경우와 없는 경우의 템플릿을 포함한 딕셔너리.
    - input (str or None): 문제에 대한 구체적인 입력 (옵션).
    - label (str or None): 정답 또는 응답 (옵션).
    - verbose (bool): 생성된 프롬프트를 출력할지 여부.

    Returns:
    - str: 완성된 프롬프트.
    """
    if input:
        res = template["prompt_input"].format(instruction=instruction, input=input)
    else:
        res = template["prompt_no_input"].format(instruction=instruction)

    if label:
        res = f"{res}{label}"

    if verbose:
        print(res)

    return res


In [9]:
def tokenize(prompt, add_eos_token=True):
  result = tokenizer(prompt,truncation=True,max_length=cut_off_len,padding=False,return_tensors=None,)
  if (result["input_ids"][-1] != tokenizer.eos_token_id
      and len(result["input_ids"]) < cut_off_len
      and add_eos_token
      ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

  result["labels"] = result["input_ids"].copy()
  return result

In [10]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"]
        )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = generate_prompt(data_point["instruction"], data_point["input"])
        tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos_token)
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1



        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]

    return tokenized_full_prompt

In [11]:
if val_size > 0:
  train_val = data["train"].train_test_split(test_size=val_size, shuffle=True, seed=42)
  train_data = (train_val["train"].shuffle().map(generate_and_tokenize_prompt))
  val_data = (train_val["test"].shuffle().map(generate_and_tokenize_prompt))
else:
  train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
  val_data = None

Map:   0%|          | 0/9950 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

#### 모델 준비 

In [12]:
# Quantization config 준비 -> FFT 시는 제외
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
    )


In [13]:
# Model 로드 하기
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = quantization_config, #-> FFT 시는 제외
    torch_dtype = torch.bfloat16,
    device_map = {"" : 0}
    )

model

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(30080, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=2048, out_features=6144, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear4bit(in_features=8192, out_features

In [14]:
#### 보조 함수 

#  학습 가능한 모듈 찾기 
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
  return list(lora_module_names)

print('Trainable targer module:',find_all_linear_names(model))


Trainable targer module: ['query_key_value', 'dense_4h_to_h', 'dense_h_to_4h', 'dense']


In [17]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


In [18]:
lora_config = LoraConfig(
    r = 16,
    lora_alpha = 32,
    target_modules = ["query_key_value"],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM"
    )

In [19]:
model = get_peft_model(model, lora_config)
model


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(30080, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXSdpaAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
             

In [20]:
model.print_trainable_parameters()


trainable params: 3,145,728 || all params: 1,334,956,032 || trainable%: 0.2356


#### 하이퍼 파라미터 셋팅 

In [21]:
output_dir='./QLoRA_TEST'

num_epochs = 1
micro_batch_size = 1
gradient_accumulation_steps = 64
warmup_steps = 10
learning_rate = 5e-6

group_by_length = False

optimizer =  'adalomo' #'paged_adamw_8bit'

# adam 활용시
#beta1 = 0.9
#beta2 = 0.99

lr_scheduler = 'cosine'
logging_steps = 1

use_wandb = True
wandb_run_name = 'Single_GPU_Optim'

use_fp16 = False
use_bf_16 = True
evaluation_strategy = 'steps'
eval_steps = 10
save_steps = 10
save_strategy = 'steps'

In [22]:
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=TrainingArguments(
    per_device_train_batch_size = micro_batch_size,
    per_device_eval_batch_size = micro_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    num_train_epochs = num_epochs,
    learning_rate = learning_rate,
#    adam_beta1 = beta1, # adam 활용할때 사용
#    adam_beta2 = beta2, # adam 활용할때 사용
    fp16 = use_fp16,
    bf16 = use_bf_16,
    logging_steps = logging_steps,
    optim = optimizer,
    evaluation_strategy = evaluation_strategy if val_size > 0 else "no",
    save_strategy="steps",  #스텝기준으로 save
    eval_steps = eval_steps if val_size > 0 else None,
    save_steps = save_steps,
    lr_scheduler_type=lr_scheduler,
    output_dir = output_dir,
    load_best_model_at_end = True if val_size > 0 else False ,
    group_by_length=group_by_length,
    report_to="wandb" if use_wandb else None,
    run_name=wandb_run_name if use_wandb else None,
    ),
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )



In [23]:
model.config.use_cache = False


trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\YS\_netrc
[34m[1mwandb[0m: Currently logged in as: [33myoung-sub[0m ([33myoung-sub-korea-advanced-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model()
tokenizer.save_pretrained(output_dir)


In [None]:
from peft import PeftModel
import os
from dotenv import load_dotenv

load_dotenv()
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
HUGGINGFACE_ID = os.getenv('HUGGINGFACE_ID')

base_model= AutoModelForCausalLM.from_pretrained(model_path, token= HUGGINGFACE_TOKEN)


merged_model= PeftModel.from_pretrained(base_model, output_dir)

merged_model= merged_model.merge_and_unload()

merged_model.push_to_hub(f'{HUGGINGFACE_ID}/Single_GPU_Llama3-8B')
tokenizer.push_to_hub(f'{HUGGINGFACE_ID}/Single_GPU_Llama3-8B')