# 양자화 모델 파인 튜닝

## LLM 양자화에 필요한 패키지 설치

In [1]:
#양자화에 필요한 패키지 설치
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


In [4]:
from transformers import GPTNeoForCausalLM, GPTNeoConfig, AdamW, get_linear_schedule_with_warmup
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig
import torch
import pandas as pd
import accelerate

## 트랜스포머에서 BitsAndBytesConfig를 통해 양자화 매개변수 정의하기

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings; passed to from_pretrained() through `quantization_config`.
quantization_options = dict(
    load_in_4bit=True,                      # load the weights in 4-bit
    bnb_4bit_quant_type="nf4",              # 4-bit quantization data type
    bnb_4bit_use_double_quant=True,         # enable double (nested) quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)
bnb_config = BitsAndBytesConfig(**quantization_options)

## Backbone model로 경량화 모델 로드하기 - 양자화

In [3]:
# Hub checkpoint used as the backbone model.
model_id = "kyujinpy/Ko-PlatYi-6B"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights with the quantization settings from `bnb_config`;
# device_map="auto" leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [5]:
# Inspect the model: print the module tree of the loaded (quantized) backbone
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(78464, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (n

## Backbone model이 잘 실행되는지 확인

In [None]:
# Smoke test: generate a reply for one chat-style prompt to confirm the backbone runs.
# Use the device the model already lives on rather than a hard-coded "cuda:0".
device = model.device

messages = [
    {"role": "user", "content": "Make Nature Language Processing study plan "}
]

# add_generation_prompt=True asks the chat template to append the marker that starts
# the assistant turn, so the model answers instead of continuing the user message.
encodeds = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

model_inputs = encodeds.to(device)

# Sampling is enabled, so the generated text differs from run to run.
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

## Fine Tuning 데이터 로드 및 전처리

In [7]:
# Load the dataset
def load_dataset(file_path):
    """Read the fine-tuning corpus from a CSV file.

    Args:
        file_path: Path of a UTF-8 encoded CSV file that has a '내용' column.

    Returns:
        list: The non-missing values of the '내용' column, in file order.

    Raises:
        KeyError: If the CSV has no '내용' column.
    """
    # Read the file the caller asked for (previously the argument was ignored
    # and a fixed path was always read).
    data = pd.read_csv(file_path, encoding='utf-8')
    # The text lives in the '내용' column (not '본문'). Rows with a missing value
    # are dropped because the tokenizer cannot encode NaN.
    return data['내용'].dropna().tolist()

# Fine-tuning dataset: crawled blog posts stored as a CSV file
TRAIN_CSV_PATH = '/content/blog_crawling_new.csv'
train_data = load_dataset(TRAIN_CSV_PATH)

In [None]:
# Preview only the first few documents instead of dumping the whole corpus into the output
train_data[:5]

In [12]:
# Reuse the end-of-sequence token as the padding token so batches can be padded
tokenizer.pad_token = tokenizer.eos_token

# Encode the whole corpus in one call: every text is truncated to at most
# `max_seq_length` tokens, padded, and returned as PyTorch tensors.
max_seq_length = 1024
tokenized_data = tokenizer(
    train_data,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=max_seq_length,
    add_special_tokens=True,
)

# Dataset wrapper around the tokenizer output
class CustomDataset(Dataset):
    """Expose a mapping of field name -> indexable sequence as a dataset.

    Item `i` is a dict holding the i-th entry of every field; the dataset
    length is the length of the 'input_ids' field.
    """

    def __init__(self, encodings):
        # Keep a reference to the encodings; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

# Wrap the tokenized corpus in a dataset and iterate over it in shuffled batches of 4
train_dataset = CustomDataset(tokenized_data)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)


## Backbone model을 peft로 wrapping

In [8]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, AutoConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from peft import LoraConfig, TaskType
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader

In [10]:
# LoRA configuration for parameter-efficient fine-tuning.
# The backbone is a decoder-only causal LM (LlamaForCausalLM), so the task type must be
# CAUSAL_LM. With SEQ_2_SEQ_LM the PEFT wrapper passes `decoder_input_ids` to the model,
# which fails with "forward() got an unexpected keyword argument 'decoder_input_ids'".
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # the adapter is going to be trained
    r=8,                   # LoRA rank
    lora_alpha=32,         # LoRA scaling factor
    lora_dropout=0.1,      # dropout applied inside the LoRA layers
)

In [11]:
from peft import get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized backbone for training before adding the adapters
# (PEFT's recommended step for k-bit models).
# NOTE(review): this cell reassigns `model`; re-run the model-loading cell before running it again.
model = prepare_model_for_kbit_training(model)

# Wrap the backbone model with PEFT
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# The number of trainable parameters is reduced drastically:
# out of 6,182,801,408 parameters only 3,276,800 (about 0.05%) have to be updated

trainable params: 3,276,800 || all params: 6,182,801,408 || trainable%: 0.05299862932294914


## Final model config 설정 및 Fine tuning 진행

In [19]:
# Model configuration
# Start from the backbone model's config and attach the fine-tuning settings to it as
# plain attributes, so the later cells can read them as `config.<name>`.
config = AutoConfig.from_pretrained(model_id)

# NOTE(review): per_device_train_batch_size, gradient_accumulation_steps, weight_decay,
# load_in_8bit and fp16 are only stored here; none of the cells shown in this notebook
# reads them, so they do not change how the model is loaded or trained.
training_settings = {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "learning_rate": 5e-5,
    "logging_steps": 10,
    "save_steps": 100,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "load_in_8bit": True,
    "fp16": True,
    "output_attentions": False,
    "output_hidden_states": False,
    "return_dict_in_generate": True,
}
for setting_name, setting_value in training_settings.items():
    setattr(config, setting_name, setting_value)

In [15]:
# The quantized model was loaded with device_map="auto", so it has already been placed on
# its device. Do not call model.to(...) on it; just send the batches to where the model is.
device = next(model.parameters()).device

# Initialize the optimizer and scheduler
# Only parameters that require gradients are optimized. torch.optim.AdamW replaces the
# deprecated transformers.AdamW, and the weight decay set on `config` is now actually applied.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=config.learning_rate,
    eps=1e-8,
    weight_decay=config.weight_decay,
)

# Linear warm-up followed by linear decay over all optimizer steps (epochs * batches per epoch)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=config.num_train_epochs * len(train_dataloader),
)

In [None]:
# Training loop: causal-LM fine-tuning where the inputs also serve as the labels.
# NOTE: if the forward pass raises "TypeError: LlamaForCausalLM.forward() got an unexpected
# keyword argument 'decoder_input_ids'", the LoRA config was created with
# TaskType.SEQ_2_SEQ_LM; a decoder-only model has to be wrapped with TaskType.CAUSAL_LM.
global_step = 0  # optimizer steps taken so far, across all epochs
for epoch in range(config.num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Padding positions must not contribute to the loss. The pad token is the EOS token,
        # so they are identified through the attention mask and set to -100 (ignored by the loss).
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        global_step += 1

        if config.logging_steps > 0 and (step + 1) % config.logging_steps == 0:
            print(f'Epoch: {epoch + 1}/{config.num_train_epochs}, Step: {step + 1}/{len(train_dataloader)}, Loss: {loss.item()}')

        # Step-based checkpoint. The previous check compared the epoch number with
        # save_steps (100), so with 3 epochs nothing was ever saved.
        if config.save_steps > 0 and global_step % config.save_steps == 0:
            model.save_pretrained(f'path/to/save/model_step_{global_step}')

    # Save the model at the end of every epoch
    model.save_pretrained(f'path/to/save/model_epoch_{epoch + 1}')