In [1]:
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
!pip install pandas sentence_transformers transformers==4.44.0.dev0 tqdm pyarrow wandb spacy matplotlib
!pip install bitsandbytes==0.43.2 accelerate==0.32.0 appdirs loralib black black[jupyter] datasets fire sentencepiece scipy numpy scikit-learn
!pip install git+https://github.com/huggingface/peft

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting git+https://github.com/huggingface/peft
  Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-21zkqb6b
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-21zkqb6b
  Resolved https://github.com/huggingface/peft to commit e6cd24c907565040ee1766a5735afe3d13a71164
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
import pandas as pd
import numpy as np
import torch
import re
import shutil
import os
import transformers
import datasets
import transformers
import sys

from typing import List, Union
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
from transformers import pipeline
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from torch.nn import functional as F

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict
)
from peft import PeftModel

from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fix SEED
def seed_everything(seed: int):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and switches cuDNN to deterministic mode.

    Args:
        seed: The integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects hash randomization of Python processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and may pick different
    # algorithms each time, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [4]:
# Load the base checkpoint in 8-bit (bitsandbytes) with bfloat16 for the
# non-quantized parts, letting `device_map="auto"` place the weights.
# NOTE(review): `quantized_model` is not referenced by any later cell visible
# in this notebook, while the next cell loads the same checkpoint again as
# `model`; confirm whether this extra load is still needed.
model_name = "meta-llama/Meta-Llama-3.1-8B"
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 4/4 [10:45<00:00, 161.29s/it]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.76s/it]


In [5]:
# Base checkpoint used for fine-tuning, and the device-map setting that the
# merge cell reuses when it reloads the base model.
base_LLM_model = "meta-llama/Meta-Llama-3.1-8B"
device = 'auto'

# 8-bit quantized load (bitsandbytes); the model config is fetched explicitly
# and handed to the loader.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
config = AutoConfig.from_pretrained(base_LLM_model)

model = AutoModelForCausalLM.from_pretrained(
    base_LLM_model,
    config=config,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(base_LLM_model)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.16s/it]


In [6]:
# Inspect the tokenizer's special tokens and pad on the right-hand side.
bos = tokenizer.bos_token_id
eos = tokenizer.eos_token_id
pad = tokenizer.pad_token_id
tokenizer.padding_side = "right"

# Fall back to token id 0 for padding when the tokenizer has no pad token, or
# when its pad token is the same as EOS.
if pad is None or pad == eos:
    tokenizer.pad_token_id = 0

In [7]:
# Hyperparameters

# Batching and number of training passes.
# `micro_batch` is the per-device batch size given to the Trainer; gradients
# are accumulated over `batch_size // micro_batch` steps.
batch_size = 16
num_epochs = 3
micro_batch = 1
gradient_accumulation_steps = batch_size // micro_batch

# Optimization settings.
# `cutoff_len` is the `max_length` used when tokenizing a prompt (longer
# prompts are truncated).
cutoff_len = 4096
lr_scheduler = 'cosine'
warmup_ratio = 0.06 # warmup_steps = 100
learning_rate = 4e-4
optimizer = 'adamw_torch'
weight_decay = 0.01
max_grad_norm = 1.0

# LoRA config
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["gate_proj", "down_proj", "up_proj"]

# Options controlling the tokenized inputs.
# train_on_inputs=False: the prompt part of each example gets label -100 in
# generate_and_tokenize_prompt, leaving only the answer tokens with real labels.
# add_eos_token: forwarded to tokenize() for the prompt-only pass.
train_on_inputs = False
add_eos_token = False

# Others
resume_from_checkpoint = False 
output_dir = './custom_LLM_llama'

In [9]:
# Expand each CSV row into one instruction/response record per
# (question column, answer column) pair: 2 questions x 5 answers per row.
data = pd.read_csv('./train_data_JJGG.csv')

QUESTION_COLUMNS = ['질문_1', '질문_2']
ANSWER_COLUMNS = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

formatted_data = [
    {
        'input': '',
        'instruction': row[q_col],
        'data_source': '',
        'output': row[a_col],
    }
    for _, row in tqdm(data.iterrows())
    for q_col in QUESTION_COLUMNS
    for a_col in ANSWER_COLUMNS
]
formatted_df = pd.DataFrame(formatted_data)
formatted_df.shape

644it [00:00, 16294.55it/s]


(6440, 4)

In [10]:
from datasets import Dataset
# Wrap the expanded question/answer frame as a Hugging Face Dataset.
dacon_dataset = Dataset.from_pandas(formatted_df)

# Prompt templates for instruction tuning.
#   prompt_input    - used when an example carries extra input text
#                     ({instruction} and {input} placeholders)
#   prompt_no_input - used when there is no input text ({instruction} only)
#   response_split  - marker that precedes the answer; Prompter.get_response
#                     splits generated text on it
instruct_template = {
    "prompt_input": "아래는 작업을 설명하는 지침과 추가 입력을 제공하는 입력이 짝을 이루는 예제입니다. 요청을 적절히 완료하는 답변을 작성해주세요.\n\n### 지침:\n{instruction}\n\n### 입력:\n{input}\n\n### 답변:\n",
    "prompt_no_input" : "아래는 도배 분야와 관련된 질문입니다. 질문에 적절한 답변을 간단하게 작성해주세요. \n\n### 지침:\n{instruction}\n\n### 답변:\n",
    "response_split": "### 답변:"
}

In [11]:
# Builds prompts from the instruction template and extracts answers from them
class Prompter(object):
    """Formats instruction prompts and pulls the answer out of model output.

    The templates come from the module-level `instruct_template` dict.
    """

    def __init__(self, verbose: bool = False):
        # `verbose` is accepted but not used by this class.
        self.template = instruct_template

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        """Return the prompt for one example.

        Args:
            instruction: Text substituted for `{instruction}`.
            input: Optional extra context; when truthy the "prompt_input"
                template is used, otherwise "prompt_no_input".
            label: Optional answer text appended directly after the prompt.

        Returns:
            The formatted prompt, followed by `label` when one is given.
        """
        fields = {"instruction": instruction}
        if input:
            template_key = "prompt_input"
            fields["input"] = input
        else:
            template_key = "prompt_no_input"

        prompt = self.template[template_key].format(**fields)
        return f"{prompt}{label}" if label else prompt

    def get_response(self, output: str) -> str:
        """Return the stripped text that follows the first response marker.

        Raises:
            IndexError: If `output` does not contain the response marker.
        """
        pieces = output.split(self.template["response_split"])
        return pieces[1].strip()

prompter = Prompter()

In [12]:
# Token generation function
def tokenize(prompt, add_eos_token=True):
    """Tokenize one prompt and attach causal-LM labels.

    Uses the module-level `tokenizer` and truncates to `cutoff_len` tokens
    without padding.

    Args:
        prompt: The text to tokenize.
        add_eos_token: When True, append the tokenizer's EOS id (with
            attention mask 1) unless the sequence already ends with EOS or
            has already reached `cutoff_len`.

    Returns:
        The tokenizer output with an extra "labels" entry that is a copy of
        "input_ids".
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # `not input_ids` guards the `[-1]` lookup: a tokenizer that adds no BOS
    # token yields an empty list for an empty prompt.
    needs_eos = (
        add_eos_token
        and len(input_ids) < cutoff_len
        and (not input_ids or input_ids[-1] != tokenizer.eos_token_id)
    )
    if needs_eos:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = input_ids.copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Turn one dataset record into tokenized training features.

    Builds the full prompt (instruction, optional input, answer) and tokenizes
    it. Unless `train_on_inputs` is set, the labels covering the prompt part
    are replaced with -100, leaving real labels only on the answer tokens.

    Args:
        data_point: Mapping with "instruction", "input" and "output" entries.

    Returns:
        The tokenized full prompt, including its "labels" list.
    """
    instruction = data_point["instruction"]
    context = data_point["input"]

    tokenized_full_prompt = tokenize(
        prompter.generate_prompt(instruction, context, data_point["output"])
    )
    if train_on_inputs:
        return tokenized_full_prompt

    # Tokenize the prompt without the answer to learn how many leading
    # positions belong to the prompt.
    prompt_only = tokenize(
        prompter.generate_prompt(instruction, context),
        add_eos_token=add_eos_token,
    )
    masked_len = len(prompt_only["input_ids"])
    if add_eos_token:
        # The EOS appended to the prompt-only pass is not part of the prompt.
        masked_len -= 1

    answer_labels = tokenized_full_prompt["labels"][masked_len:]
    tokenized_full_prompt["labels"] = [-100] * masked_len + answer_labels
    return tokenized_full_prompt

In [13]:
# Build the training set.
# No validation split is used: `val_data` stays None and is what the Trainer
# cell passes as `eval_dataset`.
val_data = None
train_data = dacon_dataset.shuffle() # shuffle the examples (no explicit seed argument)
# Tokenize every record and attach labels via generate_and_tokenize_prompt.
train_data = train_data.map(generate_and_tokenize_prompt)

# Define the LoRA config.
# Note: this rebinds `config`, which the model-loading cell used for the
# AutoConfig object.
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM")

# NOTE(review): both calls rebind `model`, so re-running this cell without
# reloading the base model would wrap an already-wrapped model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config) # Applying LoRA

Map: 100%|█████████████████████████████████████████████████████████████████| 6440/6440 [00:02<00:00, 2615.44 examples/s]


In [14]:
# Optionally warm-start the LoRA weights from a checkpoint directory.
# Nothing happens when `resume_from_checkpoint` is falsy.
if resume_from_checkpoint:
    full_checkpoint = os.path.join(resume_from_checkpoint, "pytorch_model.bin")

    if os.path.exists(full_checkpoint):
        # Full checkpoint found.
        checkpoint_name = full_checkpoint
    else:
        # Fall back to a LoRA-only checkpoint.
        checkpoint_name = os.path.join(resume_from_checkpoint, "adapter_model.bin")
        # NOTE(review): this replaces the directory path with True, which is
        # later passed to trainer.train(); confirm that is the intended value.
        resume_from_checkpoint = True

    if not os.path.exists(checkpoint_name):
        print(f"Checkpoint {checkpoint_name} not found")
    else:
        print(f"Restarting from {checkpoint_name}")
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)

In [15]:
# Build the Trainer.
# NOTE(review): `weight_decay` from the hyperparameter cell is not passed to
# TrainingArguments, so the Trainer's default weight decay applies; confirm
# whether that is intended.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_ratio=warmup_ratio,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=1,
        # Taken from the hyperparameter cell instead of a repeated literal.
        optim=optimizer,
        evaluation_strategy="no",
        save_strategy="steps",
        max_grad_norm=max_grad_norm,
        save_steps=30,
        lr_scheduler_type=lr_scheduler,
        output_dir=output_dir,
        save_total_limit=2,
        load_best_model_at_end=False,
        ddp_find_unused_parameters=False,
        group_by_length=False,
        report_to="none",
    ),
    # Pads each batch dynamically (to a multiple of 8) and returns tensors.
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

# The generation cache is not needed during training.
model.config.use_cache = False
model.print_trainable_parameters()

# `model` is deliberately not wrapped with torch.compile here: the Trainer
# above already holds its own reference to the model, so compiling afterwards
# would never affect training and would only rebind `model` to a wrapper
# module for the cells that follow.

torch.cuda.empty_cache()
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

trainable params: 28,311,552 || all params: 8,058,572,800 || trainable%: 0.3513


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
1,1.7366
2,1.7638
3,1.7763
4,1.7134
5,1.8552
6,1.7917
7,1.6341
8,1.8835
9,1.7208
10,1.9693




TrainOutput(global_step=1206, training_loss=0.6566540499898925, metrics={'train_runtime': 21096.7381, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.057, 'total_flos': 1.4037601124293018e+17, 'train_loss': 0.6566540499898925, 'epoch': 2.996273291925466})

In [16]:
# Save the trained adapter and the tokenizer.
model.save_pretrained(output_dir)
# Writes an empty state dict as pytorch_model.bin next to the adapter files.
# NOTE(review): the purpose of this placeholder file is not evident from this
# notebook; confirm it is still required.
model_path = os.path.join(output_dir, "pytorch_model.bin")
torch.save({}, model_path)
tokenizer.save_pretrained(output_dir)

# Merge the trained LoRA layers into the base LLM.
# NOTE(review): empty_cache() only releases memory that is no longer
# referenced; models still bound to notebook variables stay on the GPU.
torch.cuda.empty_cache()

# Reload the base model in float16, without quantization, for merging.
base_model = AutoModelForCausalLM.from_pretrained(
    base_LLM_model,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device)

# Do not pass `device` positionally here: the third positional parameter of
# PeftModel.from_pretrained is `adapter_name`, not a device. Device placement
# was already decided when `base_model` was loaded.
model = PeftModel.from_pretrained(base_model, output_dir)
model = model.merge_and_unload()

final_save_folder = './custom_LLM_llama_final'
model.save_pretrained(final_save_folder)
tokenizer.save_pretrained(final_save_folder)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:46<00:00, 11.72s/it]


('./custom_LLM_llama_final/tokenizer_config.json',
 './custom_LLM_llama_final/special_tokens_map.json',
 './custom_LLM_llama_final/tokenizer.json')