# Mixtral-8x7B 모델 미세 조정하기

## 준비사항 

이번 실습은 `ml.p4de.24xlarge` 인스턴스에서 수행되었습니다.

그리고 PyTorch 2.3.1, Python 3.11, CUDA Version 12.1을 사용하였습니다.

In [None]:
# Check the GPU details and the CUDA version reported by nvidia-smi
!nvidia-smi

In [None]:
# nvidia-smi에 표시된 CUDA 버전과 맞춰서 torch를 설치합니다. (이 경우 12.1)
# 여기에서 설치 명령어를 확인하세요: https://pytorch.org/get-started/locally/
%pip install -U torch --index-url https://download.pytorch.org/whl/cu121
%pip install -U transformers==4.41.2
%pip install -U peft==0.11.1
%pip install -U datasets==2.19.2
%pip install -U bitsandbytes==0.43.1
%pip install -U scipy==1.13.1
%pip install -U ipywidgets==8.1.3
%pip install -U matplotlib==3.9.0

In [None]:
# Run the bitsandbytes package as a script.
# NOTE(review): presumably this is its built-in installation / CUDA self-check — confirm in the bitsandbytes docs.
!python -m bitsandbytes

In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# FSDP plugin configuration: both the model state dict and the optimizer state dict
# use the "full" config variants with offload_to_cpu=True and rank0_only=False.
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

# Accelerator built with the plugin above; it is used further down to prepare the model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
from datasets import load_dataset

# Hub id of the instruction dataset used for fine-tuning.
dataset_name = "databricks/databricks-dolly-15k"

# Slice the single "train" split: the first 800 rows are used for training and
# the following 200 rows for evaluation.
train_dataset, eval_dataset = [
    load_dataset(dataset_name, split=split_spec)
    for split_spec in ("train[0:800]", "train[800:1000]")
]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the checkpoint that gets fine-tuned.
base_model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# bitsandbytes settings: load weights in 4-bit with double quantization and
# run the quantized layers' computation in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized base model; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

## 명령어 데이터 세트 설정

프롬프트는 Mixtral-8x7B-Instruct 모델 카드에 설명된 명령어(instruction) 형식을 따라야 합니다: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1

In [None]:
# Tokenizer setup: load the base model's tokenizer with left-side padding
# and BOS-token insertion enabled.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_bos_token=True,
)
# Reuse the EOS token as the padding token.
# NOTE(review): with pad == EOS, a language-modeling data collator that masks pad
# positions in the labels may also mask genuine EOS tokens — confirm whether that is intended.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset record as a training prompt and tokenize it.

    Args:
        data_point: Record indexable by the keys "instruction", "context" and
            "response". A ``None`` or empty context leaves the
            "### Context:" section blank.

    Returns:
        The result of calling the module-level ``tokenizer`` on the rendered
        prompt text.

    Raises:
        KeyError: If one of the three keys is missing from ``data_point``.
    """
    # Build the optional context sentence up front: this avoids a nested,
    # same-quote f-string inside the template and tolerates a None context
    # (len(None) would raise TypeError).
    context = data_point["context"]
    context_line = f"Here is some context: {context}" if context else ""

    # The template lines are flush-left on purpose: any indentation would
    # become part of the prompt text.
    full_prompt = f"""

[INST] Given a question and some additional context, provide an answer. [/INST]

### Question:
{data_point['instruction']}

### Context:
{context_line}

### Response:
{data_point['response']}

</s>
"""

    return tokenizer(full_prompt)

# Render and tokenize every record of both splits with the prompt builder.
tokenized_train_dataset, tokenized_val_dataset = [
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
]

# Decode the first training example back into text to inspect the final prompt.
untokenized_text = tokenizer.decode(tokenized_train_dataset[0]["input_ids"])
print(untokenized_text)

In [None]:
# #max_length = 400 

# tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
# tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

# untokenized_text = tokenizer.decode(tokenized_train_dataset[4]['input_ids']) 
# print(untokenized_text)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then run PEFT's k-bit training preparation
# on the model before adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Counts elements via ``numel()`` over ``model.named_parameters()`` and
    prints the trainable count, the total count and the trainable percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and ``requires_grad``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration: rank 8, alpha 16, no bias training, causal-LM task.
# Adapters are attached to the modules listed in target_modules
# (NOTE(review): w1/w2/w3 are presumably the expert feed-forward weights — confirm
# against the Mixtral module names).
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "w1",
        "w2",
        "w3",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # a commonly used value
    task_type="CAUSAL_LM",
)

# Wrap the model with the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. To run without the accelerator, comment out the line below.
model = accelerator.prepare_model(model)

In [None]:
# Delete the ./dolly_mixtral_finetune directory (the training output directory) if it exists.
!rm -rf ./dolly_mixtral_finetune

In [None]:
import transformers
from datetime import datetime

# When more than one GPU is visible, flag the model as model-parallel.
# NOTE(review): presumably this stops Trainer from additionally wrapping the model
# (already spread across devices by device_map="auto") in DataParallel — confirm in the Trainer docs.
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

output_dir = "./dolly_mixtral_finetune"

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=1,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        max_steps=10,
        learning_rate=2.5e-5,
        logging_steps=5,
        # Train in bf16 mixed precision so it agrees with the bfloat16 compute dtype
        # configured for the 4-bit layers (fp16 here would mix two half-precision
        # formats). Requires GPUs with bfloat16 support.
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # directory where logs are written
        save_strategy="steps",       # save checkpoints on a step schedule
        save_steps=10,               # save a checkpoint every 10 steps
        evaluation_strategy="steps", # evaluate on a step schedule
        eval_steps=10,               # run evaluation every 10 steps
        do_eval=True,                # enable evaluation on eval_dataset
    ),
    # mlm=False selects causal (next-token) language-modeling batches.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silences the warning during training; re-enable for inference!
trainer.train()

In [None]:
# Separate tokenizer instance for inference, loaded for the same base model
# with left-side padding.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left"
)
# Reuse the EOS token as the padding token.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [None]:
# Inference prompt, laid out like the training template.
# - No literal "<s>": the tokenizer is expected to prepend the BOS token itself
#   (add_bos_token default), so spelling it out would duplicate it.
# - No trailing "</s>": that is the EOS token, and ending the prompt with it marks
#   the sequence as finished before the model has produced a response.
prompt = """

[INST] Given a question and some additional context, provide an answer. [/INST]

### Question:
Who wrote "Generative AI on AWS" by O'Reilly Media?

### Context:
Chris Fregly and Antje Barth wrote "Data Science on AWS" by O'Reilly Media.
Chris Fregly, Antje Barth, and Shelbee Eigenbrode wrote "Generative AI on AWS" by O'Reilly Media.

### Response:
"""

# Switch to inference mode: eval() turns off dropout (including the LoRA dropout),
# and the KV cache that was disabled for training is re-enabled for generation.
model.eval()
model.config.use_cache = True

model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    generated_ids = model.generate(**model_input, max_new_tokens=32)[0]
    print(eval_tokenizer.decode(generated_ids, skip_special_tokens=True))

# PEFT 모델 로드 및 추론 수행

In [None]:
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# base_model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id, 
#     quantization_config=bnb_config,
#     device_map="auto"
# )

In [None]:
# from peft import PeftModel

# peft_model = PeftModel.from_pretrained(base_model, f"{output_dir}/checkpoint-10")
# #peft_model.eval()

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(
#     base_model_id,
# #    add_bos_token=True
# )
# tokenizer.pad_token = tokenizer.eos_token

# eval_tokenizer = AutoTokenizer.from_pretrained(
#     base_model_id,
#     add_bos_token=True,
# )

In [None]:
# prompt = """

# [INST] Given a question and some additional context, provide an answer. [/INST]

# ### Question:
# Who wrote "Generative AI on AWS" by O'Reilly Media?

# ### Context: 
# Chris Fregly and Antje Barth wrote "Data Science on AWS" by O'Reilly Media.
# Chris Fregly, Antje Barth, and Shelbee Eigenbrode wrote "Generative AI on AWS" by O'Reilly Media.

# ### Response:

# """

# model_input = tokenizer(prompt, return_tensors="pt").to("cuda")

# with torch.no_grad():
#     print(tokenizer.decode(peft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))