# Mixtral 8x7B - Mixture of Experts
- A100 이상에서 무난히 실행 가능
- Fine Tuning 포함

In [None]:
# install packages

!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets scipy
!pip install -q trl
!pip install flash-attn --no-build-isolation

## Load the Base Model
- 4bit 양자화 모델 불러오기

In [None]:
# Log in to the Hugging Face Hub from inside the notebook
# (presumably required to download the base model below — confirm)

from huggingface_hub import notebook_login
notebook_login()

In [None]:
base_model = "mistralai/Mixtral-8x7B-v0.1"

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantization settings passed to from_pretrained() when the base model is loaded.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # use the "nf4" 4-bit quantization type
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16  # run computations in bfloat16
)

In [None]:
# Load the base model using the 4-bit quantization settings from nf4_config.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map='auto',  # automatic placement of the model across available devices
    quantization_config=nf4_config,
    # NOTE(review): with use_cache=False the generate_response() call made before
    # training also runs without a KV cache, which is presumably slower — confirm
    use_cache=False,
    attn_implementation="flash_attention_2"  # relies on the flash-attn package installed above
)

In [None]:
# Tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reuse the end-of-sequence token for padding and pad on the right-hand side
# (presumably because the tokenizer ships without a dedicated pad token — confirm)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def generate_response(prompt, model):
  """Sample a completion for ``prompt`` and return only the newly generated text.

  Uses the module-level ``tokenizer`` for encoding and decoding.

  :param prompt: str: Prompt text fed to the model
  :param model: Causal LM exposing ``generate`` and ``device``
  :return: str: Generated continuation, without the prompt and without
      special tokens such as ``<s>`` / ``</s>``
  """
  encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  # Follow the model's own device instead of assuming 'cuda'; the model is
  # loaded with device_map='auto'.
  model_inputs = encoded_input.to(model.device)

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=512,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  # The returned sequence starts with the prompt tokens. Dropping them by
  # length is reliable, whereas str.replace(prompt, "") silently does nothing
  # whenever the decoded text does not reproduce the prompt verbatim.
  prompt_length = model_inputs["input_ids"].shape[1]
  new_token_ids = generated_ids[0, prompt_length:]

  return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [None]:
# The instruction block must be closed with "[/INST]"; the previous "[\INST]"
# was both the wrong tag and an invalid escape sequence.
prompt="""[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM. \nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[/INST]"""

generate_response(prompt, model)

In [None]:
print(model)

In [None]:
# Dataset of code-generation instructions (Alpaca-style); only the "train" split is loaded
from datasets import load_dataset

dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")
dataset

In [None]:
df = dataset.to_pandas()
df.head(10)

In [None]:
# Formatting the Dataset

def generate_prompt(data_point):
    """Build the instruction-tuning text for one dataset record.

    The request is wrapped in ``[INST] ... [/INST]`` and followed by the
    expected answer; the whole sample is enclosed in ``<s>`` / ``</s>``.

    :param data_point: dict: Record with "instruction" and "output" keys and
        an optional "input" key holding additional context
    :return: str: Formatted (not tokenized) training text
    """
    # NOTE(review): the text already starts with "<s>"; if the tokenizer also
    # prepends its own BOS token the sample may begin with two — confirm.
    prefix_text = ('Below is an instruction that describes a task. Write a response that '
                   'appropriately completes the request.\n\n')
    request = f'{prefix_text} {data_point["instruction"]}'
    # Mention the inputs only for samples that carry additional context;
    # a missing or empty "input" is treated as "no context".
    if data_point.get('input'):
        request += f' here are the inputs {data_point["input"]}'
    # Both variants close with "</s>" directly after the answer (previously
    # the no-input variant inserted a stray space before it).
    return f'<s>[INST]{request} [/INST]{data_point["output"]}</s>'

# Format every record with generate_prompt() and attach the results
# to the dataset as a new "prompt" column
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column("prompt", text_column)

In [None]:
dataset = dataset.shuffle(seed=1234)  # shuffle the dataset with a fixed seed
# Tokenize the "prompt" column in batches; the tokenizer output is added to the dataset
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [None]:
# Hold out 20% of the records for evaluation. The seed makes the split
# reproducible across runs, matching the seeded shuffle applied earlier.
dataset = dataset.train_test_split(test_size=0.2, seed=1234)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
train_data

In [None]:
train_data["input_ids"][:10]

### Formatting 이후 예시
``` json
{
"text":"<s>[INST] Create a function to calculate the sum of a sequence of integers. here are the inputs [1, 2, 3, 4, 5] [/INST]
# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum</s>",
"instruction":"Create a function to calculate the sum of a sequence of integers",
"input":"[1, 2, 3, 4, 5]",
"output":"# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum",
"prompt":"<s>[INST] Create a function to calculate the sum of a sequence of integers. here are the inputs [1, 2, 3, 4, 5] [/INST]
# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum</s>"

}
```

In [None]:
print(test_data)

In [None]:
# Quick-test subset (first 100 records of each split): delete the lines below for a real run
test_data = test_data.select(range(100))
train_data = train_data.select(range(100))

## Training Setup

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings used for fine-tuning the quantized base model.
peft_config = LoraConfig(
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    r=64,  # rank of the LoRA update matrices
    bias="none",  # bias parameters are not trained
        # Names of the modules that receive LoRA adapters.
        # NOTE(review): "gate_proj" / "up_proj" / "down_proj" are the MLP layer names of
        # dense Llama/Mistral models; check the print(model) output to confirm that the
        # Mixtral expert layers really use these names, otherwise they match nothing.
        target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    task_type="CAUSAL_LM"
)

In [None]:
# Prepare the quantized model for training, then wrap it with the LoRA
# adapters described by peft_config; `model` is rebound to the wrapped model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
# Report how many of the model's parameters are trainable
def print_trainable_parameters(model):
    """
    Print the trainable parameter count, the total parameter count and the
    trainable share (in percent) of the given model.
    """
    # One pass over the parameters: remember each size and whether it is trainable.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(size for size, _ in sizes)
    trainable = sum(size for size, needs_grad in sizes if needs_grad)
    summary = f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    print(summary)

In [None]:
print_trainable_parameters(model)

### Hyper-parameters for training

In [None]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    print(torch.cuda.device_count())
    # Mark the model as model-parallel.
    # NOTE(review): presumably this keeps the trainer from wrapping the already
    # sharded model in DataParallel — confirm.
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run; passed to the trainer as `args`.
args = TrainingArguments(
    output_dir="./results",  # output directory for the run
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    # NOTE(review): both mixed-precision modes are off although the quantization
    # config computes in bfloat16 — confirm this is intended.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,  # gradient clipping threshold
    max_steps=-1,  # no step limit; num_train_epochs decides the run length
    # NOTE(review): a warmup ratio is set but the scheduler is "constant"; the
    # warmup presumably only applies with "constant_with_warmup" — confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning on the "prompt" text column.
# `model` has already been wrapped by get_peft_model(), so no peft_config is
# passed here: older TRL versions ignore it for an existing PeftModel, and
# newer ones reject a PeftModel combined with a peft_config.
trainer = SFTTrainer(
    model=model,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=args,
    packing=False,
    dataset_text_field="prompt",
    train_dataset=train_data,
    eval_dataset=test_data
)


In [None]:
import torch
import gc

# Collect unreachable Python objects first so the CUDA tensors they hold are
# released; only then can empty_cache() return that memory to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
# Save the trained model under "Mixtral_Alpace_v2".
# NOTE(review): "Alpace" looks like a typo for "Alpaca"; left as-is because it is the output path.
trainer.save_model("Mixtral_Alpace_v2")

## Save Model and Push to Hub

In [None]:
# !pip install huggingface-hub -qU
# from huggingface_hub import notebook_login

# notebook_login()

# trainer.push_to_hub("Promptengineering/mistral-instruct-generation")

In [None]:
# Fold the trained LoRA weights into the base model and drop the adapter wrappers.
# NOTE(review): the base model was loaded 4-bit quantized; confirm that merging into
# quantized weights gives acceptable quality, or reload an unquantized base before merging.
merged_model = model.merge_and_unload()

In [None]:
def generate_response(prompt, model, max_new_tokens=150):
    """Sample a completion for ``prompt`` and return the full decoded sequence.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    :param prompt: str: Prompt text fed to the model
    :param model: Causal LM exposing ``generate`` and ``device``
    :param max_new_tokens: int: Upper bound on the number of generated tokens
    :return: str: Decoded output of the first sequence; it still contains the
        prompt and any special tokens
    """
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    # Follow the model's own device instead of assuming 'cuda'.
    model_inputs = encoded_input.to(model.device)

    generated_ids = model.generate(**model_inputs,
                                   max_new_tokens=max_new_tokens,
                                   do_sample=True,
                                   pad_token_id=tokenizer.eos_token_id)

    decoded_output = tokenizer.batch_decode(generated_ids)

    return decoded_output[0]

In [None]:
#prompt = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM.\nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[/INST]"
prompt = "[INST]Make me a bubble-sort program[/INST]"

In [None]:
generate_response(prompt, merged_model)