# Fine-tuning with Quantization

## Setup

!python --version
!nvidia-smi

In [2]:
!pip3 install torch --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch
  Using cached https://download.pytorch.org/whl/cu124/torch-2.5.1%2Bcu124-cp310-cp310-linux_x86_64.whl (908.3 MB)
Collecting nvidia-cusolver-cu12==11.6.1.9
  Using cached https://download.pytorch.org/whl/cu124/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
Collecting nvidia-cuda-cupti-cu12==12.4.127
  Using cached https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127
  Using cached https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
Collecting nvidia-curand-cu12==10.3.5.147
  Using cached https://download.pytorch.org/whl/cu124/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)
Collecting sympy==1.13.1
  Using cached https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collectin

In [None]:
%pip install -q -U transformers==4.38.2
%pip install -q -U datasets==2.18.0
%pip install -q -U bitsandbytes==0.42.0
%pip install -q -U peft==0.9.0
%pip install -q -U trl==0.7.11
%pip install -q -U accelerate==0.27.2
%pip install --upgrade huggingface_hub

%pip install python-dotenv
%pip install ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TrainingArguments,
    pipeline, 
    logging
    )
from peft import(
    LoraConfig, 
    PeftModel, 
    get_peft_model
    )
from trl import SFTTrainer

import os
from dotenv import load_dotenv
from huggingface_hub import login

# load .env
load_dotenv()

# HUGGINGFACE_ACCESS_TOKEN = os.environ.get('HUGGINGFACE_ACCESS_TOKEN')
# login(HUGGINGFACE_ACCESS_TOKEN)

BASE_MODEL = "google/gemma-2b-it"
ADAPTER_MODEL = "qlora_adapter"

print(torch.cuda.get_device_capability()[0])

AssertionError: Torch not compiled with CUDA enabled

In [3]:
data_files = {"train": "data/train.csv", "test": "data/test.csv"}
custom_dataset = load_dataset("csv", data_files=data_files)
print(custom_dataset['train'][:1])
print(custom_dataset['test'][:1])

custom_dataset

{'request': ['항상 사무실에서 가장 먼저 전화를 받는 사람에게 상을 주고 싶어.'], 'title': ['전화 응답왕상'], 'winner': ['김전화'], 'description': ['항상 전화가 울리기 무섭게 먼저 받아내는 그의 빠른 대처 능력을 칭찬하기 위해 이 상을 수여합니다.'], 'publisher': ['빠른 대응 전문가 협회']}
{'request': ['항상 책을 읽으며 점심 시간을 보내는 사람에게 상을 주고 싶어.'], 'title': ['점심 독서가상'], 'winner': ['이독서'], 'description': ['항상 점심 시간에 책을 읽으며 지식을 쌓는 그의 열정을 칭찬하기 위해 이 상을 수여합니다.'], 'publisher': ['사내 독서 장려회']}


DatasetDict({
    train: Dataset({
        features: ['request', 'title', 'winner', 'description', 'publisher'],
        num_rows: 200
    })
    test: Dataset({
        features: ['request', 'title', 'winner', 'description', 'publisher'],
        num_rows: 4
    })
})

In [4]:
def generate_prompt(example):
    prompt_list = []
    for i in range(len(example['title'])):
        prompt_list.append(
            f"""<bos><start_of_turn>user
다음 요청을 고려해서 아래의 예시처럼 상장 문구를 작성해주세요:

한국어로 작성해.

[요청]
{example['request'][i]}<end_of_turn>
<start_of_turn>model
{{'title':'{example['title'][i]}', 'winner':'{example['winner'][i]}', 'description':'{example['description'][i]}', 'publisher':'{example['publisher'][i]}'}}<end_of_turn><eos>"""
)
    return prompt_list

train_data = custom_dataset['train']
print(generate_prompt(train_data[:1])[0])

<bos><start_of_turn>user
다음 요청을 고려해서 아래의 예시처럼 상장 문구를 작성해주세요:

한국어로 작성해.

[요청]
항상 사무실에서 가장 먼저 전화를 받는 사람에게 상을 주고 싶어.<end_of_turn>
<start_of_turn>model
{'title':'전화 응답왕상', 'winner':'김전화', 'description':'항상 전화가 울리기 무섭게 먼저 받아내는 그의 빠른 대처 능력을 칭찬하기 위해 이 상을 수여합니다.', 'publisher':'빠른 대응 전문가 협회'}<end_of_turn><eos>


In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
    )


model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    )
model.config.use_cache = False
model.config.pretraining_tp = 1
print(model.get_memory_footprint())


tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, 
    add_special_tokens=True
    )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'


print(BASE_MODEL)
print(ADAPTER_MODEL)
print(model)

NameError: name 'BitsAndBytesConfig' is not defined

In [6]:
lora_config = LoraConfig(
    r=8,
    lora_alpha = 16,
    lora_dropout = 0.1,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM"
)

model = get_peft_model(model=model, peft_config=lora_config)
print(model.print_trainable_parameters())
print(model.get_memory_footprint())

trainable params: 9,805,824 || all params: 2,515,978,240 || trainable%: 0.3897420034920493
None
2145964032


In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        # num_train_epochs = 1,
        max_steps=300,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=0.03,
        learning_rate=2e-4,
        # fp16=False,
        # bf16=True,
        logging_steps=50,
        save_steps=50,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)



In [8]:
trainer.train()

Step,Training Loss
50,1.4907
100,0.5404
150,0.3241
200,0.2001
250,0.1401
300,0.1132


TrainOutput(global_step=300, training_loss=0.4681075604756673, metrics={'train_runtime': 336.4223, 'train_samples_per_second': 3.567, 'train_steps_per_second': 0.892, 'total_flos': 2008699274649600.0, 'train_loss': 0.4681075604756673, 'epoch': 6.0})

In [9]:
pipe_finetuned = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=512)

prompt = '<bos><start_of_turn>user\n다음 요청을 고려해서 아래의 예시처럼 상장 문구를 작성해주세요:\n한국어 단어만 사용할 것. 영어를 사용하면 안돼.\n[요청]\n항상 새로운 소프트웨어를 개발하는 개발자에게 상을 주고 싶어<end_of_turn>\n<start_of_turn>model\n'
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.75,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

{'title':'코드 마법사상', 'winner':'최개발', 'description':'혁신적인 소프트웨어를 개발해 세상을 변화시키는 그의 뛰어난 개발 능력을 칭찬하기 위해 이 상을 수여합니다.', 'publisher':'소프트웨어 개발 협회'}


In [10]:
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_la

In [11]:
trainer.model.get_memory_footprint()

2145973248

In [12]:
trainer.model.save_pretrained(ADAPTER_MODEL)

## Combining models

In [None]:
# import torch
# import gc
# torch.cuda.empty_cache()
# del trainer, model
# gc.collect()

# base_model = AutoModelForCausalLM.from_pretrained(
#     BASE_MODEL,
#     quantization_config=quantization_config,
#     device_map="cuda",
#     torch_dtype=torch.float16
#     # low_cpu_mem_usage=True,
#     # torch_dtype=torch.bfloat16,
#     )
# base_model.config.use_cache = False

# integrated_model = PeftModel.from_pretrained(
#     model=base_model,
#     model_id=ADAPTER_MODEL, 
#     ) # LoRA 가중치를 가져와 기본 모델에 통합



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Evaluate

In [None]:
BASE_MODEL = "google/gemma-2b-it"
FINETUNE_MODEL = "gemma-2b-it-award-factory-v2"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [3]:
finetune_model.get_memory_footprint()

10091798528

In [4]:
pipe_finetuned = pipeline(
    "text-generation", 
    model=finetune_model, 
    tokenizer=tokenizer, 
    max_new_tokens=2048
    )

prompt = '<bos><start_of_turn>user\n다음 요청을 고려해서 아래의 예시처럼 상장 문구를 작성해주세요:\n한국어 단어만 사용할 것. 영어를 사용하면 안돼.\n[요청]\n항상 새로운 소프트웨어를 개발하는 개발자에게 상을 주고 싶어<end_of_turn>\n<start_of_turn>model\n'
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.5,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

{'title':'코드 마법사상', 'winner':'박개발', 'description':'혁신적인 소프트웨어를 개발해 세상을 변화시키는 그의 뛰어난 개발 능력을 칭찬하기 위해 이 상을 수여합니다.', 'publisher':'소프트웨어 개발 협회'}
