In [2]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF
import pickle

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    LlamaForCausalLM, 
    GemmaForCausalLM
)
from accelerate import Accelerator
from torchinfo import summary

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 4비트 양자화 설정
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# 모델 ID 
#model_id = "beomi/llama-2-ko-7b"
model_id = "beomi/gemma-ko-7b"
# 토크나이저 로드 및 설정
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.use_default_system_prompt = False

# 모델 로드 및 양자화 설정 적용
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True )

#model = LlamaForCausalLM.from_pretrained(
#    model_id,
#    quantization_config=bnb_config,
#    device_map="auto",
#    trust_remote_code=True
#)

#model = GemmaForCausalLM.from_pretrained(
#    model_id,
#    quantization_config=bnb_config,
#    device_map="auto",
#    trust_remote_code=True
#)

#print(model)
#for name, param in model.named_parameters():
#    print(name, param.requires_grad)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:  17%|█▋        | 1/6 [00:54<04:30, 54.11s/it]

In [3]:
# 전체 파라미터 수 및 학습 가능한 파라미터 수 계산
total_parameters = sum(p.numel() for p in model.parameters())
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total number of parameters: {total_parameters}")
print(f"Trainable parameters: {trainable_parameters}")

# 대략적인 모델 크기 계산 (바이트 단위)
model_size_mb = total_parameters * 4 / (1024 ** 2)  # float32 기준
print(f"Approximate model size (float32): {model_size_mb:.2f} MB")

# 대략적인 모델 크기 계산 (바이트 단위)
model_size_mb = total_parameters * 0.5 / (1024 ** 2)  # 4비트 기준
print(f"Approximate model size (4bits): {model_size_mb:.2f} MB")

Total number of parameters: 1515268096
Trainable parameters: 524363776
Approximate model size (float32): 5780.29 MB
Approximate model size (4bits): 722.54 MB


In [4]:
# 모델 구조 출력
print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [5]:
#num_layers = 32
#target_module_list = []
#
#for i in range(num_layers - 12, num_layers):
#    target_module_list.extend([
#        f"model.layers.{i}.self_attn.q_proj",
#        f"model.layers.{i}.self_attn.k_proj",
#        f"model.layers.{i}.self_attn.v_proj",
#        f"model.layers.{i}.self_attn.o_proj",
#        f"model.layers.{i}.mlp.gate_proj",
#        f"model.model.layers.{i}.mlp.up_proj",
#        f"model.layers.{i}.mlp.down_proj",
#    ])
#for i in range(5):
#    target_module_list.extend([
#        f"model.layers.{i}.self_attn.q_proj",
#        f"model.layers.{i}.self_attn.k_proj",
#        f"model.layers.{i}.self_attn.v_proj",
#        f"model.layers.{i}.self_attn.o_proj",
#        f"model.layers.{i}.mlp.gate_proj",
#        f"model.model.layers.{i}.mlp.up_proj",
#        f"model.layers.{i}.mlp.down_proj",
#    ])

lora_config = LoraConfig(
    r=4,  
    lora_alpha=32, 
    lora_dropout=0.3,  
    #target_modules=target_module_list,  
    task_type = "QUESTION_ANS",
    bias="none",
)

model = get_peft_model(model, lora_config)
#for name, param in model.named_parameters():
#    print(name, param.requires_grad)

In [7]:
# 모델 구조 출력
print(model)

PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.3, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
  

In [8]:
total_parameters = sum(p.numel() for p in model.parameters())
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total number of parameters: {total_parameters}")
print(f"Trainable parameters: {trainable_parameters}")

Total number of parameters: 1515728896
Trainable parameters: 460800


In [9]:
def setup_llm_pipeline():
    # HuggingFacePipeline 객체 생성
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        #task="LlamaForCausalLM",
        temperature=0.2,
        return_full_text=False,
        max_new_tokens=128, 
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)

    for name, param in model.named_parameters():
        print(name, param.requires_grad)
        
    return hf

In [10]:
# LLM 파이프라인
llm = setup_llm_pipeline()

The model 'PeftModelForQuestionAnswering' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptFor

base_model.model.model.embed_tokens.weight False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.k_proj.weight False
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight True
base_model.model.model.layers.0.self_attn.o_proj.weight False
base_model.model.model.layers.0.mlp.gate_proj.weight False
base_model.model.model.layers.0.mlp.up_proj.weight False
base_model.model.model.layers.0.mlp.down_proj.weight False
base_model.model.model.layers.0.input_layernorm.weight False
base_model.model.model.layers.0.post_attention_layernorm.weight False
base_model.model.model.layers.1.self_attn.q_proj.base_layer.weight Fals

  warn_deprecated(


In [None]:
# 전체 파라미터 수 및 학습 가능한 파라미터 수 계산
total_parameters = sum(p.numel() for p in llm.parameters())
trainable_parameters = sum(p.numel() for p in llm.parameters() if p.requires_grad)

print(f"Total number of parameters: {total_parameters}")
print(f"Trainable parameters: {trainable_parameters}")