# Linear Quantization 예제

In [None]:
# Install pinned versions of the libraries this notebook depends on, so the
# run is reproducible regardless of what the runtime ships by default.
!pip install bitsandbytes==0.43.1
!pip install accelerate==0.30.1
!pip install transformers==4.39.3

In [None]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub from inside the notebook. The Llama
# checkpoint loaded later is gated, so the account used here must already have
# accepted the model's terms on the Hub (see the note next to model_id).
notebook_login()

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
# The Llama model is gated on Hugging Face: visit the model page and accept
# the terms before running this cell.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantization settings passed to the model loader below.
config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,         # nested quantization of the quantization constants
    bnb_4bit_quant_type="nf4",              # most efficient 4-bit type (also used by QLoRA)
    load_in_4bit=True,                      # load the weights in 4-bit
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=config,  # apply the 4-bit settings defined above
    device_map="auto",           # let the loader decide device placement
    torch_dtype="auto",          # let the loader pick the dtype
)

# 단일 추론

In [None]:
# Inference: the conversation to send to the model, as a list of chat turns.
# Every turn must use exactly the keys "role" and "content"; the chat template
# reads message["content"], so a misspelled key would silently drop the text.
messages = [
    {"role": "system", "content": "You are a kind robot."},
    {"role": "user", "content": "이순신이 누구야?"}
]
# Render the conversation with the model's chat template, tokenize it into a
# PyTorch tensor, and append the generation prompt so the model answers next.
encoded_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
# Place the prompt on the same device as the model before generating.
input_ids = encoded_prompt.to(model.device)

print(input_ids)

In [None]:
# Decode the first (only) prompt back to text to inspect the rendered template.
decoded_prompt = tokenizer.decode(input_ids[0])
print(decoded_prompt)

In [None]:
# Define the stop tokens: generation ends at the tokenizer's EOS token or at
# the "<|eot_id|>" token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

In [None]:
# Generation settings for the single-prompt run.
generation_options = dict(
    max_new_tokens=300,
    eos_token_id=terminators,
    do_sample=True,  # sampling: the same prompt can yield different answers
    temperature=0.7,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(input_ids=input_ids, **generation_options)

# Drop the prompt tokens so that only the newly generated answer remains.
prompt_length = input_ids.shape[-1]
response = outputs[0][prompt_length:]

# Decode the answer tokens to text.
answer_text = tokenizer.decode(response, skip_special_tokens=True)
print("response : ", answer_text)

# 배치 추론
- CPU가 많이 필요하므로 T4로 불가함

In [None]:
# First conversation of the batch. The system prompt is spelled the same way
# as in the other conversations ("You are a kind robot.") so that both batch
# items run under an identical instruction.
messages1 = [
    {"role": "system", "content": "You are a kind robot."},
    {"role": "user", "content": "이순신이 누구야?"}
]

# Render the first conversation to a plain prompt *string* (not token ids):
# the batch cell later tokenizes both prompts together with padding.
# The keyword must be spelled `tokenize`; a misspelled name is not recognized,
# leaving tokenize at its default (True) and producing a tensor instead.
# return_tensors has no effect while tokenize=False.
prompt1 = tokenizer.apply_chat_template(
    messages1,
    add_generation_prompt=True,
    return_tensors="pt",
    tokenize=False
)

print(prompt1)

In [None]:
# Second conversation of the batch: same system prompt, different question.
messages2 = [
    dict(role="system", content="You are a kind robot."),
    dict(role="user", content="세종대왕이 누구야?"),
]

# Render the second conversation to a prompt string (tokenize=False), ending
# with the generation prompt so the model produces the assistant turn.
prompt2 = tokenizer.apply_chat_template(
    messages2,
    tokenize=False,
    return_tensors="pt",
    add_generation_prompt=True,
)

print(prompt2)

In [None]:
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

prompt_batch = [prompt1, prompt2]
input_ids_batch = tokenizer(prompt_batch, return_tensors='pt',padding="longest")['input_ids']

In [None]:
# Define a list of terminators, which specifies the end-of-sequence token ID
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [None]:
outputs = model.generate(
    input_ids=input_ids_batch,           # Provide the tokenized input prompt
    max_new_tokens=30,                  # Specify the maximum number of new tokens to generate
    eos_token_id=terminators,            # Specify the end-of-sequence token ID to stop generation
    do_sample=True,                      # Enable stochastic mode (randomness)
    temperature=0.7,                     # Control the randomness of predictions. Higher values make the output more random, lower values make it more deterministic.
    no_repeat_ngram_size=2,              # Prevent repetition of 2-gram sequences. This option prevents the model from repeating the same sequence of 2 tokens (words) more than once.
    pad_token_id=tokenizer.eos_token_id  # Use the end-of-sequence token as the padding token.
)


In [None]:
# Extract and decode the response for each item in the batch. Every row of the
# padded batch has the same prompt length, so one slice offset serves all rows.
prompt_length = input_ids_batch.shape[-1]
for position, generated in enumerate(outputs, start=1):
    response = generated[prompt_length:]
    decoded = tokenizer.decode(response, skip_special_tokens=True)
    print(f"response {position}: ", decoded)