In [1]:
import warnings
warnings.filterwarnings('ignore')

## 7.2.1 bits and bytes

In [2]:
! pip install -U transformers bitsandbytes



In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "EleutherAI/polyglot-ko-1.3b"

In [4]:
! nvidia-smi

Fri Jan 24 13:46:26 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

prompt = "한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘."

inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    inputs,
    max_length=len(inputs[0]) + 50,  # 최대 출력 길이
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.9,
    top_k=50
)

response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
print(f"Prompt: {prompt}")
print(f"Response: {response}")

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: 한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘.
Response:  마지막으로 자기 전에 하루를 돌아보는 대화를 하면 좋겠다. ""오늘 하루 어땠어?"" ""오늘은 정말 재미있는 하루였어"" ""나도 오늘 엄청 재미있었어"" 같은 간단한


In [6]:
! nvidia-smi

Fri Jan 24 13:46:46 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0              68W /  70W |   5369MiB / 15360MiB |     96%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

del tokenizer
del model

In [8]:
! nvidia-smi

Fri Jan 24 13:46:46 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0              39W /  70W |   5335MiB / 15360MiB |     34%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [9]:
# 8비트 양자화 모델 불러오기
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config_8bit,
    device_map="auto"
)

prompt = "한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘."

inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    inputs,
    max_length=len(inputs[0]) + 50,  # 최대 출력 길이
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.9,
    top_k=50
)

response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
print(f"Prompt: {prompt}")
print(f"Response: {response}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Prompt: 한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘.
Response: 그리고 너무 고맙다고 감사인사 해줘.내가 한국말 할 수 있다는 건 큰 축복이야. -------------------------------------------------------------------------------------------- + 나는 정말 영어 공부하기 싫어서 친구한테 말도안돼는 농담을


In [10]:
! nvidia-smi

Fri Jan 24 13:46:52 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0              37W /  70W |   1673MiB / 15360MiB |     34%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

del tokenizer
del model

In [12]:
! nvidia-smi

Fri Jan 24 13:46:53 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0              32W /  70W |   1669MiB / 15360MiB |     28%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
# 4비트 양자화 모델 불러오기
bnb_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config_4bit,
    device_map="auto"
)

prompt = "한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘."

inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    inputs,
    max_length=len(inputs[0]) + 50,  # 최대 출력 길이
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.9,
    top_k=50
)

response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
print(f"Prompt: {prompt}")
print(f"Response: {response}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Prompt: 한국어로 반갑게 인사해줘. 10글자 이상으로 길게 해줘.
Response:  2. 한국 문화에 관심을 가져. 우리는 한국인, 일본인, 중국인, 태국인, 필리핀인, 미국인, 유럽인, 그리고 아메리카 인디언의 자손들이고, 한국에 사는 모든 사람들은 전 세계의 모든 사람들


In [14]:
! nvidia-smi

Fri Jan 24 13:46:57 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0              62W /  70W |   1293MiB / 15360MiB |     57%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    