# Hugging Face Model
##### uv add huggingface_hub
##### uv add transformers accelerate
##### uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
##### # uv add -U bitsandbytes (-U : )

In [1]:
from dotenv import load_dotenv
load_dotenv()

from huggingface_hub import login
import os

login(token=os.getenv('HUGGINGFACE_API_KEY'))

In [2]:
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, Gemma3ForCausalLM

print(torch.__version__)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

2.8.0+cu126
cuda


### https://huggingface.co/google/gemma-3-1b-pt : 전체

### https://huggingface.co/google/gemma-3-1b-it : 말투나 방식


In [7]:
# PT

ckpt = "google/gemma-3-1b-pt"

tokenizer = AutoTokenizer.from_pretrained(ckpt)

model = Gemma3ForCausalLM.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

prompt = "Eiffel tower is located in"
model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

print(model_inputs)
print('-' * 100)

input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
    generation = generation[0][input_len:]

decoded = tokenizer.decode(generation, skip_special_tokens=True)
print(decoded)

{'input_ids': tensor([[     2, 236788,  80880,  18515,    563,   5628,    528]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
----------------------------------------------------------------------------------------------------
 the heart of Paris, France.The Eiffel Tower is a 324-meter-high tower in Paris, France.The Eiffel Tower is a symbol of Paris and France.The Eiffel Tower is a symbol of Paris and France


In [8]:
# IT

model_id = "google/gemma-3-1b-it"

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = Gemma3ForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."},]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"},]
        },
    ],
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device).to(torch.bfloat16)

print('input')
print(input)


with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=64)

outputs = tokenizer.batch_decode(outputs)
print('-' * 100)
print('output')
print(outputs)

Attempting to cast a BatchEncoding to type torch.bfloat16. This is not supported.


input
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x00000289E249B3D0>>
----------------------------------------------------------------------------------------------------
output
['<bos><start_of_turn>user\nYou are a helpful assistant.\n\nWrite a poem on Hugging Face, the company<end_of_turn>\n<start_of_turn>model\nOkay, here’s a poem about Hugging Face, aiming to capture its spirit and purpose:\n\n**The Algorithm’s Heart**\n\nIn a world of code, a digital space,\nWhere models bloom with elegant grace,\nHugging Face arises, bright and bold,\nA community, a story to']
