### Settings 1

In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, \
    BitsAndBytesConfig, GenerationConfig
from accelerate.test_utils.testing import get_backend

# Automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
device, _, _ = get_backend()
# Define the base model directory and model ID
MODEL_DIR = '/home/zerothweek/llm/models'
model_id = 'Llama-3.2-1B-Instruct'
#model_id = 'SmolLM2-135M-Instruct'
# Construct the full path to the model
model_path = os.path.join(MODEL_DIR, model_id)

### Settings 2

In [2]:
# Loading and Setting model
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_comput_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto",
    torch_dtype=torch.float16,
    #quantization_config = quantization_config,
    trust_remote_code=False
)

In [3]:
# Loading and Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    padding_side="left")
tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default

In [4]:
# Setting the Generationg Config
generation_config = GenerationConfig.from_pretrained(model_path)

In [5]:
generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

### Inference One Input

In [10]:
# Inference One Input
def inference_single(messages):
    model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True,return_tensors="pt").to(device) # tokenize model input

    #generated_ids = model.generate(model_inputs, generation_config=generation_config, max_new_tokens=1) # inference model
    generated_ids = model.generate(model_inputs, do_sample=False)
    model_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0] # decode model output
    return model_outputs

In [11]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
inference_single(messages)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Apr 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nArrr, me hearty! I be Captain Blackbeak, the most feared and infamous pirate to'

In [None]:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Apr 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nArrr, me hearty! I be Captain Blackbeak, the most feared and infamous pirate to'

In [None]:
/home/zerothweek/miniconda3/envs/llm_study/lib/python3.13/site-packages/transformers/generation/configuration_utils.py:629: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/home/zerothweek/miniconda3/envs/llm_study/lib/python3.13/site-packages/transformers/generation/configuration_utils.py:634: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(


Arrr, me hearty! I be Captain Blackbeak Betty, the most feared and infamous pirate to ever sail the seven seas. Me and me trusty parrot, Polly, be sailin' the Caribbean, plunderin' and pillagin' wherever we go. Me ship, the "Maverick's Revenge", be a sturdy galleon, with three masts and a hull as black as coal. Me crew be a motley bunch o' scurvy dogs, but they be loyal to me to the death. We be sailin' the seas, searchin' for treasure and avoidin' the authorities. So hoist the colors, me hearties, and set course for adventure!

### Inference Batch Input

In [40]:
def inference_batch(chats):
    texts = [tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    ) for chat in chats]
    print(texts)
    model_inputs = tokenizer(texts, padding=True, return_tensors="pt", add_special_tokens=False).to(device)
    print(model_inputs)
    generated_ids = model.generate(**model_inputs, generation_config=generation_config) # inference model
    model_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=False) # decode model output
    return model_outputs

In [39]:
chats = [
    [{"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"}],

    [{"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "What year is it?"}]
]
inference_batch(chats)
    

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 28 Mar 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 28 Mar 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat year is it?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n']
{'input_ids': tensor([[128009, 128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,
          33025,   2696,     25,   6790,    220,   2366,     18,    198,  15724,
           2696,     25,    220,   1591,   2947,    220,   2366,     20,    271,
           2675,    527,    264,  55066,   6369,   6465,    889,   2744,  31680,
            304,  5

["<|eot_id|><|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 28 Mar 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nArrr, me hearty! Yer lookin' fer me, eh? Me name be Captain Black",
 "<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 28 Mar 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat year is it?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nYer lookin' fer the year, eh? Alright then, matey! It be "]