### Settings 1

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, \
    BitsAndBytesConfig, GenerationConfig
from accelerate.test_utils.testing import get_backend

# Automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
device, _, _ = get_backend()
# Define the base model directory and model ID
MODEL_DIR = '/home/zerothweek/llm/models'
model_id = 'Llama-3.2-1B'
# Construct the full path to the model
model_path = os.path.join(MODEL_DIR, model_id)

### Settings 2

In [3]:
# Loading and Setting model
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_comput_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto",
    torch_dtype=torch.float16,
    #quantization_config = quantization_config,
    trust_remote_code=False
)

In [4]:
# Loading and Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    padding_side="left")
tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default

In [5]:
# Setting the Generationg Config
generation_config = GenerationConfig.from_pretrained(model_path)

In [6]:
generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "max_length": 10,
  "temperature": 0.6,
  "top_p": 0.9
}

### Inference One Input

In [5]:
# Inference One Input
def inference_single(input):
    model_inputs = tokenizer(input, return_tensors="pt").to(device) # tokenize model input
    generated_ids = model.generate(**model_inputs, generation_config = generation_config) # inference model
    model_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] # decode model output
    return model_outputs

In [6]:
inference_single(["A list of colors: red, blue"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'A list of colors: red, blue,'

### Inference Batch Input

In [None]:
def inference_batch(input):
    model_inputs = tokenizer(input, return_tensors="pt", padding=True).to(device)
    generated_ids = model.generate(**model_inputs, generation_config=generation_config)
    model_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
    return model_outputs
inference_batch(['A list of colors:', 'Portugal is'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'input_ids': tensor([[128000,     32,   1160,    315,   8146,     25],
        [128001, 128001, 128000,   7229,  45284,    374]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1]])}


['<|begin_of_text|>A list of colors: A list of colors',
 '<|end_of_text|><|end_of_text|><|begin_of_text|>Portugal is a country in Western']