# speechless.generate

In [1]:
import os, json, re

In [16]:
MODEL_ROOT_DIR="/opt/local/llm_models"
# MODEL_PATH=os.path.join(MODEL_ROOT_DIR, "huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
# MODEL_PATH=os.path.join(MODEL_ROOT_DIR, "huggingface.co/nvidia/Llama-3.1-Minitron-4B-Width-Base")
MODEL_PATH=os.path.join(MODEL_ROOT_DIR, "huggingface.co/Qwen/Qwen2-7B")

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

bnb_4bit_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
) 

bnb_config = None # bnb_4bit_config

model_kwargs = {
    "quantization_config": bnb_config,
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
}
model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_kwargs) 

generate_kwargs = {
    "max_new_tokens": 512,
    "temperature": 0.8,
    "do_sample": True,
    # "min_p": 0.1,
}

# Function to generate text
def generate_text(prompt: str, generate_kwargs: dict) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
    generated_text = tokenizer.decode(outputs[0][len(input_ids):], skip_special_tokens=True)
    return generated_text

prompt = "Explain the concept of machine learning in simple terms:"
response = generate_text(prompt, generate_kwargs=generate_kwargs)
print(response)


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 15.46it/s]
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


inputs={'input_ids': tensor([[  840, 20772,   279,  7286,   315,  5662,  6832,   304,  4285,  3793,
            25]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
outputs=tensor([[   840,  20772,    279,   7286,    315,   5662,   6832,    304,   4285,
           3793,     25,  12960,   6832,    374,    264,   1616,    369,  18495,
            311,   3960,    323,   7269,    504,   3139,   2041,   1660,  20975,
          55068,     13,   1084,    594,   1075,  12629,    264,   1682,    311,
          15282,   6171,    553,   9027,   1105,   1657,  10295,     13,    576,
           6366,  46210,    504,    821,     11,  24588,  12624,    323,   3259,
          11181,    476,  19898,   3118,    389,    429,    821,     13,   1084,
            594,    264,   1376,   5440,   4815,   1657,   8357,    582,    990,
           3351,     11,   1075,  27682,   5942,     11,  25328,  13406,     11,
            323,    656,  59711,   9331,     13, 151643]])
plain the concept of m