In [None]:
%pip install --upgrade transformers

In [9]:
%pip install transformers==5.0.0rc1

Collecting transformers==5.0.0rc1
  Downloading transformers-5.0.0rc1-py3-none-any.whl.metadata (37 kB)
Collecting huggingface-hub<2.0,>=1.2.1 (from transformers==5.0.0rc1)
  Using cached huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting typer-slim (from transformers==5.0.0rc1)
  Downloading typer_slim-0.21.0-py3-none-any.whl.metadata (16 kB)
Collecting shellingham (from huggingface-hub<2.0,>=1.2.1->transformers==5.0.0rc1)
  Using cached shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting click>=8.0.0 (from typer-slim->transformers==5.0.0rc1)
  Using cached click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Downloading transformers-5.0.0rc1-py3-none-any.whl (9.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m80.2 MB/s[0m  [33m0:00:00[0m
[?25hUsing cached huggingface_hub-1.2.3-py3-none-any.whl (520 kB)
Using cached shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)
Downloading typer_slim-0.21.0-py3-none-an

In [1]:
import transformers
transformers.__version__
from transformers import TokenizersBackend

In [2]:
import os
import torch

print(f"PyTorch Version: {torch.__version__}")
print(f"PyTorch CUDA Version: {torch.version.cuda}")
print(f"CuDNN Version:        {torch.backends.cudnn.version()}")

PyTorch Version: 2.9.1+cu128
PyTorch CUDA Version: 12.8
CuDNN Version:        91002


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

#  4-Bit Quantization Config (Fits in ~6GB VRAM)
bnb4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

bnb8_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [6]:
from transformers import Mistral3ForConditionalGeneration, Mistral3Config, PixtralVisionConfig, MistralConfig

In [5]:
import time
import torch
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

# --- 1. Setup Device ---
# Strix Halo (8060S) works best with float16 on ROCm 6.2+
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16  
    print(f"✅ GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    dtype = torch.float32
    print("⚠️  GPU Not Detected. CPU mode.")

model_id = "gpt2"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "mistralai/Ministral-3-3B-Instruct-2512"
model_id = "Qwen/Qwen2.5-3B-Instruct"
model_id = "polyverse/Meta-Llama-3.1-8B-Q5_K_M-GGUF"
model_id = "mistralai/Ministral-3-3B-Instruct-2512"
model_id = "mistralai/Ministral-3-8B-Instruct-2512"

print(f"\nLoading {model_id}...")

# TRUST_REMOTE_CODE=True is the key fix here
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=dtype,    # Standard HF uses torch_dtype, but some custom models prefer dtype
    trust_remote_code=True, # Allow the model to define its own config class
    device_map=device       # Auto-moves to GPU
)

# --- 3. Run Inference ---
messages = [
    {"role": "user", "content": "Tell me a short story."}
]

# Apply Mistral's chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Tokenize and move to device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

generation_kwargs = dict(
    inputs=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    streamer=streamer,
    max_new_tokens=300,    
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

print(f"\nPrompt: {messages[0]['content']}")
print("-" * 30)

t0 = time.time()
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# --- 4. Stream Output ---
generated_text = ""
first_token_received = False
ttft = 0

for new_text in streamer:
    if not first_token_received:
        ttft = time.time() - t0
        first_token_received = True
        print(new_text, end="", flush=True)
    else:
        print(new_text, end="", flush=True)
    generated_text += new_text

t_end = time.time()

# --- 5. Stats ---
total_new_tokens = len(tokenizer.encode(generated_text))
decoding_time = t_end - (t0 + ttft)

print("\n" + "-" * 30)
print(f"Time to First Token: {ttft:.4f} s")
if decoding_time > 0:
    print(f"Generation Speed:    {(total_new_tokens-1)/decoding_time:.2f} tokens/sec")
print(f"Total Tokens:        {total_new_tokens}")

✅ GPU Detected: NVIDIA GeForce RTX 4070 Ti

Loading mistralai/Ministral-3-8B-Instruct-2512...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.mistral3.configuration_mistral3.Mistral3Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of AfmoeConfig, ApertusConfig, ArceeConfig, AriaTextConfig, BambaConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BitNetConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, BltConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, CpmAntConfig, CTRLConfig, CwmConfig, Data2VecTextConfig, DbrxConfig, DeepseekV2Config, DeepseekV3Config, DiffLlamaConfig, DogeConfig, Dots1Config, ElectraConfig, Emu3Config, ErnieConfig, Ernie4_5Config, Ernie4_5_MoeConfig, Exaone4Config, FalconConfig, FalconH1Config, FalconMambaConfig, FlexOlmoConfig, FuyuConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, Gemma3nConfig, Gemma3nTextConfig, GitConfig, GlmConfig, Glm4Config, Glm4MoeConfig, GotOcr2Config, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GptOssConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, GraniteMoeHybridConfig, GraniteMoeSharedConfig, HeliumConfig, HunYuanDenseV1Config, HunYuanMoEV1Config, JambaConfig, JetMoeConfig, Lfm2Config, Lfm2MoeConfig, LlamaConfig, Llama4Config, Llama4TextConfig, LongcatFlashConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegatronBertConfig, MiniMaxConfig, MinistralConfig, Ministral3Config, MistralConfig, MixtralConfig, MllamaConfig, ModernBertDecoderConfig, MoshiConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NanoChatConfig, NemotronConfig, OlmoConfig, Olmo2Config, Olmo3Config, OlmoeConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, Phi4MultimodalConfig, PhimoeConfig, PLBartConfig, ProphetNetConfig, Qwen2Config, Qwen2MoeConfig, Qwen3Config, Qwen3MoeConfig, Qwen3NextConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, SeedOssConfig, SmolLM3Config, StableLmConfig, Starcoder2Config, TrOCRConfig, VaultGemmaConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, xLSTMConfig, XmodConfig, ZambaConfig, Zamba2Config.