In [None]:
# Upgrade transformers to the newest release pip can find (no version pin).
# NOTE(review): the next cell installs transformers==5.0.0rc1; if both cells are
# run, that pin decides the final version — confirm this cell is still needed.
%pip install --upgrade transformers

In [None]:
# Pin transformers to the 5.0.0rc1 release candidate.
# NOTE(review): a later cell imports TokenizersBackend from transformers —
# presumably that requires this 5.x build; confirm.
%pip install transformers==5.0.0rc1

In [1]:
import os

# Opt in to expandable segments in PyTorch's CUDA allocator; this is the setting
# PyTorch's own out-of-memory message suggests to reduce fragmentation.
# It lives in its own early cell so the variable is already in the environment
# when torch is imported further down.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [2]:
import transformers

# Print the installed version explicitly. A bare `transformers.__version__`
# expression is only displayed by the notebook when it is the LAST line of the
# cell, and printing it before the import below keeps it visible even if that
# import fails.
print(f"Transformers Version: {transformers.__version__}")

# NOTE(review): presumably a smoke check that the pinned 5.x API is the one
# actually installed — confirm.
from transformers import TokenizersBackend

In [3]:
import os
import torch

# Report the PyTorch build, the CUDA toolkit version it was built against, and
# the cuDNN version it reports, one per line.
print(
    f"PyTorch Version: {torch.__version__}",
    f"PyTorch CUDA Version: {torch.version.cuda}",
    f"CuDNN Version:        {torch.backends.cudnn.version()}",
    sep="\n",
)

PyTorch Version: 2.9.1+cu128
PyTorch CUDA Version: 12.8
CuDNN Version:        91002


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from transformers import FineGrainedFP8Config
from transformers import Mistral3ForConditionalGeneration, Mistral3Config, PixtralVisionConfig, MistralConfig

# bitsandbytes 4-bit config: NF4 quantization type with double quantization,
# computing in bfloat16.
bnb4_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# bitsandbytes 8-bit config with every other option left at its default.
bnb8_config = BitsAndBytesConfig(load_in_8bit=True)

# Fine-grained FP8 config with library defaults (original note: "use for 8B model").
q8_config = FineGrainedFP8Config()


def print_model_specs(model):
    """Print the parameter count and the memory footprint of ``model``.

    Args:
        model: Any object exposing ``num_parameters(only_trainable=...)`` and
            ``get_memory_footprint()``; those are the only two calls made.

    Returns:
        None. Two summary lines are written to stdout.
    """
    params_per_billion = 1_000_000_000
    bytes_per_gib = 1024**3

    # Ask for all parameters, not only the trainable ones.
    param_count = model.num_parameters(only_trainable=False)
    # Footprint is reported by the model in bytes and converted for display.
    footprint_bytes = model.get_memory_footprint()

    summary = (
        f"Model Parameters: {param_count / params_per_billion:.2f} Billion",
        f"Memory Footprint: {footprint_bytes / bytes_per_gib:.2f} GB",
    )
    for line in summary:
        print(line)

In [5]:
import time
import torch
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

import torch
import gc

# Drop any `model` left over from an earlier run of this cell so its weights
# can be reclaimed; nothing happens if the name is not defined yet.
# (Adjust the name if the previous model was bound to a different variable.)
globals().pop("model", None)

# Collect unreachable Python objects first, then ask PyTorch to release its
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Per-device memory budgets handed to `from_pretrained(max_memory=...)` below:
# key 0 is GPU 0, "cpu" is system RAM.
max_memory_mapping = {
    0: "8GiB",
    "cpu": "48GiB",
}


# --- 1. Setup Device ---
# Use CUDA with float16 when a GPU is visible, otherwise the CPU with float32.
# NOTE(review): the previous comment here mentioned Strix Halo (8060S) / ROCm,
# but the recorded output of this cell reports an NVIDIA GeForce RTX 4070 Ti —
# confirm which hardware this notebook targets.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

if device == "cuda":
    print(f"✅ GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️  GPU Not Detected. CPU mode.")


# Checkpoint to load. Other checkpoints this notebook has been pointed at:
#   "mistralai/Ministral-3-3B-Instruct-2512"
#   "mistralai/Ministral-3-8B-Instruct-2512"
model_id = "mistralai/Ministral-3-14B-Instruct-2512-BF16"

print(f"\nLoading {model_id}...")

# trust_remote_code=True lets the hub repository supply its own Python code.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Load with the 4-bit bitsandbytes config, "auto" device placement, and the
# per-device budgets in max_memory_mapping. Alternatives tried previously and
# left disabled: torch_dtype=dtype, device_map=device.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError
# while loading weights, with 11.19 GiB allocated on an 11.60 GiB GPU — i.e.
# above the 8GiB budget for GPU 0. Confirm how max_memory interacts with
# quantized loading in this transformers version.
model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb4_config,
    device_map="auto",
    max_memory=max_memory_mapping,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

print("Model loaded successfully!")
print(f"GPU Memory Used: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

# --- 3. Run Inference ---
# Single-turn conversation used as the prompt.
messages = [
    {"role": "user", "content": "Tell me a short story."}
]

# Render the conversation to text with the tokenizer's chat template, ending
# with the generation prompt so the model answers as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Iterator-style streamer: yields decoded text chunks, without the prompt and
# without special tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Tokenize and move to device.
# The rendered chat template is expected to already contain the special tokens
# it needs, so the tokenizer must not add its own again (otherwise e.g. the BOS
# token can end up duplicated at the start of the prompt).
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)

# Keyword arguments for model.generate; sampling is enabled, so the output
# differs from run to run.
generation_kwargs = dict(
    inputs=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    streamer=streamer,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

print(f"\nPrompt: {messages[0]['content']}")
print("-" * 30)

def _generate_in_background():
    """Thread target: run model.generate and never leave the consumer blocked.

    If generation raises (for example on a CUDA out-of-memory error), the
    streamer would otherwise never receive its end-of-stream signal and the
    `for ... in streamer` loop below would wait forever.
    """
    try:
        model.generate(**generation_kwargs)
    except Exception:
        streamer.end()  # push the stop signal so the consumer loop terminates
        raise


t0 = time.time()
thread = Thread(target=_generate_in_background)
thread.start()

# --- 4. Stream Output ---
generated_text = ""
first_token_received = False
ttft = 0  # time to first token in seconds; stays 0 if nothing is generated

for new_text in streamer:
    if not first_token_received:
        # First chunk: record latency measured from just before the thread started.
        ttft = time.time() - t0
        first_token_received = True
    print(new_text, end="", flush=True)
    generated_text += new_text

t_end = time.time()

# Wait for the worker so no generation thread is left running after this cell.
thread.join()

# --- 5. Stats ---
# Count tokens by re-encoding the streamed text. add_special_tokens=False keeps
# tokens the tokenizer would add by default (such as BOS) out of the count.
# NOTE(review): re-tokenizing decoded text is an approximation of the number of
# tokens actually generated — confirm whether an exact count is needed.
total_new_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))

# Time spent after the first token arrived.
decoding_time = t_end - (t0 + ttft)

print("\n" + "-" * 30)
print(f"Time to First Token: {ttft:.4f} s")
# The first token is attributed to TTFT, so the speed covers the remaining
# tokens; skip the line when there are none (avoids a zero or negative rate).
if decoding_time > 0 and total_new_tokens > 1:
    print(f"Generation Speed:    {(total_new_tokens-1)/decoding_time:.2f} tokens/sec")
print(f"Total Tokens:        {total_new_tokens}")
print_model_specs(model)

✅ GPU Detected: NVIDIA GeForce RTX 4070 Ti

Loading mistralai/Ministral-3-14B-Instruct-2512-BF16...




Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/585 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 160.00 MiB. GPU 0 has a total capacity of 11.60 GiB of which 136.81 MiB is free. Including non-PyTorch memory, this process has 11.43 GiB memory in use. Of the allocated memory 11.19 GiB is allocated by PyTorch, and 23.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)