In [1]:
import os
import torch

print(f"PyTorch Version: {torch.__version__}")
print(f"ROCm Version:    {torch.version.hip}")

PyTorch Version: 2.11.0.dev20251222+rocm7.1
ROCm Version:    7.1.52802


In [2]:
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "11.0.0"

In [3]:
os.environ["HF_TOKEN"]=""

In [12]:
%pip install mistral_common --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting mistral_common
  Downloading mistral_common-1.8.8-py3-none-any.whl.metadata (5.3 kB)
Collecting pydantic<3.0,>=2.7 (from mistral_common)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting jsonschema>=4.21.1 (from mistral_common)
  Using cached jsonschema-4.25.1-py3-none-any.whl.metadata (7.6 kB)
Collecting tiktoken>=0.7.0 (from mistral_common)
  Using cached tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Collecting pydantic-extra-types>=2.10.5 (from pydantic-extra-types[pycountry]>=2.10.5->mistral_common)
  Using cached pydantic_extra_types-2.10.6-py3-none-any.whl.metadata (4.0 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3.0,>=2.7->mistral_common)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.41.5 (from pydantic<3.0,>=2.7->mistral_common)
  Using cached pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting typi

In [15]:
%pip install git+https://github.com/huggingface/transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-bmoggqrv
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-bmoggqrv
  Resolved https://github.com/huggingface/transformers to commit d6a6c82680cba9c51decdacac6dd6315ea4a766a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub<2.0,>=1.2.1 (from transformers==5.0.0.dev0)
  Using cached huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting typer-slim (from transformers==5.0.0.dev0)
  Using cached typer_slim-0.20.1-py3-none-any.whl.metadata (16 kB)
Collecting httpx<1,>=0.23.0 (from huggingface-hub<2.0,>=1.2.1->transformers==5.0.0.dev0)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting shellingham (from huggingface-hu

In [1]:
import time
import torch
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

# --- 1. Setup Device ---
# Strix Halo (8060S) works best with float16 on ROCm 6.2+
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16  
    print(f"✅ GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    dtype = torch.float32
    print("⚠️  GPU Not Detected. CPU mode.")

model_id = "gpt2"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "mistralai/Ministral-3-3B-Instruct-2512"
model_id = "Qwen/Qwen2.5-3B-Instruct"

print(f"\nLoading {model_id}...")

# TRUST_REMOTE_CODE=True is the key fix here
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=dtype,    # Standard HF uses torch_dtype, but some custom models prefer dtype
    trust_remote_code=True, # Allow the model to define its own config class
    device_map=device       # Auto-moves to GPU
)

# --- 3. Run Inference ---
messages = [
    {"role": "user", "content": "Tell me a short story."}
]

# Apply Mistral's chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Tokenize and move to device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

generation_kwargs = dict(
    inputs=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    streamer=streamer,
    max_new_tokens=300,    
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

print(f"\nPrompt: {messages[0]['content']}")
print("-" * 30)

t0 = time.time()
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# --- 4. Stream Output ---
generated_text = ""
first_token_received = False
ttft = 0

for new_text in streamer:
    if not first_token_received:
        ttft = time.time() - t0
        first_token_received = True
        print(new_text, end="", flush=True)
    else:
        print(new_text, end="", flush=True)
    generated_text += new_text

t_end = time.time()

# --- 5. Stats ---
total_new_tokens = len(tokenizer.encode(generated_text))
decoding_time = t_end - (t0 + ttft)

print("\n" + "-" * 30)
print(f"Time to First Token: {ttft:.4f} s")
if decoding_time > 0:
    print(f"Generation Speed:    {(total_new_tokens-1)/decoding_time:.2f} tokens/sec")
print(f"Total Tokens:        {total_new_tokens}")

  from .autonotebook import tqdm as notebook_tqdm
/opt/amdgpu/share/libdrm/amdgpu.ids: No such file or directory


✅ GPU Detected: AMD Radeon 8060S

Loading Qwen/Qwen2.5-3B-Instruct...


`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|█████████████████████████████████████████████████████████████| 434/434 [00:01<00:00, 319.40it/s, Materializing param=model.norm.weight]



Prompt: Tell me a short story.
------------------------------


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Once upon a time in a small village nestled among the rolling hills of a verdant land, there was a young girl named Lila who had a heart as bright and pure as the morning sun. She lived with her grandmother in a cozy cottage by the river, where they tended to an orchard filled with apple trees that whispered secrets to those who listened closely.

Lila loved to explore the woods behind their home, always seeking new paths and hidden treasures. One day, while wandering through the dense forest, she stumbled upon a small, forgotten garden. It was overgrown with wildflowers and vines, but as she approached, the plants began to bend and whisper stories in her ear. The garden was alive with magic, and it seemed to call out to her.

Intrigued, Lila decided to tend to the garden, clearing away the debris and nurturing the flowers back to life. As she worked, she discovered a tiny, delicate flower that no one else could see. This flower was unlike any other, its petals shimmering with a light 