In [2]:
import torch
 
# Report the installed PyTorch version and which accelerator backend is usable.
print(f"PyTorch version {torch.__version__}")
if torch.cuda.is_available():
    print("CUDA GPU")
# Use the same MPS availability check as get_device() so both cells agree.
elif torch.backends.mps.is_available():
    print("Apple Silicon GPU")
else:
    print("Only CPU")


PyTorch version 2.9.1
Apple Silicon GPU


In [None]:
# Download only the tokenizer for the base Qwen3-small model
#   - kind="base": selects the base model (not fine-tuned)
#   - tokenizer_only=True: fetches the tokenizer files only, not the model weights
#   - out_dir="qwen3": saves the files under the ./qwen3/ directory

from reasoning_from_scratch.qwen3 import download_qwen3_small
download_qwen3_small(kind="base", tokenizer_only=True, out_dir="qwen3")

In [6]:
# Load the tokenizer settings from the downloaded tokenizer file into a Qwen3Tokenizer
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer
 
# Point at the tokenizer file inside the local "qwen3" directory and build the tokenizer from it.
tokenizer_path = Path("qwen3", "tokenizer-base.json")
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)

In [8]:
# Encode a sample prompt with the tokenizer and print the resulting token IDs.
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"The input tokens are: {input_token_ids_list}")

The input tokens are: [840, 20772, 3460, 4128, 4119, 13]


In [9]:
# Decode the token IDs back into text as a round-trip check against the prompt.
text = tokenizer.decode(input_token_ids_list)
print(f"The decoded text is: {text}")

The decoded text is: Explain large language models.


In [10]:
# Show each token ID next to the text fragment it decodes to on its own.
for token_id in input_token_ids_list:
    fragment = tokenizer.decode([token_id])
    print(f"{token_id} ---> {fragment}")

840 ---> Ex
20772 ---> plain
3460 --->  large
4128 --->  language
4119 --->  models
13 ---> .


In [13]:
def get_device(enable_tensor_cores=True):
    """Select a torch device and print which backend was chosen.

    Backends are probed in this order: NVIDIA CUDA, Apple Silicon (MPS),
    Intel XPU, and finally CPU as the fallback.

    Args:
        enable_tensor_cores: When True and CUDA is selected, switch float32
            matmul and cuDNN convolutions to TF32. Has no effect for any
            other backend.

    Returns:
        torch.device: The selected device.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")

        if enable_tensor_cores:
            # Releases >= 2.9 are configured through the `fp32_precision`
            # settings; earlier releases through the `allow_tf32` flags.
            major, minor = map(int, torch.__version__.split(".")[:2])
            if (major, minor) >= (2, 9):
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.conv.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True

    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple Silicon GPU (MPS)")

    # Guard the attribute lookup: a PyTorch build without a `torch.xpu`
    # module should fall through to the CPU branch instead of raising
    # AttributeError.
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        device = torch.device("xpu")
        print("Using Intel GPU")

    else:
        device = torch.device("cpu")
        print("Using CPU")

    return device
    


In [14]:
# Pick a device via get_device(), which also prints the selected backend.
device = get_device()

Using Apple Silicon GPU (MPS)


In [15]:
# Reassign `device` to the CPU, replacing the value returned by get_device() in the previous cell.
# NOTE(review): the notebook does not say why the CPU is forced here — confirm this override is intentional.
device = torch.device("cpu")

In [16]:
# Download the base Qwen3 0.6B weights into ./qwen3/
# (tokenizer_only=False, unlike the earlier tokenizer-only download).
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

qwen3-0.6B-base.pth: 100% (1433 MiB / 1433 MiB)


In [21]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B
 
# Checkpoint file inside the local "qwen3" directory.
model_path = Path("qwen3") / "qwen3-0.6B-base.pth"

# Instantiate a Qwen3 model with random weights as placeholders.
model = Qwen3Model(QWEN_CONFIG_06_B)

# Load the pre-trained weights into the model. map_location="cpu" keeps
# deserialization independent of whichever device the checkpoint was saved
# from; the model is moved to the target device in the next step.
model.load_state_dict(torch.load(model_path, map_location="cpu"))

# Transfer the model to the designated device (e.g., "cuda"). As the last
# expression of the cell, this also displays the module summary.
model.to(device)

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [34]:
# Tokenize the prompt and report how many tokens it produced.
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"Number of input tokens: {len(input_token_ids_list)}")

# Build the model input: add a leading dimension of size 1, then move the
# tensor to the same device as the model.
input_tensor = torch.tensor(input_token_ids_list)
input_tensor_fmt = input_tensor.unsqueeze(0)
input_tensor_fmt = input_tensor_fmt.to(device)

# This is inference only, so run the forward pass without autograd tracking;
# otherwise the output carries a grad_fn and keeps the graph alive.
with torch.no_grad():
    output_tensor = model(input_tensor_fmt)

# Drop the leading dimension again and inspect the result.
output_tensor_fmt = output_tensor.squeeze(0)
print(output_tensor_fmt)
print(f"Formatted Output tensor shape: {output_tensor_fmt.shape}")

# Decode the index of the largest value in the last row of the output.
print(tokenizer.decode([output_tensor_fmt[-1].argmax().item()]))

Number of input tokens: 6
tensor([[ 7.4062, 11.4375,  9.2500,  ...,  3.7188,  3.7188,  3.7188],
        [ 9.5000, 10.6250,  7.3125,  ...,  3.2031,  3.2031,  3.2031],
        [10.8750, 10.0625,  7.5000,  ...,  0.1299,  0.1299,  0.1299],
        [ 7.1875,  9.2500,  6.2188,  ..., -2.0469, -2.0469, -2.0469],
        [11.5625, 13.6250, 10.2500,  ...,  1.0469,  1.0469,  1.0469],
        [ 7.3750,  2.0312,  8.0000,  ..., -2.5469, -2.5469, -2.5469]],
       dtype=torch.bfloat16, grad_fn=<SliceBackward0>)
Formatted Output tensor shape: torch.Size([6, 151936])
 Large
