In [38]:
import torch
 
print(f"PyTorch version {torch.__version__}")
if torch.cuda.is_available():
    print("CUDA GPU")
elif torch.mps.is_available():
    print("Apple Silicon GPU")
else:
    print("Only CPU")  


PyTorch version 2.9.1
Apple Silicon GPU


In [39]:
# Download only the tokenizer for the base Qwen3-small model
  # - kind="base": Downloads the base model (not fine-tuned)
  # - tokenizer_only=True: Only downloads tokenizer files, not model weights
  # - out_dir="qwen3": Saves tokenizer files to ./qwen3/ directory
  
from reasoning_from_scratch.qwen3 import download_qwen3_small
download_qwen3_small(kind="base", tokenizer_only=True, out_dir="qwen3")

In [40]:
#Load the tokenizer settings from the tokenizer file into the Qwen3Tokenizer
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer
 
tokenizer_path = Path("qwen3") / "tokenizer-base.json"
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)

In [41]:
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"The input tokens are: {input_token_ids_list}")

The input tokens are: [840, 20772, 3460, 4128, 4119, 13]


In [42]:
text = tokenizer.decode(input_token_ids_list)
print(f"The decoded text is: {text}")

The decoded text is: Explain large language models.


In [43]:
for i in input_token_ids_list:
    print(f"{i} ---> {tokenizer.decode([i])}")

840 ---> Ex
20772 ---> plain
3460 --->  large
4128 --->  language
4119 --->  models
13 ---> .


In [44]:
def get_device(enable_tensor_cores=True):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")
        
        if enable_tensor_cores:
            major, minor = map(int, torch.__version__.split(".")[:2])
            if (major, minor) >= (2, 9):
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.conv.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True
 
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple Silicon GPU (MPS)")
 
    elif torch.xpu.is_available():
        device = torch.device("xpu")
        print("Using Intel GPU")
 
    else:
        device = torch.device("cpu")
        print("Using CPU")
 
    return device  
    


In [45]:
device = get_device()

Using Apple Silicon GPU (MPS)


In [46]:
#device = torch.device("cpu")

In [47]:
#Download the base Qwen3 0.6B weights
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

✓ qwen3/qwen3-0.6B-base.pth already up-to-date


In [48]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B
 
model_path = Path("qwen3") / "qwen3-0.6B-base.pth"
model = Qwen3Model(QWEN_CONFIG_06_B)  #A
model.load_state_dict(torch.load(model_path))  #B
model.to(device)  #C
#A Instantiate a Qwen3 model with random weights as placeholders
#B Load the pre-trained weights into the model
#C Transfer the model to the designated device (e.g., "cuda")

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [56]:
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"Number of input tokens: {len(input_token_ids_list)}")

input_tensor = torch.tensor(input_token_ids_list)  
input_tensor_fmt = input_tensor.unsqueeze(0) 
input_tensor_fmt = input_tensor_fmt.to(device)

output_tensor = model(input_tensor_fmt)  
output_tensor_fmt = output_tensor.squeeze(0) 
print(output_tensor_fmt[:])
print(f"Formatted Output tensor shape: {output_tensor_fmt.shape}")
print(tokenizer.decode([output_tensor_fmt[-1].argmax().item()]))

Number of input tokens: 6
tensor([[ 7.4062, 11.4375,  9.2500,  ...,  3.7188,  3.7188,  3.7188],
        [ 9.3750, 10.5625,  7.2500,  ...,  3.2344,  3.2344,  3.2344],
        [10.8750, 10.0000,  7.5938,  ...,  0.1992,  0.1992,  0.1992],
        [ 7.1250,  9.2500,  6.2812,  ..., -1.9844, -1.9844, -1.9844],
        [11.5000, 13.6250, 10.2500,  ...,  1.0000,  1.0000,  1.0000],
        [ 7.3438,  2.0312,  7.9375,  ..., -2.5156, -2.5156, -2.5156]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<SliceBackward0>)
Formatted Output tensor shape: torch.Size([6, 151936])
 Large


In [61]:
torch.inference_mode()                                        #A
def generate_text_basic(
    model,
    token_ids,
    max_new_tokens,
    eos_token_id=tokenizer.eos_token_id 
):
    input_length = token_ids.shape[1]
    model.eval()                                               #B
 
    for _ in range(max_new_tokens):
        out = model(token_ids)[:, -1]                          #C
        next_token = torch.argmax(out, dim=-1, keepdim=True)
 
        if (eos_token_id is not None                           #D
                and next_token.item() == eos_token_id):
            break
 
        token_ids = torch.cat(                                 #E
            [token_ids, next_token], dim=1)                    #E
    return token_ids[:, input_length:]                         #F

#A Disable gradient tracking for speed and memory efficiency
#B Switch model to evaluation mode to enable deterministic behavior (best practice)
#C Get the scores of the last token
#D Stop if all sequences in the batch have generated EOS
#E Append the newly predicted token to the sequence
#F Return only the generated tokens (excluding the original input)”



In [None]:
Prompt = "Explain large language models in a single sentence."
input_token_ids_tensor = torch.tensor(
    tokenizer.encode(prompt),
    device=device                            #A
    ).unsqueeze(0)
 
max_new_tokens = 200                       #B
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
)
output_text = tokenizer.decode(
    output_token_ids_tensor.squeeze(0).tolist()     #C
)
print(output_text) 
    
     #A Transfer the input token IDs onto the same device (CPU, GPU) where the model is located
     #B Let the model generate up to 100 new tokens
     #C Convert output token IDs from PyTorch tens”



KeyboardInterrupt: 

In [60]:
print(tokenizer.encode("<|endoftext|>"))

[151643]
