In [1]:
import torch
from usta_model import UstaModel
from usta_tokenizer import UstaTokenizer

u_tokenizer = UstaTokenizer("tokenizer.json")

prompt = "the capital of united"

tokens = u_tokenizer.encode(prompt)
tokens

tensor([ 0, 61,  1, 61,  2, 61,  3])

In [2]:
torch.manual_seed(1)
u_model = UstaModel(vocab_size=len(u_tokenizer.vocab), embedding_dim=4, num_heads=2, context_length=32)

sentence_meanings_with_atention_context = u_model(tokens)
sentence_meanings_with_atention_context.shape

torch.Size([7, 4])

In [3]:
out = u_model(tokens)
out

tensor([[-0.1370, -0.3226,  0.1265,  0.1362],
        [-0.4744, -0.2934, -0.0552,  0.0922],
        [-0.5055, -0.2895, -0.0209,  0.1401],
        [-0.4333, -0.2986, -0.0499,  0.0810],
        [-0.2547, -0.2411,  0.1826,  0.2178],
        [-0.1604, -0.3156,  0.1119,  0.1300],
        [-0.4393, -0.2969, -0.0482,  0.0849]], grad_fn=<AddmmBackward0>)

In [4]:
u_model

UstaModel(
  (embedding): Embedding(64, 4)
  (pos_embedding): Embedding(32, 4)
  (self_attention): UstaMultiHeadAttention(
    (multi_head_attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
    )
    (projection): Linear(in_features=4, out_features=4, bias=True)
  )
  (norm): UstaLayerNorm()
  (mlp): UstaMLP(
    (gate_proj): Linear(in_features=4, out_features=4, bias=True)
    (up_proj): Linear(in_features=4, out_features=4, bias=True)
    (down_proj): Linear(in_features=4, out_features=4, bias=True)
    (gelu): GELU()
  )
)

In [5]:
from usta_layer_norm import UstaLayerNorm

norm_layer = UstaLayerNorm(4)
norm_layer(out)

tensor([[-0.4567, -1.4225,  0.9144,  0.9648],
        [-1.3412, -0.5089,  0.5860,  1.2641],
        [-1.3592, -0.4867,  0.5978,  1.2481],
        [-1.2778, -0.6107,  0.6203,  1.2682],
        [-1.0284, -0.9678,  0.9197,  1.0766],
        [-0.5423, -1.3689,  0.9074,  1.0038],
        [-1.2888, -0.5948,  0.6175,  1.2661]], grad_fn=<MulBackward0>)

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

q_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
q_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

  from .autonotebook import tqdm as notebook_tqdm


![transformer-architecture](https://deeprevision.github.io/posts/001-transformer/transformer.png)

In [7]:
q_model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

![gelu](https://miro.medium.com/v2/resize:fit:4800/format:webp/1*O5E-huBuY1UTHMmM--rhLQ.png)

In [8]:
import torch.nn as nn

class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return 0.5 * x * (
      1 + torch.tanh(
          torch.sqrt(torch.tensor(2 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
        )
    )


gelu = GELU()

example_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
gelu(example_tensor)


tensor([[0.8412, 1.9546, 2.9964],
        [3.9999, 5.0000, 6.0000]])

In [9]:
# import torch funtions 

import torch.nn.functional as F

F.gelu(example_tensor, approximate="tanh")

tensor([[0.8412, 1.9546, 2.9964],
        [3.9999, 5.0000, 6.0000]])

In [10]:
import torch.functional as F

class UstaMLP(nn.Module):
  def __init__(self, embedding_dim, hidden_dim):
    super().__init__()

    self.gate_proj = nn.Linear(embedding_dim, hidden_dim)
    self.up_proj = nn.Linear(embedding_dim, hidden_dim)
    self.down_proj = nn.Linear(hidden_dim, embedding_dim)
    self.gelu = GELU()

  def forward(self, x):
    """ gate = self.gate_proj(x)
        gate = F.gelu(gate, approximate="tanh")
        up = self.up_proj(x)
        fuse = gate * up
        outputs = self.down_proj(fuse) """
    gate = self.gate_proj(x)
    gate = self.gelu(gate)
    up = self.up_proj(x)
    fuse = gate * up
    outputs = self.down_proj(fuse)
    return outputs


In [11]:
import torch.nn as nn
from usta_multi_head_attention import UstaMultiHeadAttention
from usta_layer_norm import UstaLayerNorm
from usta_mlp import UstaMLP

class UstaDecoderBlock(nn.Module):
  def __init__(self, embedding_dim, num_heads, context_length):
    super().__init__()

    self.self_attention = UstaMultiHeadAttention(embedding_dim, embedding_dim, context_length, num_heads, dropout_rate=0.5)
    self.norm1 = UstaLayerNorm(embedding_dim)
    self.mlp = UstaMLP(embedding_dim, embedding_dim)
    self.norm2 = UstaLayerNorm(embedding_dim)

  def forward(self, x):
    res = self.norm1(x)

    x = self.self_attention(x)
    x = self.norm1(x)

    x = x + res

    res = self.norm2(x)
    x = self.mlp(x)
    x = self.norm2(x)

    x = x + res

    return x

example_tensor = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32)
decoder_block = UstaDecoderBlock(embedding_dim=4, num_heads=2, context_length=3)
decoder_block(example_tensor)


tensor([[-2.6040, -0.9759,  1.6115,  1.9684],
        [-2.6040, -0.9759,  1.6115,  1.9684]], grad_fn=<AddBackward0>)