## Transformers 


In [1]:
# self attention 
import tensorflow as tf
import numpy as np

# Toy inputs shaped (batch=1, sequence=3 tokens, embedding=4).
# No learned projections are applied in this demo, so the very same tensor
# plays the role of queries, keys and values.
Q = tf.constant([[
    [1.0, 0.0, 1.0, 0.0],
    [0.0, 2.0, 0.0, 2.0],
    [1.0, 1.0, 1.0, 1.0],
]])
K = V = Q

# 1) Similarity of every query with every key: Q @ K^T.
scores = tf.linalg.matmul(Q, K, transpose_b=True)

# 2) Divide by sqrt(d_k), where d_k is the size of the last axis of K.
d_k = tf.cast(tf.shape(K)[-1], tf.float32)
scaled_scores = scores / tf.sqrt(d_k)

# 3) Normalise over the key axis so each query's weights sum to 1.
weights = tf.nn.softmax(scaled_scores, axis=-1)

# 4) Blend the value vectors using those weights.
output = tf.linalg.matmul(weights, V)

print("Attention Weights:\n", weights.numpy())
print("Output:\n", output.numpy())




Attention Weights:
 [[[0.42231882 0.15536243 0.42231882]
  [0.01587624 0.86681336 0.11731043]
  [0.15536243 0.42231882 0.42231882]]]
Output:
 [[[0.84463763 0.7330437  0.84463763 0.7330437 ]
  [0.13318667 1.8509371  0.13318667 1.8509371 ]
  [0.57768124 1.2669564  0.57768124 1.2669564 ]]]


In [4]:
# multi head attention
from keras.layers import MultiHeadAttention

# key_dim is the per-head size of the query/key projections.
mha = MultiHeadAttention(num_heads=4, key_dim=64)
query = tf.random.normal((1, 5, 64))  # batch=1, sequence=5, embedding=64
key = query
value = query

# MultiHeadAttention's positional order is (query, value, key) -- not
# (query, key, value). Passing value/key by keyword keeps them from being
# silently swapped once the three tensors stop being identical.
output, weights = mha(
    query,
    value=value,
    key=key,
    return_attention_scores=True,
)
print("Output shape:", output.shape)
print("Attention Weights shape:", weights.shape)


Output shape: (1, 5, 64)
Attention Weights shape: (1, 4, 5, 5)


## Positional encoding


In [5]:
import numpy as np
import tensorflow as tf

def positional_encoding(position: int, d_model: int) -> tf.Tensor:
    """
    Build the fixed sine/cosine position table used by Transformers.

    For row ``p`` and column ``j`` the underlying angle is
    ``p / 10000 ** (2 * (j // 2) / d_model)``. Even columns store the sine of
    that angle and odd columns store its cosine.

    Args:
        position (int): Number of positions (rows) to generate.
        d_model (int): Embedding dimension size (columns).

    Returns:
        tf.Tensor: float32 tensor of shape (1, position, d_model).
    """
    # Index grids that broadcast against each other.
    row_idx = np.expand_dims(np.arange(position), axis=1)  # (position, 1)
    col_idx = np.expand_dims(np.arange(d_model), axis=0)   # (1, d_model)

    # Columns 2i and 2i+1 share the same exponent 2i / d_model.
    exponent = (2 * (col_idx // 2)) / np.float32(d_model)
    angles = row_idx / np.power(10000, exponent)           # (position, d_model)

    # Fill a separate table so the raw angles are left untouched.
    table = np.empty_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(angles[:, 1::2])  # odd columns

    # Leading batch axis -> (1, position, d_model).
    return tf.constant(np.expand_dims(table, axis=0), dtype=tf.float32)


# Example: Generate positional encoding
pos_encoding = positional_encoding(50, 512)

print(f"Positional Encoding shape: {pos_encoding.shape}")
print(pos_encoding[0, :2, :8])  # Show first 2 positions & first 8 dimensions



Positional Encoding shape: (1, 50, 512)
tf.Tensor(
[[0.         1.         0.         1.         0.         1.
  0.         1.        ]
 [0.84147096 0.5403023  0.8218562  0.569695   0.8019618  0.59737533
  0.7818871  0.62342006]], shape=(2, 8), dtype=float32)


### Custom transformer block

In [10]:
import tensorflow as tf
from keras.layers import LayerNormalization, Dense, Dropout, Embedding
from keras import Model
import numpy as np

# Positional Encoding Function
def positional_encoding(position, d_model):
    """Return the sinusoidal position table as a float32 tensor.

    NOTE: this redefines (and therefore shadows) the ``positional_encoding``
    declared in an earlier cell of this notebook.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Embedding dimension size (columns).

    Returns:
        float32 tensor of shape (1, position, d_model); even columns hold
        sines and odd columns hold cosines of the position angles.
    """
    pos = np.arange(position)[:, None]   # (position, 1)
    dim = np.arange(d_model)[None, :]    # (1, d_model)
    angles = pos / np.power(10000, (2 * (dim // 2)) / np.float32(d_model))

    encoded = np.empty_like(angles)
    encoded[:, ::2] = np.sin(angles[:, ::2])    # even columns
    encoded[:, 1::2] = np.cos(angles[:, 1::2])  # odd columns

    # Prepend the batch axis before converting to a tensor.
    return tf.cast(encoded[None, ...], dtype=tf.float32)

# Transformer Block
class TransformerBlock(tf.keras.layers.Layer):
    """One Transformer encoder block: self-attention + feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation (normalisation is applied after the residual add).

    Args:
        embed_dim: Output width of the feed-forward network. Because of the
            residual additions, the last axis of the inputs is expected to
            have this size.
        num_heads: Number of attention heads; each head uses
            ``key_dim = embed_dim // num_heads``.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the output of both sub-layers.
        **kwargs: Standard ``Layer`` keyword arguments (e.g. ``name``,
            ``dtype``), forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads
        )
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value for self-attention.
            training: Whether dropout should be active.

        Returns:
            Tensor produced by the second layer normalisation.
        """
        # Self-attention: the sequence attends to itself (query = value = inputs).
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # Residual + Norm

        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # Residual + Norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

# Model with Embedding + Positional Encoding + Transformer Block
class SimpleTransformer(Model):
    """Token embedding + sinusoidal positional encoding + one TransformerBlock.

    Args:
        vocab_size: Number of distinct token ids accepted by the embedding.
        max_len: Number of positions in the precomputed positional table.
        embed_dim: Embedding size, shared by the embedding, the positional
            table and the transformer block.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the block's feed-forward network.
    """

    def __init__(self, vocab_size, max_len, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.embedding = Embedding(vocab_size, embed_dim)
        # Precomputed table of shape (1, max_len, embed_dim); sliced on each call.
        self.pos_encoding = positional_encoding(max_len, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)

    def call(self, x, training=None):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape (batch, sequence). The
                sequence should not be longer than ``max_len``: the slice
                below can supply at most ``max_len`` rows of the table.
            training: Forwarded to the transformer block so its dropout
                follows the caller's mode instead of relying on implicit
                propagation; ``None`` lets Keras decide.

        Returns:
            Tensor of shape (batch, sequence, embed_dim).
        """
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)  # Word embeddings
        x = x + self.pos_encoding[:, :seq_len, :]  # Add positional encoding
        return self.transformer_block(x, training=training)

# Hyper-parameters for the demo model
vocab_size, max_len = 1000, 20
embed_dim, num_heads, ff_dim = 32, 2, 64

# Instantiate the model
model = SimpleTransformer(
    vocab_size=vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
)

# A single sequence of five token ids -> shape (batch=1, sequence=5)
dummy_input = tf.constant([[1, 5, 23, 45, 67]])
output = model(dummy_input)

print("Input shape:", dummy_input.shape)
print("Output shape:", output.shape)
print("Output tensor:\n", output)


Input shape: (1, 5)
Output shape: (1, 5, 32)
Output tensor:
 tf.Tensor(
[[[-1.2139783   0.7901611  -0.07223275  1.7702596  -1.1192064
    0.33365262 -1.3610643   1.1406313  -0.49382725  1.993513
   -1.0768187   0.9004701  -1.595818    1.1921027   0.24211189
    0.47187516 -0.02194184  0.49221906 -0.4757451   0.33982074
    0.1217721   0.8884668  -1.0852524   0.54455274 -1.8132877
    0.77975506 -1.3674005   0.9389294  -0.21363592  0.4453123
   -1.4323765  -0.04302042]
  [-0.01251765 -0.4043251   0.9231734   1.5236789  -0.93805635
    0.6337972  -1.073817    1.1407843  -0.6726321   1.8222383
   -1.0959375   1.0190835  -1.893938    1.1799638   0.29763266
    0.39238638 -0.68082273  0.80426776 -0.8170506   0.31224462
   -0.22182408  1.0936482  -1.2231823   0.8265066  -1.6600093
    0.6380167  -1.4227729   1.0076244  -0.31770766  0.44771788
   -1.3445395  -0.28363237]
  [-0.01247142 -2.3561685   1.6658335   0.96436787 -0.51659286
    0.47481412 -0.9704659   0.41194165 -0.26510707  1.596316