# Test Code for Appendix 5: Embedding Architectures

This notebook tests the code listings from `ap5_embeddings.md` to ensure they are functional and consistent with the text.

## 1. Fundamentals of Neural Embeddings

Testing the basic embedding lookup example from Section 1.2.

In [1]:
import torch
import torch.nn as nn

# Lookup table with one learnable 128-d row for each of 10,000 product IDs.
product_embedding = nn.Embedding(10000, 128)

# Batch of four product IDs; ID 42 appears twice, so rows 0 and 3 of the
# result are the same table row.
product_ids = torch.tensor([42, 156, 7890, 42])  # shape: (4,)

# Calling the module indexes the table once per ID.
embeddings = product_embedding(product_ids)  # shape: (4, 128)

# Show only the first five dimensions of the vector for product 42.
print(f"Embedding for Product 42: {embeddings[0][:5]}...")
# Illustrative output (actual values vary, the table is randomly initialised):
#   tensor([-0.1234,  0.5678, -0.9012,  0.3456, -0.7890], grad_fn=...)

Embedding for Product 42: tensor([-1.0160, -1.3530, -0.4644,  0.8269, -0.0637], grad_fn=<SliceBackward0>)...


## 2. Sequential Embeddings for Text

Testing Code Listing A5.1: Query Encoder with Positional Embeddings from Section 2.5.

In [2]:
import torch
import torch.nn as nn

class QueryEncoder(nn.Module):
    """Encode a padded batch of token IDs into one fixed-size vector per query.

    Learned token and position embeddings are summed, run through a stack of
    Transformer encoder layers, and mean-pooled over the non-padding
    positions. Token ID 0 is treated as the padding token.
    """

    def __init__(self, vocab_size=30000, embed_dim=128, max_seq_len=64, num_heads=4, num_layers=4):
        super().__init__()
        # Learned lookup tables: one row per vocabulary entry and one per position.
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq_len, embed_dim)

        # Self-attention stack operating on (batch, seq, dim) tensors; the
        # feed-forward width is four times the model width.
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, token_ids):
        """
        Args:
            token_ids: (batch_size, seq_len) - tokenized queries, 0 = padding
        Returns:
            (batch_size, embed_dim) - mean of the contextualized vectors at the
            non-padding positions of each query
        """
        _, seq_len = token_ids.shape

        # True where the position holds the padding token (ID 0).
        is_pad = token_ids == 0  # (batch_size, seq_len)

        # Token vectors plus position vectors; the position term has a leading
        # singleton axis so it broadcasts over the batch.
        position_ids = torch.arange(seq_len, device=token_ids.device)
        hidden = self.token_embedding(token_ids) + self.pos_embedding(position_ids).unsqueeze(0)

        # Contextualize; padded keys are excluded from attention.
        hidden = self.transformer(hidden, src_key_padding_mask=is_pad)

        # Mean-pool over real tokens only: zero the padded rows, then divide by
        # the per-query token count (clamped so the divisor is never zero).
        keep = (~is_pad).unsqueeze(-1).float()  # (batch_size, seq_len, 1)
        token_count = keep.sum(dim=1).clamp(min=1)
        return (hidden * keep).sum(dim=1) / token_count

# Example usage: encode one five-token query.
encoder = QueryEncoder(vocab_size=30000, embed_dim=128)
# Token IDs standing in for "best family hotel in Orlando"; batch of one.
query_tokens = torch.tensor([1523, 4201, 5678, 89, 9432]).unsqueeze(0)
query_embed = encoder(query_tokens)
# Expected shape: torch.Size([1, 128])
print(f"Query embedding shape: {query_embed.shape}")

Query embedding shape: torch.Size([1, 128])


## 3. Sequential Embeddings for Multi-Dimensional Behavioral Actions

Testing Code Listing A5.2: Behavioral Sequence Encoder with Temporal Decay from Section 3.6.

In [3]:
import torch
import torch.nn as nn

class BehavioralSequenceEncoder(nn.Module):
    """
    Encodes user behavioral sequences with temporal decay.
    Input: sequence of (action_id, item_id, context_id, timestamp) tuples
    Output: fixed-size behavioral embedding vector

    Each step is the concatenation of its action, item, context and learned
    position embeddings. A small MLP scores every step against the candidate
    ad (DIN-style attention), the scores are soft-maxed over the sequence,
    scaled by an exponential time-decay factor, and used to pool the steps
    into a single vector that is projected to ``hidden_dim``.

    ID 0 is the padding index of all three embedding tables. A step whose
    action, item and context IDs are all 0 is treated as padding and receives
    zero attention weight, so trailing padding does not change the output.
    """
    def __init__(self,
                 action_vocab_size=50,
                 item_vocab_size=1_000_000,
                 context_vocab_size=500,
                 action_emb_dim=32,
                 item_emb_dim=64,
                 context_emb_dim=32,
                 pos_emb_dim=32,
                 hidden_dim=256,
                 decay_rate=0.02,
                 max_seq_len=50):
        """
        Args:
            action_vocab_size: number of rows in the action embedding table
            item_vocab_size: number of rows in the item embedding table
            context_vocab_size: number of rows in the context embedding table
            action_emb_dim: width of each action embedding
            item_emb_dim: width of each item embedding
            context_emb_dim: width of each context embedding
            pos_emb_dim: width of each position embedding
            hidden_dim: width of the candidate ad embedding and of the output
            decay_rate: rate r in the per-step weight exp(-r * time_delta)
            max_seq_len: longest sequence the position table can index
        """
        super().__init__()

        # Embedding tables (row 0 of the first three is the padding row)
        self.action_emb = nn.Embedding(action_vocab_size, action_emb_dim, padding_idx=0)
        self.item_emb = nn.Embedding(item_vocab_size, item_emb_dim, padding_idx=0)
        self.context_emb = nn.Embedding(context_vocab_size, context_emb_dim, padding_idx=0)
        self.max_seq_len = max_seq_len
        self.pos_emb = nn.Embedding(max_seq_len, pos_emb_dim)

        # Temporal decay
        self.decay_rate = decay_rate

        # Concatenated per-step dimension D (32 + 64 + 32 + 32 = 160 with defaults)
        concat_dim = action_emb_dim + item_emb_dim + context_emb_dim + pos_emb_dim

        # DIN-style attention network
        self.attention_net = nn.Sequential(
            nn.Linear(concat_dim * 3, 128),  # [action, ad, action*ad]
            nn.ReLU(),
            nn.Linear(128, 1)
        )

        # Ad embedding projection to match action vector dimension
        self.ad_proj = nn.Linear(hidden_dim, concat_dim)

        # Final projection
        self.output_proj = nn.Linear(concat_dim, hidden_dim)

    def forward(self, action_ids, item_ids, context_ids, time_deltas, candidate_ad_emb):
        """
        Args:
            action_ids: [batch_size, seq_len] - action type tokens
            item_ids: [batch_size, seq_len] - item ID tokens
            context_ids: [batch_size, seq_len] - context tokens
            time_deltas: [batch_size, seq_len] - seconds since current request
            candidate_ad_emb: [batch_size, hidden_dim] - current ad embedding
        Returns:
            behavioral_emb: [batch_size, hidden_dim] - user intent embedding
        Raises:
            IndexError: if seq_len exceeds the ``max_seq_len`` the position
                table was built for.
        """
        batch_size, seq_len = action_ids.shape
        if seq_len > self.max_seq_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_seq_len={self.max_seq_len}"
            )

        # A step is padding only when all three IDs are the padding index, i.e.
        # when it carries nothing but its position embedding.
        pad_mask = (action_ids == 0) & (item_ids == 0) & (context_ids == 0)  # [B, L]

        # Lookup embeddings
        action_emb = self.action_emb(action_ids)  # [B, L, action_emb_dim]
        item_emb = self.item_emb(item_ids)        # [B, L, item_emb_dim]
        context_emb = self.context_emb(context_ids)  # [B, L, context_emb_dim]
        pos_emb = self.pos_emb(torch.arange(seq_len, device=action_ids.device))  # [L, pos_emb_dim]
        pos_emb = pos_emb.unsqueeze(0).expand(batch_size, -1, -1)  # [B, L, pos_emb_dim]

        # Concatenate all components
        action_vec = torch.cat([action_emb, item_emb, context_emb, pos_emb], dim=-1)  # [B, L, D]

        # Compute temporal decay weights
        temporal_weights = torch.exp(-self.decay_rate * time_deltas)  # [B, L]
        temporal_weights = temporal_weights.unsqueeze(-1)  # [B, L, 1]

        # DIN attention: compare each action to candidate ad
        ad_emb_expanded = candidate_ad_emb.unsqueeze(1).expand(-1, seq_len, -1)  # [B, L, hidden_dim]

        # Project ad embedding to match action_vec dimension
        ad_proj = self.ad_proj(ad_emb_expanded)  # [B, L, D]

        # Attention input: [action, ad, action*ad]
        # Why this design? Element-wise product captures feature interactions (e.g.,
        # "user clicked laptops" × "current ad is laptop" → high similarity signal)
        attn_input = torch.cat([
            action_vec,
            ad_proj,
            action_vec * ad_proj
        ], dim=-1)  # [B, L, 3 * D]

        # Compute attention scores
        attn_scores = self.attention_net(attn_input).squeeze(-1)  # [B, L]

        # Push padded steps to the most negative finite score so they get no
        # softmax mass. A finite fill (not -inf) keeps an all-padding row from
        # turning into NaN.
        attn_scores = attn_scores.masked_fill(pad_mask, torch.finfo(attn_scores.dtype).min)

        # Softmax over sequence to get base attention weights
        attn_weights = torch.softmax(attn_scores, dim=1).unsqueeze(-1)  # [B, L, 1]

        # Apply temporal decay to attention weights
        # Recent actions keep full weight, older actions get dampened.
        # The weights are not renormalized afterwards, so they may sum to < 1.
        attn_weights = attn_weights * temporal_weights

        # Force padded steps to exactly zero weight. This also covers rows made
        # entirely of padding, where the softmax above is uniform.
        attn_weights = attn_weights.masked_fill(pad_mask.unsqueeze(-1), 0.0)

        # Weighted sum
        behavioral_vec = (action_vec * attn_weights).sum(dim=1)  # [B, D]

        # Final projection
        behavioral_emb = self.output_proj(behavioral_vec)  # [B, hidden_dim]

        return behavioral_emb

# Smoke test for BehavioralSequenceEncoder: random inputs, check output shape.
batch_size = 4
seq_len = 10
hidden_dim = 256

# Dummy inputs. IDs are drawn from 1 upward, so no step uses padding index 0.
action_ids = torch.randint(low=1, high=50, size=(batch_size, seq_len))
item_ids = torch.randint(low=1, high=1000, size=(batch_size, seq_len))
context_ids = torch.randint(low=1, high=500, size=(batch_size, seq_len))
# Uniform in [0, 100) seconds
time_deltas = torch.rand(batch_size, seq_len) * 100
candidate_ad_emb = torch.randn(batch_size, hidden_dim)

# Build the model with vocabularies just large enough for the IDs above.
model = BehavioralSequenceEncoder(
    action_vocab_size=50,
    item_vocab_size=1000,
    context_vocab_size=500,
    hidden_dim=hidden_dim,
)

# Forward pass; the result should have shape [4, 256].
output = model(action_ids, item_ids, context_ids, time_deltas, candidate_ad_emb)
print(f"Behavioral embedding shape: {output.shape}")

Behavioral embedding shape: torch.Size([4, 256])


## 4. Rigorous Testing

We will now perform deeper validation to ensure the specific logic (Padding, Temporal Decay, DIN Attention) works as intended.

In [4]:
print("\n--- Test 1: Padding Masking Verification ---")
# Encode the same two tokens with and without a trailing PAD token:
#   [A, B]       -> positions 0, 1
#   [A, B, PAD]  -> positions 0, 1, 2
# If masking works, A and B attend only to positions 0 and 1 in both cases,
# so their contextualized states match. Pooling then averages A and B over
# two valid tokens either way, so the two query vectors should be identical.

vocab_size = 30000
encoder = QueryEncoder(vocab_size=vocab_size, embed_dim=128)
encoder.eval()  # Set to eval mode

# Token IDs (0 is the padding ID assumed by QueryEncoder)
token_A, token_B, pad_token = 100, 200, 0

# [A, B, PAD] (length 3) versus the unpadded [A, B] (length 2)
input_padded = torch.tensor([[token_A, token_B, pad_token]])
input_clean = torch.tensor([[token_A, token_B]])

with torch.no_grad():
    out_padded, out_clean = encoder(input_padded), encoder(input_clean)

# Largest element-wise gap between the two pooled vectors
diff = torch.max(torch.abs(out_padded - out_clean)).item()
print(f"Difference between [A, B] and [A, B, PAD]: {diff:.6f}")
print(
    "SUCCESS: Padding is correctly masked out."
    if diff < 1e-5
    else "FAILURE: Padding affects the result!"
)


--- Test 1: Padding Masking Verification ---
Difference between [A, B] and [A, B, PAD]: 0.000000
SUCCESS: Padding is correctly masked out.


  output = torch._nested_tensor_from_mask(


In [5]:
print("\n--- Test 2: Temporal Decay & DIN Attention Verification ---")
# Initialize model.
# The vocabularies are sized explicitly: the class default
# item_vocab_size=1_000_000 with 64-d rows would allocate a 64M-parameter
# table just to look up item IDs 10 and 20 below.
hidden_dim = 256
model = BehavioralSequenceEncoder(
    action_vocab_size=50,
    item_vocab_size=1000,
    context_vocab_size=500,
    hidden_dim=hidden_dim,
)
model.eval()

batch_size = 1
seq_len = 2

# Fixed inputs
action_ids = torch.tensor([[1, 2]])  # Action 1, Action 2
item_ids = torch.tensor([[10, 20]])
context_ids = torch.tensor([[5, 5]])

# Case A: No time decay (both recent)
time_recent = torch.tensor([[0.0, 0.0]])

# Case B: Action 1 is old, Action 2 is recent
# (100s at the default decay_rate=0.02 scales action 1's weight by exp(-2))
time_decay = torch.tensor([[100.0, 0.0]])

# Candidate Ad (random)
candidate_ad = torch.randn(batch_size, hidden_dim)

with torch.no_grad():
    out_recent = model(action_ids, item_ids, context_ids, time_recent, candidate_ad)
    out_decay = model(action_ids, item_ids, context_ids, time_decay, candidate_ad)

# Check if time affects output
time_diff = (out_recent - out_decay).abs().max().item()
print(f"Effect of temporal decay (0s vs 100s): {time_diff:.6f}")
print(
    "SUCCESS: Temporal decay changes the output embedding."
    if time_diff > 1e-4
    else "FAILURE: Temporal decay has no effect."
)

print("\n--- Test 3: DIN Attention Verification ---")
# Same behavior sequence, two different candidate ads: the attention scores
# depend on the ad, so the pooled embedding should change.
candidate_ad_2 = torch.randn(batch_size, hidden_dim)

with torch.no_grad():
    out_ad1 = model(action_ids, item_ids, context_ids, time_recent, candidate_ad)
    out_ad2 = model(action_ids, item_ids, context_ids, time_recent, candidate_ad_2)

ad_diff = (out_ad1 - out_ad2).abs().max().item()
print(f"Effect of changing candidate ad: {ad_diff:.6f}")
print(
    "SUCCESS: DIN Attention adapts to candidate ad."
    if ad_diff > 1e-4
    else "FAILURE: Candidate ad has no effect (Attention might be broken)."
)


--- Test 2: Temporal Decay & DIN Attention Verification ---
Effect of temporal decay (0s vs 100s): 0.708817
SUCCESS: Temporal decay changes the output embedding.

--- Test 3: DIN Attention Verification ---
Effect of changing candidate ad: 0.050368
SUCCESS: DIN Attention adapts to candidate ad.
