# Learned Positional Encoding — Demo

**Used in:** BERT, GPT-2, and many standard transformer models  
**Formula:** `PE = Embedding(pos_ids)`  — a trainable lookup table  
**Properties:** Task-specific | Fully flexible | Requires data | Fixed max length  
**Best for:** Fixed-length tasks with large training datasets

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f'Device: {device}')

In [None]:
class LearnedPositionalEncoding(nn.Module):
    '''
    Learned PE: a trainable nn.Embedding table.

    One d_model vector per position, optimized end-to-end together with the
    rest of the model. The table is initialized with small normal noise
    (GPT-2 style: std=0.02).

    Args:
        d_model: size of each position vector; must match the last dimension
            of the tensors passed to forward().
        max_seq_len: number of rows in the position table, i.e. the longest
            sequence forward() can encode.
        dropout: dropout probability applied to the sum of input and
            positional vectors.
    '''
    def __init__(self, d_model, max_seq_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.pe = nn.Embedding(max_seq_len, d_model)
        nn.init.normal_(self.pe.weight, mean=0.0, std=0.02)  # GPT-style init

        print(f'Learned PE: {max_seq_len} positions x {d_model} dims = {max_seq_len * d_model} params')

    def forward(self, x):
        '''
        Add the learned position vectors to x and apply dropout.

        Args:
            x: tensor whose dimension 1 is the sequence axis; the code
               broadcasts a (1, seq_len, d_model) table slice onto it, so a
               (batch, seq_len, d_model) input is expected.

        Returns:
            Tensor of the same shape as x.

        Raises:
            IndexError: if the sequence length exceeds the number of
                positions in the table (max_seq_len).
        '''
        seq_len = x.size(1)
        max_positions = self.pe.num_embeddings
        # Fail early with an actionable message. Without this check the
        # embedding lookup raises an opaque "index out of range" error on
        # CPU, and on CUDA it surfaces as an asynchronous device-side assert.
        if seq_len > max_positions:
            raise IndexError(
                f'sequence length {seq_len} exceeds max_seq_len={max_positions} '
                'of the learned positional table'
            )
        positions = torch.arange(seq_len, device=x.device)
        pe = self.pe(positions).unsqueeze(0)  # (1, seq_len, d_model)
        x = x + pe
        return self.dropout(x)

print('LearnedPositionalEncoding defined.')

In [None]:
# Sanity check: run an all-zero batch through the layer and inspect the result.
d_model, seq_len, batch = 64, 50, 4
pe_layer = LearnedPositionalEncoding(d_model, max_seq_len=512, dropout=0.0)

# With a zero input and dropout disabled, the output is just the position table rows.
x = torch.zeros((batch, seq_len, d_model))
out = pe_layer(x)

print(f'Input shape : {x.shape}')
print(f'Output shape: {out.shape}')
print(f'Learnable params: {sum(p.numel() for p in pe_layer.parameters())}')  # 512 * d_model

# Compare position 0 against the first five position vectors
# (the first entry is position 0 against itself).
pe_vecs = pe_layer.pe.weight.detach()[:5]
similarity = torch.cosine_similarity(pe_vecs[:1], pe_vecs, dim=-1)
print(f'\nCosine similarity of pos 0 with pos 0-4: {similarity.tolist()}')
print('(Initialized randomly — similarities should be low, ~0)')

In [None]:
# Heatmap of the freshly initialized (untrained) position table, next to the
# pairwise cosine similarity between positions.
d_model, seq_len = 64, 60
pe_layer = LearnedPositionalEncoding(d_model, max_seq_len=512, dropout=0.0)

# Only the first seq_len rows of the table are visualized.
pe_matrix = pe_layer.pe.weight.detach()[:seq_len].numpy()  # (seq_len, d_model)

# Cosine similarity matrix between positions: normalize each row, then take
# all pairwise dot products. The epsilon guards against division by zero.
row_norms = np.linalg.norm(pe_matrix, axis=1, keepdims=True)
pe_norm = pe_matrix / (row_norms + 1e-8)
sim_matrix = pe_norm @ pe_norm.T

fig, axes = plt.subplots(1, 2, figsize=(16, 4))
heat_ax, sim_ax = axes

# Left panel: raw embedding values (dimension on the y-axis, position on the x-axis).
im = heat_ax.imshow(pe_matrix.T, aspect='auto', cmap='RdYlBu', origin='lower')
heat_ax.set_xlabel('Position')
heat_ax.set_ylabel('Dimension')
heat_ax.set_title('Learned PE (BEFORE training — random init)')
fig.colorbar(im, ax=heat_ax)

# Right panel: position-vs-position similarity on a fixed [-1, 1] color scale.
im2 = sim_ax.imshow(sim_matrix, cmap='coolwarm', vmin=-1, vmax=1)
sim_ax.set_xlabel('Position')
sim_ax.set_ylabel('Position')
sim_ax.set_title('Position Similarity Matrix (before training)')
fig.colorbar(im2, ax=sim_ax)

plt.tight_layout()
plt.savefig('demo_learned_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: demo_learned_heatmap.png')
print('Note: After training, learned PE will develop meaningful position structure.')

In [None]:
# Recap of the learned-PE configuration demonstrated in this notebook.
d_model, max_seq_len = 64, 512

_summary_lines = (
    '=== Learned PE Summary ===',
    f'Learnable parameters: {max_seq_len} x {d_model} = {max_seq_len * d_model}',
    'Initialized: Normal(0, 0.02) — GPT-2 style',
    'Task-adaptive: YES (learned end-to-end)',
    'Generalizes to unseen lengths: NO (fixed max_seq_len)',
    'Used in: BERT, GPT-2',
    '',  # blank separator line
    'Ready to use in experiments.',
    'Import: from PE.learned_pe import LearnedPositionalEncoding',
)
for _line in _summary_lines:
    print(_line)