# Updated Annotated Transformer

This notebook is the fully updated version of the Harvard Annotated Transformer. All code cells have been modernized to work with recent PyTorch releases.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


In [3]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Scores are ``query @ key^T / sqrt(d_k)``, where ``d_k`` is the size of
    the last dimension of ``query``. A softmax over the last dimension turns
    the scores into attention weights, which are then applied to ``value``.

    Args:
        query: Tensor whose last dimension has size ``d_k``.
        key: Tensor whose last two dimensions are swapped before it is
            multiplied with ``query``.
        value: Tensor multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score tensor. Positions
            where ``mask == 0`` are excluded from attention.
        dropout: Optional callable (for example ``nn.Dropout``) applied to
            the attention weights before they are multiplied with ``value``.

    Returns:
        A tuple ``(output, p_attn)``: the weighted values and the attention
        weights (after dropout, when ``dropout`` is given).
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Fill with the most negative finite value instead of -inf. A row
        # whose positions are all masked would otherwise be all -inf, and
        # softmax of such a row is NaN, which then poisons the output and
        # the gradients. With a finite fill, masked positions still receive
        # exactly zero weight whenever at least one position is unmasked,
        # and a fully masked row degrades to finite uniform weights.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


In [5]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on top of the ``attention`` function.

    Inputs are linearly projected, split into ``h`` heads of width ``d_k``,
    attended per head, merged back together and passed through a final
    linear layer.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Set up the projections for ``h`` heads over a ``d_model``-wide model.

        Args:
            h: Number of attention heads; must divide ``d_model`` evenly.
            d_model: Width of the model (input and output feature size).
            dropout: Dropout probability applied to the attention weights.
        """
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        # Four equally shaped projections: the first three are applied to
        # query, key and value; the last one is the output projection.
        self.linears = nn.ModuleList(
            [nn.Linear(d_model, d_model) for _ in range(4)]
        )
        # Attention weights returned by the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projection, tensor, batch_size):
        """Project ``tensor`` and reshape it to (batch, h, -1, d_k)."""
        projected = projection(tensor)
        per_head = projected.view(batch_size, -1, self.h, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention and return the re-projected result.

        Args:
            query: Input for the query projection; its first dimension is
                taken as the batch size.
            key: Input for the key projection.
            value: Input for the value projection.
            mask: Optional mask; a dimension is inserted at index 1 so the
                same mask is broadcast across all heads.

        Returns:
            Tensor whose last dimension has size ``h * d_k``, produced by
            the final linear layer.
        """
        batch_size = query.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1)

        # 1) Project the inputs and split them into heads.
        query_proj, key_proj, value_proj = self.linears[:3]
        query = self._split_heads(query_proj, query, batch_size)
        key = self._split_heads(key_proj, key, batch_size)
        value = self._split_heads(value_proj, value, batch_size)

        # 2) Attend over every head in one batched call.
        context, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) Merge the heads back together and apply the output projection.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        output_proj = self.linears[-1]
        return output_proj(merged)


## Survey Responses

**Guest Speaker Ratings (avg):**
- Nadia: 8/10
- Jeff: 9/10
- Sam: 7/10
- Fei: 10/10

**Thoughts on the presentations:** The speakers each brought unique expertise: Nadia’s insights into practical NLP applications were strong; Jeff’s deep dives into architecture design were thorough; Sam’s coverage of emerging research felt slightly rushed; and Fei’s synthesis of theory and practice was exemplary.

**More guest speakers?** Yes—additional voices from industry practitioners (e.g., AI ethicists, deployment engineers) would add valuable perspective.

**Final project vs. traditional exam:**
- **Pros:** Encourages hands‑on learning, reflects real-world workflows, fosters creativity.
- **Cons:** Harder to standardize grading, project scope can vary, may disadvantage students less familiar with tooling.

**Recommendation:** Continue the final‑project format, with clear rubrics to ensure consistency.