In [54]:
import copy
import math
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import types
import importlib

In [19]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [20]:
def clones(module, N):
    """Return an ``nn.ModuleList`` of ``N`` independent deep copies of ``module``.

    Each copy owns its own parameters; nothing is shared with ``module``.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [27]:
def subsequent_mask(size):
    """Build the causal mask that hides later positions.

    Returns a boolean tensor of shape ``(1, size, size)`` on the notebook's
    ``device``. Entry ``[0, i, j]`` is True when ``j <= i`` (position ``i`` may
    look at ``j``) and False for every later position.
    """
    # Keeping the lower triangle (diagonal included) is the complement of the
    # strictly-upper triangle.
    allowed = torch.tril(torch.ones((1, size, size), device=device))
    return allowed.bool()

In [22]:
class LayerNorm(nn.Module):
    """Normalize over the last dimension, then apply a learned gain and bias.

    ``eps`` is added to the standard deviation (not the variance) before the
    division. The standard deviation comes from ``Tensor.std`` with its default
    settings.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # a_2: multiplicative gain, starts at 1.  b_2: additive bias, starts at 0.
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [23]:
class SublayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer_fn(norm(x)))``.

    The normalization is applied to the input of the sublayer, and the
    un-normalized ``x`` is what gets added back.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer_fn):
        """Run ``sublayer_fn`` on the normalized input and add the residual."""
        normed = self.norm(x)
        update = self.dropout(sublayer_fn(normed))
        return x + update

In [32]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are ``query @ key^T / sqrt(d_k)`` where ``d_k`` is the size of the
    last dimension of ``query``. Where ``mask`` is False the score is replaced
    by ``-1e9`` before the softmax over the last axis. If ``dropout`` is given
    it is applied to the attention weights.

    Returns:
        A pair ``(weights @ value, weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        blocked = ~mask
        scores = scores.masked_fill(blocked, -1e9)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return weights.matmul(value), weights

In [80]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four cloned ``d_model -> d_model`` linears.

    The first three linears project query, key and value; the fourth projects
    the re-joined head outputs. ``d_model`` must be divisible by ``h``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h  # width of one head
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Attend with ``h`` heads; the attention weights are kept on ``self.attn``."""
        if mask is not None:
            # Add an axis at position 1 so the same mask broadcasts over the heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(projected):
            # (nbatches, len, h * d_k) -> (nbatches, h, len, d_k)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_proj, k_proj, v_proj, out_proj = self.linears
        qs = split_heads(q_proj(query))
        ks = split_heads(k_proj(key))
        vs = split_heads(v_proj(value))
        x, self.attn = attention(qs, ks, vs, mask=mask, dropout=self.dropout)
        # Undo the head split before the output projection.
        merged = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return out_proj(merged)

In [82]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: ``w_2(dropout(relu(w_1(x))))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.w_2 = nn.Linear(d_ff, d_model)   # project back to the model width
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the expand / ReLU / dropout / project pipeline to ``x``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [83]:
class Embeddings(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        """Look up the rows for the indices in ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        return scale * self.lut(x)


In [84]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to the input, then apply dropout.

    A ``(1, max_len, d_model)`` table is precomputed on the notebook's
    ``device`` and stored as the buffer ``pe``: sine values fill the even
    columns and cosine values the odd columns.
    NOTE(review): the cosine assignment only lines up when ``d_model`` is even.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        positions = torch.arange(0, max_len, device=device).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, device=device)
        div_term = torch.exp(even_dims * -(math.log(10000.0) / d_model))
        angles = positions * div_term
        table = torch.zeros(max_len, d_model, device=device)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x``."""
        seq_len = x.size(1)
        return self.dropout(x + self.pe[:, :seq_len])

In [85]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then feed-forward.

    Each step is wrapped in its own ``SublayerConnection``. ``size`` is kept as
    an attribute because ``Encoder`` reads ``layer.size``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply masked self-attention followed by the feed-forward network."""
        attn_block, ff_block = self.sublayer

        def self_attend(normed):
            # Query, key and value are all the same normalized input.
            return self.self_attn(normed, normed, normed, mask)

        x = attn_block(x, self_attend)
        return ff_block(x, self.feed_forward)

In [86]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, attention over ``memory``, feed-forward.

    Each of the three steps is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three sublayers in order and return the result."""
        self_block, src_block, ff_block = self.sublayer

        def attend_to_self(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_memory(normed):
            # Keys and values come from the encoder output.
            return self.src_attn(normed, memory, memory, src_mask)

        x = self_block(x, attend_to_self)
        x = src_block(x, attend_to_memory)
        return ff_block(x, self.feed_forward)

In [87]:
class Encoder(nn.Module):
    """A stack of ``N`` cloned layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [88]:
class Decoder(nn.Module):
    """A stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer with ``memory`` and both masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [89]:
class Generator(nn.Module):
    """Linear projection to ``vocab`` logits followed by ``log_softmax``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last axis."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

In [90]:
class EncoderDecoder(nn.Module):
    """Container that wires encoder, decoder, embeddings and generator together.

    No cell of this notebook defined this class, so ``make_model`` failed with
    ``NameError`` on a fresh kernel. The interface follows the notebook's call
    sites: ``model(src, tgt, src_mask, tgt_mask)``, ``model.encode(src,
    src_mask)``, ``model.decode(memory, src_mask, tgt, tgt_mask)`` and
    ``model.generator``. ``forward`` does NOT apply the generator; the loss
    computation and ``greedy_decode`` call ``model.generator`` themselves.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the resulting memory."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run the encoder stack."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder stack against ``memory``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble an encoder-decoder Transformer and move it to ``device``.

    Args:
        src_vocab: Size of the source vocabulary.
        tgt_vocab: Size of the target vocabulary.
        N: Number of layers in each of the encoder and decoder stacks.
        d_model: Model width.
        d_ff: Hidden width of the position-wise feed-forward blocks.
        h: Number of attention heads.
        dropout: Dropout probability used throughout.

    Returns:
        The model, with every parameter of more than one dimension initialized
        by ``xavier_uniform_``.
    """
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every use gets its own deep copy so no sub-module is shared between layers.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab)
    )
    # Only weight matrices (dim > 1) are re-initialized; 1-D parameters keep
    # their defaults.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model.to(device)

In [91]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss (summed) against a label-smoothed target distribution.

    The true class gets ``1 - smoothing``; every other class gets
    ``smoothing / (size - 2)``. The ``padding_idx`` column is always zero, and
    rows whose target is ``padding_idx`` are zeroed entirely.
    """

    def __init__(self, size, padding_idx, smoothing=0.1):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size

    def forward(self, x, target):
        """Return the summed KL divergence between ``x`` and the smoothed targets.

        ``x`` is passed to ``nn.KLDivLoss`` as its input; ``target`` holds one
        class index per row of ``x``.
        """
        off_value = self.smoothing / (self.size - 2)
        true_dist = torch.full_like(x.data, off_value)
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        pad_rows = target.data == self.padding_idx
        if pad_rows.any():
            # Padded positions contribute nothing to the loss.
            true_dist[pad_rows] = 0
        return self.criterion(x, true_dist)

In [92]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every step.

    The rate is ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``,
    where ``step`` counts calls to :meth:`step` starting from 1.

    ``zero_grad`` is forwarded to the wrapped optimizer so this wrapper can be
    used wherever ``step()`` / ``zero_grad()`` are called as a pair (as
    ``SimpleLossCompute`` does); without it that call raised ``AttributeError``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.factor = factor
        self.warmup = warmup
        self.model_size = model_size

    def step(self):
        """Advance the step counter, update every param group's ``lr``, and step.

        Returns whatever the wrapped optimizer's ``step()`` returns.
        """
        self._step += 1
        rate = self.factor * (self.model_size ** -0.5 * min(self._step ** -0.5, self._step * self.warmup ** -1.5))
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        return self.optimizer.step()

    def zero_grad(self, *args, **kwargs):
        """Forward to the wrapped optimizer's ``zero_grad``."""
        return self.optimizer.zero_grad(*args, **kwargs)

In [93]:
class SimpleLossCompute:
    """Apply the generator, compute the normalized loss, and optionally train.

    ``optimizer`` may be a plain optimizer or a wrapper such as ``NoamOpt``
    that keeps the real optimizer on its ``.optimizer`` attribute. Previously
    ``zero_grad()`` was called directly on the wrapper, which raised
    ``AttributeError`` when the wrapper did not define it.
    """

    def __init__(self, generator, criterion, optimizer=None):
        self.generator = generator
        self.criterion = criterion
        self.optimizer = optimizer

    def _zero_grad(self):
        """Clear gradients on the optimizer, or on the one it wraps."""
        target = self.optimizer
        if not hasattr(target, "zero_grad"):
            # Schedule wrappers expose the wrapped optimizer as ``.optimizer``.
            target = target.optimizer
        target.zero_grad()

    def __call__(self, x, y, norm):
        """Return the un-normalized loss as a Python float.

        The loss is divided by ``norm`` before ``backward()`` so the gradient is
        per-token; the returned value is multiplied back by ``norm``. When an
        optimizer was supplied, one optimization step is taken and gradients
        are cleared afterwards.
        """
        x = self.generator(x)
        flat_pred = x.contiguous().view(-1, x.size(-1))
        flat_gold = y.contiguous().view(-1)
        loss = self.criterion(flat_pred, flat_gold) / norm
        if self.optimizer is not None:
            loss.backward()
            self.optimizer.step()
            self._zero_grad()
        return loss.item() * norm

In [94]:
class Batch:
    """Hold one batch with its masks, all placed on ``device``.

    Attributes set here:
        src, src_mask: source tokens and their non-pad mask.
        tgt, tgt_y: decoder input ``tgt[:, :-1]`` and labels ``tgt[:, 1:]``
            (only when ``tgt`` is given).
        tgt_mask: non-pad mask combined with the causal ``subsequent_mask``.
        ntokens: number of non-pad label tokens, as a Python int.
    """

    def __init__(self, src, tgt=None, pad=0):
        self.src = src.to(device)
        # Build the mask from the moved tensor. Using the incoming ``src`` left
        # the mask on the caller's device whenever that differed from ``device``.
        self.src_mask = (self.src != pad).unsqueeze(-2)
        if tgt is not None:
            self.tgt = tgt[:, :-1].to(device)
            self.tgt_y = tgt[:, 1:].to(device)
            self.tgt_mask = (self.tgt != pad).unsqueeze(-2) & subsequent_mask(self.tgt.size(-1))
            self.ntokens = (self.tgt_y != pad).sum().item()


In [95]:
def data_gen(V, batch_size, nbatches):
    """Yield ``nbatches`` random copy-task batches.

    Each batch is a ``(batch_size, 10)`` tensor of integers drawn from
    ``[1, V)`` whose first column is set to 1; the same tensor is used as both
    source and target.
    """
    shape = (batch_size, 10)
    for _ in range(nbatches):
        tokens = torch.randint(low=1, high=V, size=shape, device=device)
        tokens[:, 0].fill_(1)  # every sequence starts with token 1
        yield Batch(tokens, tokens)

In [96]:
def run_epoch(data_iter, model, loss_compute):
    """Run one pass over ``data_iter`` and return the average loss per token.

    Every 50 batches a progress line is printed. Tokens/sec is computed from
    all tokens processed since the previous progress line; it used to divide
    only the current batch's tokens by the time of the whole 50-batch window,
    which under-reported throughput.

    Args:
        data_iter: Iterable of batches exposing ``src``, ``tgt``, ``src_mask``,
            ``tgt_mask``, ``tgt_y`` and ``ntokens``.
        model: Callable as ``model(src, tgt, src_mask, tgt_mask)``.
        loss_compute: Callable as ``loss_compute(out, tgt_y, ntokens)``
            returning the batch loss.
    """
    start = time.time()
    total_loss, total_tokens = 0, 0
    window_tokens = 0  # tokens seen since the last progress line
    for i, batch in enumerate(data_iter):
        out = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)
        loss = loss_compute(out, batch.tgt_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        window_tokens += batch.ntokens
        if i % 50 == 0:
            elapsed = time.time() - start
            # Guard against a zero interval on coarse timers.
            tokens_per_sec = window_tokens / elapsed if elapsed > 0 else float("inf")
            print(f"Step {i} | Loss: {loss/batch.ntokens:.4f} | Tokens/sec: {tokens_per_sec:.2f}")
            start = time.time()
            window_tokens = 0
    return total_loss / total_tokens

In [97]:
def greedy_decode(model, src, max_len, start_symbol):
    """Greedily decode up to ``max_len`` tokens for each row of ``src``.

    The source mask is derived here by treating token 0 as padding. Decoding
    starts from ``start_symbol`` and appends the highest-probability token at
    each step; there is no early stop.
    """
    src_mask = (src != 0).unsqueeze(-2)
    memory = model.encode(src, src_mask)
    ys = start_symbol * torch.ones(src.size(0), 1, dtype=torch.long, device=device)
    # ys starts one column wide and gains one column per iteration.
    while ys.size(1) < max_len:
        causal = subsequent_mask(ys.size(1))
        out = model.decode(memory, src_mask, ys, causal)
        last_step = out[:, -1]
        prob = F.softmax(model.generator(last_step), dim=-1)
        next_word = prob.argmax(dim=1, keepdim=True)
        ys = torch.cat([ys, next_word], dim=1)
    return ys

In [45]:
# NOTE(review): this re-defines LabelSmoothing from an earlier cell; the later
# definition is the one in effect once both cells have run.
class LabelSmoothing(nn.Module):
    """Summed KL-divergence loss against label-smoothed targets.

    The gold class receives ``1 - smoothing`` and all other classes share
    ``smoothing / (size - 2)`` each. The ``padding_idx`` column is forced to
    zero, and any row whose target equals ``padding_idx`` is zeroed out.
    """

    def __init__(self, size, padding_idx, smoothing=0.1):
        super(LabelSmoothing, self).__init__()
        self.size = size
        self.padding_idx = padding_idx
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.criterion = nn.KLDivLoss(reduction='sum')

    def forward(self, x, target):
        """Build the smoothed distribution for ``target`` and score ``x`` against it."""
        gold = target.data
        true_dist = x.data.new_full(x.size(), self.smoothing / (self.size - 2))
        true_dist.scatter_(1, gold.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero whole rows belonging to padded targets (no-op when there are none).
        is_pad = (gold == self.padding_idx).unsqueeze(1)
        true_dist.masked_fill_(is_pad, 0)
        return self.criterion(x, true_dist)


In [99]:
# NOTE(review): this re-defines NoamOpt from an earlier cell; the later
# definition is the one in effect once both cells have run.
class NoamOpt:
    """Optimizer wrapper that applies a warmup-then-decay learning rate.

    Before each step the rate is set to
    ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``,
    with ``step`` counting calls to :meth:`step` from 1.

    Two fixes over the previous version of this cell:
    * ``zero_grad`` is forwarded to the wrapped optimizer. ``SimpleLossCompute``
      calls ``optimizer.zero_grad()`` on this wrapper, which raised
      ``AttributeError``.
    * ``step`` returns the wrapped optimizer's result, matching the earlier
      definition of this class in the notebook.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.factor = factor
        self.warmup = warmup
        self.model_size = model_size

    def step(self):
        """Bump the step counter, write the new ``lr`` to all groups, and step."""
        self._step += 1
        warmup_term = self._step * self.warmup ** -1.5
        decay_term = self._step ** -0.5
        rate = self.factor * (self.model_size ** -0.5 * min(decay_term, warmup_term))
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        return self.optimizer.step()

    def zero_grad(self, *args, **kwargs):
        """Forward to the wrapped optimizer's ``zero_grad``."""
        return self.optimizer.zero_grad(*args, **kwargs)

In [100]:
# NOTE(review): this re-defines Batch from an earlier cell; the later
# definition is the one in effect once both cells have run.
class Batch:
    """One training/evaluation batch with its attention masks on ``device``.

    ``tgt`` (when given) is split into the decoder input ``tgt[:, :-1]`` and
    the labels ``tgt[:, 1:]``. ``tgt_mask`` combines the non-pad mask with the
    causal ``subsequent_mask``; ``ntokens`` is the count of non-pad labels.
    """

    def __init__(self, src, tgt=None, pad=0):
        self.src = src.to(device)
        # Derive the mask from the tensor that now lives on ``device``. Masking
        # the incoming ``src`` left ``src_mask`` on the caller's device.
        self.src_mask = (self.src != pad).unsqueeze(-2)
        if tgt is not None:
            self.tgt = tgt[:, :-1].to(device)
            self.tgt_y = tgt[:, 1:].to(device)
            causal = subsequent_mask(self.tgt.size(1))
            self.tgt_mask = (self.tgt != pad).unsqueeze(-2) & causal
            self.ntokens = (self.tgt_y != pad).sum().item()

In [101]:
import time

In [102]:
def run_epoch(data_iter, model, loss_compute):
    """Run one epoch over ``Batch``-like items and return the loss per token.

    A progress line is printed every 50 batches. ``T/S`` (tokens per second)
    counts every token processed since the previous progress line; it used to
    count only the current batch while timing the whole window, which
    under-reported throughput.
    """
    window_start = time.time()
    total_loss, total_tokens = 0, 0
    window_tokens = 0  # tokens seen since the last progress line
    for i, bat in enumerate(data_iter):
        out = model(bat.src, bat.tgt, bat.src_mask, bat.tgt_mask)
        loss = loss_compute(out, bat.tgt_y, bat.ntokens)
        total_loss += loss
        total_tokens += bat.ntokens
        window_tokens += bat.ntokens
        if i % 50 == 0:
            elapsed = time.time() - window_start
            # Guard against a zero interval on coarse timers.
            rate = window_tokens / elapsed if elapsed > 0 else float("inf")
            print(f"Step {i}| L:{loss/bat.ntokens:.4f}| T/S:{rate:.2f}")
            window_start = time.time()
            window_tokens = 0
    return total_loss / total_tokens

In [103]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedy decoding with a caller-supplied ``src_mask``.

    Starts every row with ``start_symbol`` and appends the arg-max token
    ``max_len - 1`` times; there is no early stop.
    NOTE(review): a later cell re-defines ``greedy_decode`` without the
    ``src_mask`` parameter; confirm which signature callers expect.
    """
    memory = model.encode(src, src_mask)
    batch_size = src.size(0)
    ys = torch.ones(batch_size, 1, dtype=torch.long, device=device).mul(start_symbol)
    for _ in range(1, max_len):
        tgt_mask = subsequent_mask(ys.size(1))
        decoded = model.decode(memory, src_mask, ys, tgt_mask)
        # Only the newest position is needed to pick the next token.
        prob = F.softmax(model.generator(decoded[:, -1]), dim=-1)
        next_word = torch.argmax(prob, dim=1, keepdim=True)
        ys = torch.cat((ys, next_word), dim=1)
    return ys

In [105]:
def data_gen(V, b, n):
    """Yield ``n`` random copy-task batches of ``b`` sequences each.

    Every batch is a ``(b, 10)`` tensor of integers from ``[1, V)`` with the
    first column set to 1, used as both source and target.

    The previous version yielded ``Batch(data, data)`` although the local
    tensor is named ``d``, so the first ``next()`` raised ``NameError``.
    """
    for _ in range(n):
        d = torch.randint(1, V, (b, 10), device=device)
        d[:, 0] = 1  # every sequence starts with token 1
        yield Batch(d, d)

In [106]:
def run_epoch(data_iter, model, loss_compute):
    """Run one pass over ``data_iter`` and return the average loss per token.

    Two item formats are accepted:

    * ``Batch``-like objects exposing ``src``, ``tgt``, ``src_mask``,
      ``tgt_mask``, ``tgt_y`` and ``ntokens`` (what ``data_gen`` yields). The
      previous version tried to unpack these as a pair and raised ``TypeError``.
    * ``(src, tgt)`` pairs, as before; masks are built here with token 0
      treated as padding.
    """
    total_tokens, total_loss = 0, 0
    for batch in data_iter:
        if hasattr(batch, "tgt_y"):
            # Pre-built batch: reuse its tensors and masks as they are.
            src, tgt_input, tgt_y = batch.src, batch.tgt, batch.tgt_y
            src_mask, tgt_mask = batch.src_mask, batch.tgt_mask
            ntokens = batch.ntokens
        else:
            src, tgt = batch
            src_mask = (src != 0).unsqueeze(-2)
            tgt_input = tgt[:, :-1]
            tgt_y = tgt[:, 1:]
            tgt_mask = (tgt_input != 0).unsqueeze(-2) & subsequent_mask(tgt_input.size(-1))
            ntokens = (tgt_y != 0).sum().item()
        out = model(src, tgt_input, src_mask, tgt_mask)
        total_loss += loss_compute(out, tgt_y, ntokens)
        total_tokens += ntokens
    return total_loss / total_tokens

In [107]:
def greedy_decode(model, src, max_len, start_symbol):
    """Decode greedily from ``src`` and return the generated token matrix.

    Token 0 in ``src`` is treated as padding when building the source mask.
    The result starts with ``start_symbol`` and grows by one arg-max token per
    step for ``max_len - 1`` steps; decoding never stops early.
    """
    pad_free = src != 0
    src_mask = pad_free.unsqueeze(-2)
    memory = model.encode(src, src_mask)
    ys = torch.ones(src.size(0), 1, dtype=torch.long, device=device) * start_symbol
    remaining = max_len - 1
    while remaining > 0:
        out = model.decode(memory, src_mask, ys, subsequent_mask(ys.size(1)))
        log_probs = model.generator(out[:, -1])
        prob = F.softmax(log_probs, dim=-1)
        next_word = prob.argmax(dim=1, keepdim=True)
        ys = torch.cat([ys, next_word], dim=1)
        remaining -= 1
    return ys

In [109]:
def main():
    """Train a small copy-task Transformer for two epochs, then decode once.

    The loss is the ``LabelSmoothing`` criterion built here. The previous
    version built it but passed ``nn.KLDivLoss('sum')`` to the loss computer
    instead: that positional argument is the legacy ``size_average`` flag, not
    ``reduction``, and a bare KL loss cannot take the class-index targets that
    ``SimpleLossCompute`` provides.

    If the optimizer cannot be constructed, the error is reported and training
    is skipped; decoding still runs on the untrained model.
    """
    model = make_model(11, 11, 2)
    crit = LabelSmoothing(11, 0)
    try:
        base = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
        # 512 matches make_model's default d_model.
        opt = NoamOpt(512, 1, 400, base)
    except Exception as e:
        print(f"Optimizer error: {e}; skipping training.")
        opt = None
    loss_comp = SimpleLossCompute(model.generator, crit, opt)
    if opt is not None:
        for ep in range(2):
            model.train()
            epoch_loss = run_epoch(data_gen(11, 30, 20), model, loss_comp)
            print(f"Epoch {ep}|Loss:{epoch_loss:.4f}")
    model.eval()
    ts = torch.arange(1, 11).unsqueeze(0).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        print("Decoded:", greedy_decode(model, ts, 10, 1))


if __name__ == '__main__':
    main()

Optimizer error: partially initialized module 'torch._inductor' from '/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/_inductor/__init__.py' has no attribute 'custom_graph_pass' (most likely due to a circular import); skipping training.
Decoded: tensor([[1, 9, 6, 8, 7, 2, 4, 4, 4, 4]])




SURVEY:

1. Rate the Guest Speakers.
    Nadia: 8/10
    Jeff: 9/10
    Sam: 10/10
    Fei: 8/10

2. We really enjoyed the presentations. A particular strength was that the speakers shared their real-world experiences with AI and where they see it affecting their careers in the future. Some more background on their early careers would have been helpful.

3. We would like more guest speakers in future classes because we felt they were the easiest way to relate to the content.

4. We think the final project has more pros than a final exam: it allows more time, which matters because coding is new to us. Additionally, a project is more creative, which fits this course perfectly. We recommend continuing with this format.