#Project Rule:
**This project measures reasoning ability, not language fluency.**

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


# **Chosen Task: SCAN – compositional command understanding**
**Evaluation: Accuracy on unseen command compositions**

# Success Criteria
# Model size ≤ 10M parameters
# Beats a 50–100M baseline on SCAN generalization split
# No chain-of-thought output
# Accuracy is the only metric

In [2]:
!pip install torch numpy tqdm



In [47]:
import random
import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


In [3]:
import os

folders = [
    "data",
    "models",
    "training",
    "evaluation",
    "experiments"
]

for f in folders:
    os.makedirs(f, exist_ok=True)

print("Project structure ready.")


Project structure ready.


SCAN is:

Input: command string

"jump around right twice"


Output: action sequence

JUMP RTURN JUMP RTURN

Decisions (write this in a Markdown cell):

Token-level sequence-to-sequence

Teacher forcing

Cross-entropy loss

Exact-match accuracy

In [4]:
# STEP 9: Vocabulary setup

from collections import Counter

def build_vocab(sequences):
    counter = Counter()
    for seq in sequences:
        counter.update(seq.split())
    vocab = {tok: i+2 for i, tok in enumerate(counter.keys())}
    vocab["<pad>"] = 0
    vocab["<eos>"] = 1
    return vocab


In [5]:
def encode(seq, vocab):
    return [vocab[token] for token in seq.split()] + [vocab["<eos>"]]


In [6]:
import torch
from torch.utils.data import Dataset

class ScanDataset(Dataset):
    def __init__(self, inputs, outputs, in_vocab, out_vocab):
        self.inputs = inputs
        self.outputs = outputs
        self.in_vocab = in_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        x = torch.tensor(encode(self.inputs[idx], self.in_vocab))
        y = torch.tensor(encode(self.outputs[idx], self.out_vocab))
        return x, y


In [44]:
import torch
import torch.nn as nn

class ReasoningBottleneck(nn.Module):
    def __init__(self, d_model, n_slots=4, n_heads=4):
        super().__init__()
        self.n_slots = n_slots

        self.slots = nn.Parameter(torch.randn(1, n_slots, d_model))

        self.attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            batch_first=True
        )

        # ✅ Stabilizer
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # x: (B, T, D)
        B = x.size(0)

        # Expand slots across batch
        slots = self.slots.expand(B, -1, -1)  # (B, n_slots, D)

        # Slots attend to tokens
        slots, _ = self.attn(
            query=slots,
            key=x,
            value=x
        )

        # ✅ Stabilize representations
        slots = self.norm(slots)

        return slots  # (B, n_slots, D)


In [49]:
class TinyTransformer(nn.Module):
    def __init__(self, vocab_in, vocab_out, d_model=128, n_heads=4, n_layers=2):
        super().__init__()

        self.embed_in = nn.Embedding(vocab_in, d_model)
        self.embed_out = nn.Embedding(vocab_out, d_model)

        self.pos_enc = nn.Parameter(torch.zeros(1, 512, d_model))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            batch_first=True
        )

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_heads,
            batch_first=True
        )

        self.encoder = nn.TransformerEncoder(encoder_layer, n_layers)
        self.reasoning = ReasoningBottleneck(d_model=d_model, n_slots=6)
        self.decoder = nn.TransformerDecoder(decoder_layer, n_layers)

        self.output = nn.Linear(d_model, vocab_out)

    def forward(self, src, tgt):
        src = self.embed_in(src) + self.pos_enc[:, :src.size(1)]
        tgt = self.embed_out(tgt) + self.pos_enc[:, :tgt.size(1)]

        encoded = self.encoder(src)
        memory = self.reasoning(encoded)

        out = self.decoder(tgt, memory)
        return self.output(out)


In [8]:
def train_step(model, batch, optimizer, criterion):
    model.train()
    src, tgt = batch
    tgt_input = tgt[:, :-1]
    tgt_output = tgt[:, 1:]

    logits = model(src, tgt_input)
    loss = criterion(
        logits.reshape(-1, logits.size(-1)),
        tgt_output.reshape(-1)
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()


In [9]:
!git clone https://github.com/brendenlake/SCAN.git


Cloning into 'SCAN'...
remote: Enumerating objects: 205, done.[K
remote: Total 205 (delta 0), reused 0 (delta 0), pack-reused 205 (from 1)[K
Receiving objects: 100% (205/205), 11.10 MiB | 14.88 MiB/s, done.
Resolving deltas: 100% (173/173), done.
Updating files: 100% (212/212), done.


In [13]:
def load_scan(path):
    inputs, outputs = [], []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                inp, out = line.split(" OUT: ")
                inp = inp.replace("IN: ", "")
                inputs.append(inp)
                outputs.append(out)
    return inputs, outputs


train_inputs, train_outputs = load_scan(
    "SCAN/simple_split/tasks_train_simple.txt"
)

test_inputs, test_outputs = load_scan(
    "SCAN/length_split/tasks_test_length.txt"
)

print(len(train_inputs), len(test_inputs))

16728 3920


In [14]:
in_vocab = build_vocab(train_inputs)
out_vocab = build_vocab(train_outputs)

print(len(in_vocab), len(out_vocab))


15 8


In [15]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    xs, ys = zip(*batch)
    xs = pad_sequence(xs, batch_first=True, padding_value=0)
    ys = pad_sequence(ys, batch_first=True, padding_value=0)
    return xs.cuda(), ys.cuda()


In [16]:
from torch.utils.data import DataLoader

train_dataset = ScanDataset(
    train_inputs, train_outputs, in_vocab, out_vocab
)

test_dataset = ScanDataset(
    test_inputs, test_outputs, in_vocab, out_vocab
)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)


In [17]:
device = "cuda"

model = TinyTransformer(
    vocab_in=len(in_vocab),
    vocab_out=len(out_vocab)
).to(device)


In [18]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)


In [45]:
for epoch in range(10):
    total_loss = 0

    for batch in train_loader:
        loss = train_step(model, batch, optimizer, criterion)
        total_loss += loss

    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")


Epoch 1 | Loss: 0.0016
Epoch 2 | Loss: 0.0017
Epoch 3 | Loss: 0.0008
Epoch 4 | Loss: 0.0014
Epoch 5 | Loss: 0.0009
Epoch 6 | Loss: 0.0004
Epoch 7 | Loss: 0.0024
Epoch 8 | Loss: 0.0019
Epoch 9 | Loss: 0.0007
Epoch 10 | Loss: 0.0016


In [46]:
def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for src, tgt in loader:
            tgt_input = tgt[:, :-1]
            logits = model(src, tgt_input)
            preds = logits.argmax(-1)

            for i in range(preds.size(0)):
                if torch.equal(preds[i], tgt[i, 1:preds.size(1)+1]):
                    correct += 1
                total += 1

    return correct / total


acc = evaluate(model, test_loader)
print("Test Accuracy:", acc)


Test Accuracy: 0.4813775510204082


In [41]:
acc = evaluate(model, test_loader)
print("Test Accuracy:", acc)


Test Accuracy: 0.42423469387755103


In [None]:
seeds = [1, 2, 3, 4, 5]
results = []

for seed in seeds:
    print(f"\nRunning seed {seed}")
    set_seed(seed)

    model = TinyTransformer(
        vocab_in=len(in_vocab),
        vocab_out=len(out_vocab)
    ).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(10):
        for batch in train_loader:
            train_step(model, batch, optimizer, criterion)

    acc = evaluate(model, test_loader)
    results.append(acc)

    print(f"Seed {seed} → Test Acc = {acc:.4f}")

print("\nFinal Results:")
print("Mean:", sum(results) / len(results))
print("Std:", torch.tensor(results).std().item())



Running seed 1
