In [4]:
import numpy as np

# Model hyperparameters, read as module-level globals by the cells below.
vocab_size = 88  # number of distinct token ids (presumably one per piano key -- TODO confirm)
seq_len = 8      # longest context, in tokens, that is fed to the model at once
d_model = 64     # width of every embedding / hidden vector

In [5]:
# random token embeddings: (vocab_size, d_model)
# Row i is the vector looked up for token id i. The 0.01 factor keeps the
# initial values small compared with the sin/cos positional encoding.
token_embedding = np.random.randn(vocab_size, d_model) * 0.01

# sinusoidal positional encoding: (seq_len, d_model)
def get_positional_encoding(seq_len, d_model):
    """Build a fixed sinusoidal position table.

    Args:
        seq_len: number of positions (rows of the table).
        d_model: encoding width (columns of the table).

    Returns:
        np.ndarray of shape (seq_len, d_model). Even columns hold the sine and
        odd columns the cosine of position / 10000**(2*(col//2)/d_model), so
        columns 2k and 2k+1 share one frequency.
    """
    positions = np.arange(seq_len).reshape(-1, 1)   # (seq_len, 1)
    columns = np.arange(d_model).reshape(1, -1)     # (1, d_model)

    # Each sin/cos column pair uses the same exponent.
    exponents = (2 * (columns // 2)) / np.float32(d_model)
    angles = positions * (1 / np.power(10000, exponents))

    # Pick sine on even columns and cosine on odd columns.
    even_column = columns % 2 == 0
    return np.where(even_column, np.sin(angles), np.cos(angles))

# Precomputed once for the full context length; forward() slices the first
# len(tokens) rows of this table.
positional_encoding = get_positional_encoding(seq_len, d_model)

In [6]:
# initialize projection matrices
# Query, key, value and output projections, each of shape (d_model, d_model),
# drawn in the order W_q, W_k, W_v, W_o and scaled by 1/sqrt(d_model).
W_q, W_k, W_v, W_o = (
    np.random.randn(d_model, d_model) / np.sqrt(d_model) for _ in range(4)
)

def self_attention(x):
    """Single-head causal self-attention over one sequence.

    Args:
        x: array of shape (seq_len, d_model).

    Returns:
        Array of shape (seq_len, d_model): the attention-weighted values
        passed through the output projection W_o.
    """
    queries, keys, values = x @ W_q, x @ W_k, x @ W_v

    # Scaled dot-product similarities, shape (seq_len, seq_len).
    logits = (queries @ keys.T) / np.sqrt(d_model)

    # Causal mask: entries strictly above the diagonal (column > row) get
    # -1e9 added, so their softmax weight is effectively zero.
    logits = logits + np.triu(np.ones_like(logits), 1) * -1e9

    attention = softmax(logits)                 # each row sums to 1
    return attention @ values @ W_o

def softmax(x):
    """Softmax along the last axis.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    exps = np.exp(x - row_max)
    totals = np.sum(exps, axis=-1, keepdims=True)
    return exps / totals


In [7]:
# layernorm parameters
# gamma (scale) starts at 1 and beta (shift) at 0, so layernorm() initially
# returns the plain normalized activations. layernorm() reads these as
# globals, so every call shares the same pair.
gamma = np.ones((d_model,))
beta = np.zeros((d_model,))

def layernorm(x, eps=1e-5):
    """Normalize x along its last axis, then apply the global gamma and beta.

    Args:
        x: array whose last axis is the feature axis.
        eps: constant added to the variance before the square root so the
            division is safe when the variance is zero.

    Returns:
        Array with the same shape as x.
    """
    centered = x - x.mean(axis=-1, keepdims=True)
    spread = np.sqrt(x.var(axis=-1, keepdims=True) + eps)
    scaled = gamma * centered
    return scaled / spread + beta

# feedforward layer
# Two-layer MLP parameters: d_model -> 4 * d_model -> d_model.
# Each weight matrix is scaled by 1/sqrt of its input dimension; biases start at zero.
W1 = np.random.randn(d_model, d_model * 4) / np.sqrt(d_model)
b1 = np.zeros((d_model * 4,))
W2 = np.random.randn(d_model * 4, d_model) / np.sqrt(d_model * 4)
b2 = np.zeros((d_model,))

def feedforward(x):
    """Position-wise MLP: linear (W1, b1) -> ReLU -> linear (W2, b2)."""
    hidden = x @ W1 + b1
    activated = np.maximum(0, hidden)   # ReLU
    return activated @ W2 + b2


In [8]:
def transformer_block(x):
    """Apply one attention + feed-forward block to a sequence.

    Each sub-layer output is added back to its input (residual connection)
    and the sum is layer-normalized.

    Args:
        x: array of shape (seq_len, d_model).

    Returns:
        Array of the same shape as x.
    """
    attended = layernorm(x + self_attention(x))
    return layernorm(attended + feedforward(attended))


In [9]:
def forward(tokens):
    """Embed a token sequence and run it through the transformer block.

    Args:
        tokens: list of int token ids, e.g. [60, 62, 64].

    Returns:
        Array with one d_model-wide row per input token.
    """
    n_tokens = len(tokens)
    embedded = token_embedding[tokens]
    # Add the positional rows for positions 0 .. n_tokens - 1.
    return transformer_block(embedded + positional_encoding[:n_tokens])


In [10]:
# Output head: projects a d_model-wide hidden vector to vocab_size logits.
# (Sharing weights with token_embedding would be more efficient, but a
# separate matrix keeps things simple.)
W_out = np.random.randn(d_model, vocab_size) / np.sqrt(d_model)

In [15]:
# NOTE(review): the only definition of generate_sequence visible in this
# notebook is in a later cell, so on a fresh top-to-bottom run this cell would
# raise NameError. The recorded output presumably came from an earlier,
# since-edited definition -- TODO confirm and move this cell below it.
generated = generate_sequence([60], max_len=16)
print("Generated sequence:", generated)

Generated sequence: [60, 13, 8, 8, 67, 67, 67, 39, 39, 39, 39, 39, 39, 39, 39, 39]


In [17]:
def sample_from_probs(probs, temperature=1.0):
    """
    Sample an index from a probability distribution with optional temperature.

    Args:
        probs: np.ndarray – shape (vocab_size,), expected to sum to 1
        temperature: float – >1.0 = more random, <1.0 = more deterministic;
            0 deterministically returns the most probable index (greedy)

    Returns:
        int – sampled index

    Raises:
        ValueError – if temperature is negative
    """
    if temperature < 0:
        # A negative temperature would invert the distribution (least likely
        # index becomes most likely), which is never what the caller wants.
        raise ValueError(f"temperature must be non-negative, got {temperature}")
    if temperature == 0:
        # Limit of the tempered distribution as temperature -> 0. Handling it
        # here avoids a division by zero that would produce NaN probabilities.
        return int(np.argmax(probs))
    if temperature != 1.0:
        # Re-temper in log space; the small epsilon guards log(0).
        scaled = np.log(probs + 1e-9) / temperature
        # Subtract the max before exponentiating for numerical stability.
        scaled = np.exp(scaled - np.max(scaled))
        probs = scaled / np.sum(scaled)
    return int(np.random.choice(len(probs), p=probs))


def predict_next_note(x, temperature=1.0):
    """Sample the next token id from the model output at the last position.

    Args:
        x: hidden states with one row per position; only the last row is used.
        temperature: forwarded to sample_from_probs.

    Returns:
        int – the sampled token id.
    """
    last_hidden = x[-1]
    next_probs = softmax(last_hidden @ W_out)
    return sample_from_probs(next_probs, temperature)

In [18]:
def generate_sequence(start_seq, max_len=32, temperature=1.0):
    """Extend start_seq one sampled token at a time until it has max_len tokens.

    Args:
        start_seq: iterable of int token ids used as the prompt.
        max_len: total length of the returned sequence, prompt included.
        temperature: sampling temperature forwarded to predict_next_note.

    Returns:
        list of token ids; if start_seq already has max_len or more tokens,
        a copy of it is returned unchanged.
    """
    tokens = list(start_seq)
    while len(tokens) < max_len:
        # The model only sees the most recent seq_len tokens.
        context = tokens[-seq_len:]
        hidden = forward(context)
        tokens.append(predict_next_note(hidden, temperature))
    return tokens

In [24]:
# Sample 15 tokens after the seed token 60; temperature=1 samples from the
# softmax probabilities as-is.
# NOTE(review): no random seed is set in the visible cells, so the printed
# sequence will differ from run to run.
generated = generate_sequence([60], max_len=16, temperature=1)
print("Generated:", generated)

Generated: [60, 80, 72, 5, 73, 0, 56, 37, 42, 44, 14, 69, 72, 69, 69, 84]


In [25]:
def make_training_data():
    """Build every window of seq_len consecutive token ids within the vocabulary.

    Returns:
        np.ndarray with one row per window; row k is
        [k, k + 1, ..., k + seq_len - 1].
    """
    n_windows = vocab_size - seq_len + 1
    windows = [list(range(first, first + seq_len)) for first in range(n_windows)]
    return np.array(windows)

def cross_entropy(pred, target_idx):
    """
    Negative log-likelihood of the ground-truth token.

    pred: shape (vocab_size,) – probabilities
    target_idx: int – ground-truth token

    A small epsilon is added before the log so a zero probability yields a
    large finite loss instead of infinity.
    """
    target_prob = pred[target_idx]
    return -np.log(target_prob + 1e-9)


In [33]:
# Re-initialize the two parameter arrays that train_simple() updates
# (token_embedding and W_out). The attention, layernorm and feed-forward
# parameters defined in earlier cells are NOT reset here.
token_embedding = np.random.randn(vocab_size, d_model) * 0.01
W_out = np.random.randn(d_model, vocab_size) / np.sqrt(d_model)

def forward_simple(tokens):
    """Return the mean of the tokens' embedding rows.

    Token order is ignored. The result has shape (d_model,).
    """
    rows = token_embedding[tokens]
    return rows.mean(axis=0)

def train_simple(num_epochs=1000, lr=0.1):
    """Train the mean-embedding model with per-example gradient descent.

    Each training window from make_training_data() is split into a context
    (all tokens but the last) and a target (the last token). The context is
    encoded with forward_simple(), projected through W_out, and the
    cross-entropy against the target is minimized.

    Only the globals token_embedding and W_out are updated, in place.

    Args:
        num_epochs: number of passes over the training windows.
        lr: learning rate applied to every update.

    Returns:
        None. The mean loss is printed every 10th epoch.
    """
    global token_embedding, W_out
    data = make_training_data()

    for epoch in range(num_epochs):
        total_loss = 0
        for seq in data:
            x_tokens = seq[:-1]
            y_target = seq[-1]

            # forward
            x = forward_simple(x_tokens)
            logits = x @ W_out
            probs = softmax(logits)
            loss = cross_entropy(probs, y_target)
            total_loss += loss

            # backward: d(loss)/d(logits) = probs - one_hot(target)
            # (ignoring the 1e-9 epsilon inside cross_entropy).
            # Work on a copy so the in-place subtraction does not corrupt probs.
            dlogits = probs.copy()
            dlogits[y_target] -= 1

            dW_out = np.outer(x, dlogits)
            # Must be computed before W_out is updated below.
            dx = dlogits @ W_out.T
            # x is a mean over the context, so each context token receives an
            # equal share of the gradient.
            d_embed = dx / len(x_tokens)

            # update
            W_out -= lr * dW_out
            for idx in x_tokens:
                token_embedding[idx] -= lr * d_embed

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1:03}: loss = {total_loss / len(data):.4f}")


In [39]:
# Updates only token_embedding and W_out, using the forward_simple()
# mean-embedding model.
train_simple(num_epochs=2000, lr=0.003)

# generate and check output
# NOTE(review): generate_sequence() calls forward(), which runs the transformer
# block. Its attention and feed-forward weights are never updated by
# train_simple(), so generation does not use the model that was trained.
# The recorded output stops at epoch 391 of 2000 and shows no printed sequence,
# so this cell appears to have been interrupted -- TODO confirm.
print(generate_sequence([40], max_len=16, temperature=0.01))


Epoch 001: loss = 0.0009
Epoch 011: loss = 0.0009
Epoch 021: loss = 0.0009
Epoch 031: loss = 0.0009
Epoch 041: loss = 0.0009
Epoch 051: loss = 0.0009
Epoch 061: loss = 0.0009
Epoch 071: loss = 0.0009
Epoch 081: loss = 0.0009
Epoch 091: loss = 0.0009
Epoch 101: loss = 0.0009
Epoch 111: loss = 0.0009
Epoch 121: loss = 0.0009
Epoch 131: loss = 0.0009
Epoch 141: loss = 0.0009
Epoch 151: loss = 0.0009
Epoch 161: loss = 0.0009
Epoch 171: loss = 0.0009
Epoch 181: loss = 0.0009
Epoch 191: loss = 0.0009
Epoch 201: loss = 0.0009
Epoch 211: loss = 0.0009
Epoch 221: loss = 0.0009
Epoch 231: loss = 0.0009
Epoch 241: loss = 0.0009
Epoch 251: loss = 0.0009
Epoch 261: loss = 0.0009
Epoch 271: loss = 0.0009
Epoch 281: loss = 0.0009
Epoch 291: loss = 0.0009
Epoch 301: loss = 0.0009
Epoch 311: loss = 0.0009
Epoch 321: loss = 0.0009
Epoch 331: loss = 0.0009
Epoch 341: loss = 0.0009
Epoch 351: loss = 0.0009
Epoch 361: loss = 0.0009
Epoch 371: loss = 0.0009
Epoch 381: loss = 0.0009
Epoch 391: loss = 0.0009


In [40]:
# Prompt with the context of the first training window (0..6), whose training
# target is 7; a model that learned the pattern would continue with 7, 8, 9, ...
print(generate_sequence([0,1,2,3,4,5,6], max_len=16, temperature=0.5))


[0, 1, 2, 3, 4, 5, 6, 13, 20, 9, 71, 78, 71, 78, 71, 78]
