### Tokenisation

In [1]:
import numpy as np

# --- 1. Tokenization ---
# In a real model, this is a complex tokenizer. We'll fake it.
sentence = "the cat sat on the mat"
tokens = sentence.split(' ')
print(f"Tokens: {tokens}")

Tokens: ['the', 'cat', 'sat', 'on', 'the', 'mat']


In [2]:
# Create a vocabulary: a map from token to a unique ID
vocab = {token: i for i, token in enumerate(set(tokens))}
print(f"Vocabulary: {vocab}")

Vocabulary: {'on': 0, 'sat': 1, 'cat': 2, 'the': 3, 'mat': 4}


In [6]:
token_ids

[3, 2, 1, 0, 3, 4]

In [3]:
# Convert tokens to their numerical IDs
token_ids = [vocab[token] for token in tokens]
print(f"Token IDs: {token_ids}")


Token IDs: [3, 2, 1, 0, 3, 4]


In [4]:
# --- 2. Embedding ---
# Let's define the size of our embedding vectors (the "meaning space" dimension)
embedding_dim = 4
vocab_size = len(vocab)

# An embedding table is just a lookup matrix.
# Each row corresponds to a token ID's vector.
# In a real model, these numbers are learned during training.
np.random.seed(42)
embedding_table = np.random.rand(vocab_size, embedding_dim)
print("\nEmbedding Table (a learned lookup table):")
print(embedding_table)


Embedding Table (a learned lookup table):
[[0.37454012 0.95071431 0.73199394 0.59865848]
 [0.15601864 0.15599452 0.05808361 0.86617615]
 [0.60111501 0.70807258 0.02058449 0.96990985]
 [0.83244264 0.21233911 0.18182497 0.18340451]
 [0.30424224 0.52475643 0.43194502 0.29122914]]


In [5]:



# Convert our token IDs into their embedding vectors
sentence_embeddings = embedding_table[token_ids]
print(f"\nSentence as a sequence of embeddings (shape: {sentence_embeddings.shape}):")
print(sentence_embeddings)


Sentence as a sequence of embeddings (shape: (6, 4)):
[[0.83244264 0.21233911 0.18182497 0.18340451]
 [0.60111501 0.70807258 0.02058449 0.96990985]
 [0.15601864 0.15599452 0.05808361 0.86617615]
 [0.37454012 0.95071431 0.73199394 0.59865848]
 [0.83244264 0.21233911 0.18182497 0.18340451]
 [0.30424224 0.52475643 0.43194502 0.29122914]]


In [7]:
# Absolute positional embedings
import numpy as np

# Let's use the embeddings from Chapter 1
# sentence: "the cat sat on the mat"
# sentence_embeddings shape: (6, 4) -> 6 tokens, 4 dimensions each
sentence_embeddings = np.random.rand(6, 4)
seq_len, embedding_dim = sentence_embeddings.shape

# --- 1. Absolute Positional Embeddings ---
# Create a learnable table for positions
max_seq_len = 10 # Model can handle up to 10 tokens
positional_embedding_table = np.random.rand(max_seq_len, embedding_dim)

# Get the positional vectors for our sequence (positions 0 through 5)
positional_encodings = positional_embedding_table[:seq_len, :]

# Add them to the word embeddings
final_embeddings_abs = sentence_embeddings + positional_encodings
print("Final embeddings with Absolute Positional info:")
print(final_embeddings_abs.shape)


Final embeddings with Absolute Positional info:
(6, 4)


In [9]:

positional_embedding_table

array([[0.25877998, 0.66252228, 0.31171108, 0.52006802],
       [0.54671028, 0.18485446, 0.96958463, 0.77513282],
       [0.93949894, 0.89482735, 0.59789998, 0.92187424],
       [0.0884925 , 0.19598286, 0.04522729, 0.32533033],
       [0.38867729, 0.27134903, 0.82873751, 0.35675333],
       [0.28093451, 0.54269608, 0.14092422, 0.80219698],
       [0.07455064, 0.98688694, 0.77224477, 0.19871568],
       [0.00552212, 0.81546143, 0.70685734, 0.72900717],
       [0.77127035, 0.07404465, 0.35846573, 0.11586906],
       [0.86310343, 0.62329813, 0.33089802, 0.06355835]])

In [8]:
positional_encodings

array([[0.25877998, 0.66252228, 0.31171108, 0.52006802],
       [0.54671028, 0.18485446, 0.96958463, 0.77513282],
       [0.93949894, 0.89482735, 0.59789998, 0.92187424],
       [0.0884925 , 0.19598286, 0.04522729, 0.32533033],
       [0.38867729, 0.27134903, 0.82873751, 0.35675333],
       [0.28093451, 0.54269608, 0.14092422, 0.80219698]])