In [2]:
import time
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F



### 1. Load the dataset

In [3]:
# Let's download the tiny shakespeare dataset from Karpathy's github!
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

# Let's load the dataset into memory
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# let's look at the first 100 characters
print('----')
print(text[:100])

# let's print the length of the dataset
print('----')
print(f"Length of dataset: {len(text)} characters")

--2024-11-12 14:51:04--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-11-12 14:51:04 (7.31 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

----
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
----
Length of dataset: 1115394 characters


### 2. Tokenize the dataset (Character-based)

In [4]:
# Vocabulary = every distinct character that occurs in the corpus, in sorted order
chars = list(set(text))
chars.sort()
vocab_size = len(chars)

# Show the vocabulary followed by its size
print(chars, '---', f"Vocabulary size: {vocab_size} characters", sep='\n')

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
---
Vocabulary size: 65 characters


In [5]:
# Lookup tables between characters and integer ids (an id is the character's index in `chars`)
c2i = {c: i for i, c in enumerate(chars)}  # character to integer
i2c = dict(enumerate(chars))               # integer to character


def encode(s):
    """Encode a string into a list of integer token ids, one id per character.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [c2i[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join(i2c[i] for i in l)


# Let's test it out: the round trip must give back the original string
print(encode("hello"))
print(decode(encode("hello")))

[46, 43, 50, 50, 53]
hello


In [6]:
# Encode the full corpus once and keep it as a 1-D int64 tensor of token ids
data = torch.tensor(encode(text), dtype=torch.int64)

# Report shape/dtype, then peek at the first 100 token ids
print(f'data.shape={data.shape}, data.dtype={data.dtype}', '---', data[:100], sep='\n')

data.shape=torch.Size([1115394]), data.dtype=torch.int64
---
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


#### 2.a) Let's split the dataset into training and validation sets

In [7]:
# The first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9*data.shape[0])
train_data, val_data = data[:n], data[n:]

#### 2.b) Let's see how to create input and targets during the training
The goal of training a transformer language model is to predict "the next token in the sequence" given "the current sequence of tokens".

**Example:**
`Hello World! I am ChatGPT!`

**Prediction:**\
Input: A sequence of tokens -> Output: The next token in the sequence
___
##### *Approach 1:*
**Prediction 1:**
Input: `[H, E, L, L, O, W, O, R]` -> Output: `[L]`
___
##### *Approach 2 (Sliding Window):*    
**Prediction 1:**
Input: `[H]` -> Output: `[E]`

**Prediction 2:**
Input: `[H, E]` -> Output: `[L]`

**Prediction 3:**
Input: `[H, E, L]` -> Output: `[L]`

...

**Prediction 8:**
Input: `[H, E, L, L, O, W, O, R]` -> Output: `[L]`

The sliding window approach:
- Creates multiple training examples from a single input
- While the sequence length is fixed, this teaches the model to handle contexts of different lengths -> critical for flexibility of the model
- Can be done in parallel without pulling new data from the memory
- Because of progressive learning, the training becomes more smooth and stable.
- Improves the model understanding of contextual relationships between tokens

___

So, here to implement the sliding window, we create the input and output via the format below:
- **Input:**`[H, E, L, L, O, W, O, R]`
- **Target:** `[E, L, L, O, W, O, R, L]`

We can see how this is actually implemented in the self-attention section.

In [20]:
# Toy sizes used for the walkthrough below; larger values are set later for the real model.
#   seq_size   -> how many characters are in a sequence
#   batch_size -> how many sequences are in a batch
seq_size, batch_size = 8, 4

def get_batch(split):
    """Sample a random batch of (context, target) windows from one data split.

    Reads the notebook globals `train_data`, `val_data`, `seq_size` and `batch_size`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A `(context, target)` pair of stacked tensors with one row per sampled start
        position. Each row holds `seq_size` consecutive tokens, and `target` is the
        same window shifted one token to the right.
    """
    source = val_data if split != 'train' else train_data

    # One random window start per batch element. The upper bound is exclusive, which
    # keeps the shifted target window (ending at start + seq_size) inside the data.
    starts = torch.randint(0, len(source) - seq_size, (batch_size,))

    # Stack the per-start slices (a list of 1-D tensors) into one 2-D tensor each
    context = torch.stack([source[s:s + seq_size] for s in starts])
    target = torch.stack([source[s + 1:s + seq_size + 1] for s in starts])
    return context, target

# Draw one training batch and report its sizes
context, target = get_batch('train')

print(
    '---',
    f'seq_size={seq_size}, batch_size={batch_size}',
    f'context.shape={context.shape}, target.shape={target.shape}',
    sep='\n',
)

---
seq_size=8, batch_size=4
context.shape=torch.Size([4, 8]), target.shape=torch.Size([4, 8])


In [9]:
# Show the raw token ids of the sampled batch (inputs first, then the shifted targets)
print(f'context={context}', f'target ={target}', sep='\n')

context=tensor([[ 1, 52, 53, 58,  1, 58, 46, 43],
        [50, 40, 53, 63, 57,  1, 44, 56],
        [ 1, 39, 52, 57, 61, 43, 56,  1],
        [47, 58,  1, 61, 53, 59, 50, 42]])
target =tensor([[52, 53, 58,  1, 58, 46, 43, 63],
        [40, 53, 63, 57,  1, 44, 56, 53],
        [39, 52, 57, 61, 43, 56,  1, 52],
        [58,  1, 61, 53, 59, 50, 42,  1]])


#### Let's see it in action

In [10]:
b = 0  # look at a single row of the batch
print(f'sequence: "{decode(context[b,:].tolist())}"')

# As text: every prefix of the row is an input, the token right after it is the target
for t in range(seq_size):
    input_seq, output_seq = context[b, :t+1].tolist(), [target[b, t].item()]
    print(f'{decode(input_seq)} --> {decode(output_seq)}')

# The same pairs as raw token ids
print('---in numbers---')
t = 0
while t < seq_size:
    print(f'input: {context[b,:t+1]} --> output: {target[b,t]}')
    t += 1

sequence: " not the"
  --> n
 n --> o
 no --> t
 not -->  
 not  --> t
 not t --> h
 not th --> e
 not the --> y
---in numbers---
input: tensor([1]) --> output: 52
input: tensor([ 1, 52]) --> output: 53
input: tensor([ 1, 52, 53]) --> output: 58
input: tensor([ 1, 52, 53, 58]) --> output: 1
input: tensor([ 1, 52, 53, 58,  1]) --> output: 58
input: tensor([ 1, 52, 53, 58,  1, 58]) --> output: 46
input: tensor([ 1, 52, 53, 58,  1, 58, 46]) --> output: 43
input: tensor([ 1, 52, 53, 58,  1, 58, 46, 43]) --> output: 63


___
**From here, we will start building the blocks of the transformer!**

<div align="center">
<img src="assets/transformer.png" width="200" alt="Transformer">
</div>


**Note:** 

*Input context* has the shape [Batch, Sequence Length] or `[B,T]`

*Embedding tensor* would be of shape [Batch, Sequence Length, Channel] or `[B,T,C]`

### 3. Embedding
Here, we are using a simple trainable embedding. It is essentially a look up table that for each "token", stores the "embedding".

In [11]:
class TokenEmbedding(nn.Module):
    """Trainable lookup table that maps each token id to an embedding vector."""

    def __init__(self, vocab_size, embed_dim):
        """
        Args:
            vocab_size: number of rows in the table (one per distinct token).
            embed_dim: length of each embedding vector.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, input_context):
        """Embed `input_context` and print the input and output shapes.

        Returns the looked-up embeddings; the embedding dimension is appended as
        a trailing axis to the input's shape.
        """
        embedding_table = self.token_embedding_table(input_context)

        # Shape report for the walkthrough
        for line in (
            f'input_contex.shape={input_context.shape}',
            f'embedding_table.shape={embedding_table.shape}',
        ):
            print(line)

        return embedding_table

In [12]:
# vocab_size (the number of unique tokens in our text) was determined earlier: 65

# Embedding dimension: how many numbers describe each token
embed_dim = 65

# Build the embedding layer and embed the sample batch
org_embedding = TokenEmbedding(vocab_size=vocab_size, embed_dim=embed_dim)
org_embd_context = org_embedding(context)

input_contex.shape=torch.Size([4, 8])
embedding_table.shape=torch.Size([4, 8, 65])


### 4. Self attention
<div align="center">
<img src="assets/sa.png" width="150" alt="Self Attention">
</div>


*Inputs to the SelfAttention:* Embedded context `[B,T,C]`

1- Create **linear transformation matrices** for **Key, Query, Value** matrices.\
*Note:* We build the $W_K$, $W_Q$, $W_V$ matrices for all tokens in **a sequence**.\
These are of shape `[C,C]` and applied in parallel across the batch.

2- Use **Query** and **Key** matrices to create the base attention score\
*Note:* $Q$ = (Embedded context) @ $W_Q$, $K$ = (Embedded context) @ $W_K$\
$Q$ and $K$ have shape `[B,T,C] @ [C,C] = [B,T,C]`

base_scores = `Q @ K.T` will be of dimension: `[B,T,C] @ [B,C,T] = [B,T,T]`

3- Scale it by $1/\sqrt{d_{emb}}$ (i.e. divide the scores by $\sqrt{d_{emb}}$), where $d_{emb}$ in the code is referred to as `C` (Channel)

___
**Let's talk about Masking:** Let's revisit our input before entering the transformer block. Imagine B=1:

context input: `[H, E, L, L, O, W, O, R]`\
target output: `[E, L, L, O, W, O, R, L]`

```
H -> [0.1, 0.2, ..., 0.0]  # C=65 numbers
E -> [0.3, 0.1, ..., 0.2]  # C=65 numbers
context = [
    [0.1, 0.2, ..., 0.0],  # H
    [0.3, 0.1, ..., 0.2],  # E
    [...],                  # L
    [...],                  # L
    [...],                  # O
    [...],                  # W
    [...],                  # O
    [...],                  # R
]  # shape: [B=1, T=8, C=65]

Q = context @ Wq  # [1, 8, 65]
K = context @ Wk  # [1, 8, 65]

scores = [
    # H    E    L    L    O    W    O    R
    [H·H, H·E, H·L, H·L, H·O, H·W, H·O, H·R],  # H's interactions
    [E·H, E·E, E·L, E·L, E·O, E·W, E·O, E·R],  # E's interactions
    [L·H, L·E, L·L, L·L, L·O, L·W, L·O, L·R],  # L's interactions
    [L·H, L·E, L·L, L·L, L·O, L·W, L·O, L·R],  # L's interactions
    [O·H, O·E, O·L, O·L, O·O, O·W, O·O, O·R],  # O's interactions
    [W·H, W·E, W·L, W·L, W·O, W·W, W·O, W·R],  # W's interactions
    [O·H, O·E, O·L, O·L, O·O, O·W, O·O, O·R],  # O's interactions
    [R·H, R·E, R·L, R·L, R·O, R·W, R·O, R·R],  # R's interactions
]  # shape: [1, 8, 8] or [B, T, T]
```
Remember that we want to have predictions for the next token via a sliding window approach:
- At step 0 (first row), we want the input to be `[H]` and the target to be `[E]`
- At step 1 (second row), we want the input to be `[H, E]` and the target to be `[L]`
- ...
- At step 7 (last row), we want the input to be `[H, E, L, L, O, W, O, R]` and the target to be `[L]`

So, we essentially want the scores to be like:
```
scores = [
    # H    E    L    L    O    W    O    R
    [H·H, 0  ,  0  , 0  , 0  , 0  , 0  , 0 ],  # H's interactions
    [E·H, E·E,  0  , 0  , 0  , 0  , 0  , 0 ],  # E's interactions
    [L·H, L·E, L·L,  0  , 0  , 0  , 0  , 0 ],  # L's interactions
    [L·H, L·E, L·L, L·L,  0  , 0  , 0  , 0 ],  # L's interactions
    [O·H, O·E, O·L, O·L, O·O,  0  , 0  , 0 ],  # O's interactions
    [W·H, W·E, W·L, W·L, W·O, W·W,  0  , 0 ],  # W's interactions
    [O·H, O·E, O·L, O·L, O·O, O·W, O·O,  0 ],  # O's interactions
    [R·H, R·E, R·L, R·L, R·O, R·W, R·O, R·R],  # R's interactions
]  # shape: [1, 8, 8] or [B, T, T]
```
This is why we need to mask the future tokens. Instead of setting them to 0, we set them to $-\infty$ so that after the softmax, they become 0.
___


4- Masking the future tokens with $-\infty$ \
5- Softmax:
$$\text{Softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}$$
6- Multiply the attention weights (the softmax output) by the Value matrix to create the attention output

In [13]:
class SelfAttention(nn.Module):
    """Single-head causal self-attention that prints its intermediate tensors.

    Query, key and value are bias-free linear maps from `embed_dim` to `embed_dim`.
    `forward` prints the attention scores before masking, after masking and after
    the softmax so each step can be inspected in the notebook.
    """

    def __init__(self,embed_dim):
        super().__init__()

        ### 1) q, k, v linear transformation matrices
        self.query = nn.Linear(embed_dim, embed_dim, bias=False)
        self.key   = nn.Linear(embed_dim, embed_dim, bias=False)
        self.value = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self,context):
        """Apply causal self-attention.

        Args:
            context: embedded input of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, C); row t depends only on positions 0..t.
        """
        q = self.query(context) # dimension: (B, T, C)
        k = self.key(context)   # dimension: (B, T, C)
        v = self.value(context) # dimension: (B, T, C)

        ### 2) Q @ K^T
        # The first dimension stays the batch size "B", so PyTorch can run the
        # matmul in parallel over the batch. Only the last two dimensions
        # (-2 and -1) of k are transposed.
        print('q.shape,k.shape=',q.shape,k.shape)
        k_transposed = k.transpose(-2, -1) # swaps dim "-2" with dim "-1" -> (B, C, T)
        print('k_transposed.shape=',k_transposed.shape)

        ### 2-3) scaling the attention scores by 1/sqrt(C)
        embed_dim = q.shape[-1]
        scores = q @ k_transposed * (embed_dim ** -0.5)  # (B, T, T)

        ### 4) Masking
        # Position i must not see positions j > i, so only the lower triangle is kept.
        # Masked entries are set to -inf so that they become 0 after the softmax.
        print('---')
        print('scores at B=0, before masking: \n', scores[0,:,:],'\n ---')
        seq_size = q.shape[-2]
        # Build the mask on the same device as the scores (a CPU mask would fail for
        # CUDA inputs) and keep it local instead of storing it on the module.
        causal_mask = torch.tril(
            torch.ones(seq_size, seq_size, dtype=torch.bool, device=scores.device)
        )
        scores = scores.masked_fill(~causal_mask, float('-inf')) # (B, T, T)
        print('scores at B=0, after masking: \n', scores[0,:,:],'\n ---')

        ### 5) applying the softmax
        # Each query position (row) is normalized independently, so the softmax is
        # taken along the last dimension.
        scores = F.softmax(scores, dim=-1) # (B, T, T)
        print('scores at B=0, after softmax: \n', scores[0,:,:],'\n ---')

        ### 6) weighting the values: (B, T, T) @ (B, T, C) -> (B, T, C)
        return scores @ v

# Run the demo attention layer on the embedded sample batch
SA = SelfAttention(embed_dim=embed_dim)
scores = SA(org_embd_context)

q.shape,k.shape= torch.Size([4, 8, 65]) torch.Size([4, 8, 65])
k_transposed.shape= torch.Size([4, 65, 8])
---
scores at B=0, before masking: 
 tensor([[-4.5996e-01,  6.8997e-01, -5.2011e-01, -4.3751e-01, -4.5996e-01,
         -4.3751e-01, -3.0724e-01,  8.6585e-02],
        [-3.7844e-01,  2.4135e-02,  1.4577e-01,  5.0176e-02, -3.7844e-01,
          5.0176e-02, -2.1089e-01, -1.1838e-01],
        [ 2.8905e-01, -1.9408e-01, -2.9102e-01,  5.0614e-01,  2.8905e-01,
          5.0614e-01,  9.8043e-01,  6.7601e-02],
        [ 2.8141e-01,  1.6873e-01, -2.9702e-01, -2.4251e-01,  2.8141e-01,
         -2.4251e-01, -6.0282e-01, -9.0550e-05],
        [-4.5996e-01,  6.8997e-01, -5.2011e-01, -4.3751e-01, -4.5996e-01,
         -4.3751e-01, -3.0724e-01,  8.6585e-02],
        [ 2.8141e-01,  1.6873e-01, -2.9702e-01, -2.4251e-01,  2.8141e-01,
         -2.4251e-01, -6.0282e-01, -9.0550e-05],
        [ 1.0262e-01, -6.1391e-02,  2.1577e-01,  1.2711e-01,  1.0262e-01,
          1.2711e-01, -3.7851e-02, -4.4804e-0

### 5. Multi-head attention
<div align="center">
<img src="assets/mha.png" width="250" alt="Multi-head Attention">
</div>

1- Specify the number of heads `n_heads` \
2- Create a SingleHeadAttention where we split the Q, K, V matrices, along embed_dim, into `n_heads` \
3- Concatenate the outputs of the heads \
    > 3a- First create a list of the heads in `__init__`, using `nn.ModuleList` \
    > 3b- Concatenate them in the `forward` method \
4- Add an additional Linear layer on the top of it (called projection) \
5- Create a dropout layer

Using `nn.ModuleList` (instead of a plain Python list) allows PyTorch to register the heads as sub-modules and track their parameters.

In [14]:
# 1) Specify the number of heads
n_heads = 5
# Width of each head, derived from n_heads (not a hard-coded 5) so the two stay in sync
head_size = embed_dim // n_heads

dropout = 0. # The dropout fraction!

# 2) Single head attention
class SingleHeadAttention(nn.Module):
    """One causal self-attention head that projects `embed_dim` down to `head_size`."""

    def __init__(self,embed_dim,head_size):
        """
        Args:
            embed_dim: size C of the incoming embeddings.
            head_size: size C' of this head's query/key/value projections.
        """
        super().__init__()
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.key   = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)

    def forward(self,context):
        """Apply causal attention.

        Args:
            context: embedded input of shape [B, T, C].

        Returns:
            Tensor of shape [B, T, C']; row t depends only on positions 0..t.
        """
        q = self.query(context) # dimension: [B, T, C']
        k = self.key(context)   # dimension: [B, T, C']
        v = self.value(context) # dimension: [B, T, C']

        seq_size = q.shape[-2]
        head_dim = q.shape[-1]  # C' (the head width, not the model's embed_dim)

        # Scaled dot-product scores: [B, T, C'] @ [B, C', T] -> [B, T, T]
        scores = q @ k.transpose(-2, -1) * (head_dim ** -0.5)

        # Lower-triangular causal mask, created directly on the scores' device
        # (no CPU allocation + copy per call) and kept local to this call.
        causal_mask = torch.tril(
            torch.ones(seq_size, seq_size, dtype=torch.bool, device=scores.device)
        )
        # Future positions get -inf so that their softmax weight is exactly 0
        scores = scores.masked_fill(~causal_mask, float('-inf'))

        weights = F.softmax(scores, dim=-1) # [B, T, T]
        return weights @ v # [B, T, T] x [B, T, C'] = [B, T, C']

class MultiHeadAttention(nn.Module):
    """Runs `n_heads` attention heads, concatenates them and projects back to `embed_dim`.

    The dropout probability is read from the notebook-level global `dropout` when
    the module is constructed.
    """

    def __init__(self,embed_dim,n_heads):
        """
        Args:
            embed_dim: size C of the embeddings; must be divisible by `n_heads`.
            n_heads: number of attention heads (positive).

        Raises:
            ValueError: if `n_heads` is not positive or does not divide `embed_dim`.
        """
        super().__init__()

        # The concatenated heads only add up to embed_dim (the input size of `proj`)
        # when embed_dim is a multiple of n_heads. Fail here with a clear message
        # instead of a shape mismatch inside forward().
        if n_heads <= 0 or embed_dim % n_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive n_heads ({n_heads})"
            )

        # 3a) Creating the heads as an nn.ModuleList (not a plain list) so that
        # their parameters are registered with the model
        head_size = embed_dim//n_heads
        self.heads = nn.ModuleList([SingleHeadAttention(embed_dim,head_size)
                                for _ in range(n_heads)])

        # 4) Additional linear layer (called projection): [B, T, C] -> [B, T, C]
        self.proj  = nn.Linear(embed_dim,embed_dim)

        # 5) Dropout applied to the projected output
        self.dropout = nn.Dropout(dropout)

    def forward(self,context):
        """Map embedded context [B, T, C] to attention output [B, T, C]."""
        # 3b) Concatenating the head outputs along the channel dimension
        heads = torch.cat([head(context) for head in self.heads], dim=-1)

        # 4) projection, then 5) dropout
        return self.dropout(self.proj(heads)) # [B, T, C]
      
# Smoke test: run multi-head attention on the embedded sample batch
MHA = MultiHeadAttention(embed_dim=embed_dim, n_heads=n_heads)
scores = MHA(org_embd_context)

### 6. FeedForward Layer
A simple 2-layer network to add non-linearity and complexity to the system

1- Increasing the embedding dimension by 4x \
2- Applying ReLU \
3- Decreasing the embedding dimension by 4x \
4- Adding a Dropout

In [15]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the width, ReLU, shrink back, then dropout.

    The dropout probability is read from the notebook-level global `dropout` when
    the module is constructed.
    """

    def __init__(self, embed_dim):
        super().__init__()
        hidden_dim = 4 * embed_dim

        # Input and output are both [B, T, C]; only the hidden width differs
        expand = nn.Linear(embed_dim, hidden_dim)    # 1) [B, T, C]  -> [B, T, 4C]
        activation = nn.ReLU()                       # 2) non-linearity
        contract = nn.Linear(hidden_dim, embed_dim)  # 3) [B, T, 4C] -> [B, T, C]
        regularize = nn.Dropout(dropout)             # 4) dropout on the output
        self.network = nn.Sequential(expand, activation, contract, regularize)

    def forward(self, scores):
        """Apply the MLP to `scores`; the output has the same shape as the input."""
        transformed = self.network(scores)
        return transformed

### 7. Putting together the attention block!

We first need to understand:
**What is the Normalizing layer?**

It normalizes the output of the previous layer (to have a mean of 0 and a standard deviation of 1) before going to the next layer.
- Stabilizes the learning process
- Prevents the exploding/vanishing gradients

Although we used Softmax in the self-attention, the Value matrix values may be too large, so we need to normalize the output of the self-attention -> LayerNorm.

**Steps to put together the attention block:**

1- Instantiating the MHA \
2- Creating the layer normalization LN1, which normalizes the block input before it enters the MHA \
3- Instantiating the FF \
4- Creating the layer normalization LN2, which normalizes the result of step 5 before it enters the FF \
5- Connecting LN1 and the MHA: the un-normalized block input is kept as the residual and added to the MHA output \
6- Connecting LN2 and the FF: the result of step 5 is kept as the residual and added to the FF output

In [16]:
class AttentionBlock(nn.Module):
    """One transformer block: multi-head attention and feed-forward, each applied to
    a layer-normalized input and added back onto the un-normalized input (residual)."""

    def __init__(self, embed_dim, n_heads):
        super().__init__()

        # Attention sub-layer: [B, T, C] -> [B, T, C]
        self.mha = MultiHeadAttention(embed_dim, n_heads)
        # Normalization applied to the input of the attention sub-layer
        self.ln1 = nn.LayerNorm(embed_dim)

        # Feed-forward sub-layer
        self.ff = FeedForward(embed_dim)
        # Normalization applied to the input of the feed-forward sub-layer
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Run both residual sub-layers on `x` ([B, T, C]); the shape is unchanged."""
        # Residual connection around (LayerNorm 1 -> MHA)
        attended = self.mha(self.ln1(x))
        x = x + attended

        # Residual connection around (LayerNorm 2 -> FeedForward)
        transformed = self.ff(self.ln2(x))
        return x + transformed


### 8. Finalizing the Model

1- Instantiating the Token Embedding \
2- Also, Instantiating a similar Embedding for Positional Embedding (instead of sin encoding) \
3- Instantiating the attention blocks n_layer times \
4- Adding a Layer Normalization at the end \
5- Adding a Linear Layer at the end to go from embed_dim to vocab_size \
6- Applying the softmax
* Note: In training we don't need to apply the softmax directly, because CrossEntropy does this for us!


In [22]:
# --- Data / batching -------------------------------------------------------
batch_size = seq_size = 16   # sequences per batch; characters per sequence

# --- Model -----------------------------------------------------------------
embed_dim = 128              # width C of the embeddings
n_heads = n_layers = 4       # attention heads per block; number of attention blocks
dropout = 0.0                # dropout fraction read by the layers when they are built

# --- Training --------------------------------------------------------------
learning_rate = 5e-4
n_epoch = 2000               # iterations of the training loop
print_interval = 100         # report the losses every this many iterations
eval_iters = 200             # not read anywhere in the visible cells; presumably for estimate_loss

class Transformer(nn.Module):
    """Decoder-only, character-level transformer language model.

    At construction time the notebook-level globals `vocab_size`, `seq_size`
    (maximum context length) and `n_layers` are read. After construction the model
    relies only on its own layers: the device, the maximum context length and the
    vocabulary size are taken from the module itself rather than from globals.
    """

    def __init__(self,embed_dim,n_heads):
        super().__init__()

        # 1) Token embedding: [B, T] -> [B, T, C]
        self.TokenEmbedding = nn.Embedding(vocab_size, embed_dim)

        # 2) Learned positional embedding, one vector per position: [T] -> [T, C]
        self.PositionalEmbedding = nn.Embedding(seq_size, embed_dim)

        # 3) n_layers attention blocks applied one after another
        self.blocks = nn.Sequential(
            *[AttentionBlock(embed_dim, n_heads) for _ in range(n_layers)]
        )

        # 4) Last layer normalization
        self.lnn = nn.LayerNorm(embed_dim)

        # 5) Last linear layer: [B, T, C] -> [B, T, vocab_size]
        self.linearn = nn.Linear(embed_dim,vocab_size)

    def forward(self,context, targets=None):
        """Compute next-token logits and, if `targets` is given, the loss.

        Args:
            context: token ids of shape [B, T]; T must not exceed the positional
                table size fixed at construction.
            targets: optional token ids of shape [B, T].

        Returns:
            `(y, loss)`. Without targets, `y` has shape [B, T, vocab_size] and
            `loss` is None. With targets, `y` is flattened to [B*T, vocab_size] and
            `loss` is the cross-entropy against the flattened targets.

        Raises:
            ValueError: if T is larger than the positional embedding table.
        """
        # Use the device the model lives on instead of a notebook global, so the
        # model works no matter which cell defines `device` (or whether one does).
        model_device = self.TokenEmbedding.weight.device
        context = context.to(model_device)
        if targets is not None:
            targets = targets.to(model_device)

        B = context.shape[0]
        T = context.shape[1]
        max_seq_size = self.PositionalEmbedding.num_embeddings
        if T > max_seq_size:
            raise ValueError(
                f"sequence length {T} exceeds the model's maximum context of {max_seq_size}"
            )

        # 1) Token embedding
        tm = self.TokenEmbedding(context) # [B, T, C]

        # 2) Positional embedding of the positions 0 .. T-1
        positions = torch.arange(T, dtype=torch.int64, device=model_device)
        pm = self.PositionalEmbedding(positions) # [T, C]

        # Adding the two embeddings ([T, C] broadcasts over the batch)
        x = tm + pm

        # 3) Attention blocks
        x = self.blocks(x)

        # 4 and 5) Final layer norm and projection to the vocabulary
        x = self.lnn(x)
        y = self.linearn(x) # [B, T, vocab_size]

        # With targets it is training, otherwise it is inference
        if targets is None:
            loss = None
        else:
            # cross_entropy expects [N, classes] logits and [N] targets
            y       = y.reshape(B*T, y.shape[-1])
            targets = targets.reshape(B*T)
            loss    = F.cross_entropy(y, targets)

        return y, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building autograd graphs
    def generation(self, context, max_tokens):
        """Autoregressively sample `max_tokens` new tokens.

        Args:
            context: starting token ids of shape [B, T].
            max_tokens: number of tokens to append.

        Returns:
            Token ids of shape [B, T + max_tokens], on the model's device.
        """
        # Crop to the size of the positional table the model was built with; the
        # global `seq_size` may have been changed by a later cell.
        max_seq_size = self.PositionalEmbedding.num_embeddings
        # Keep the running sequence on the model's device so that it can be
        # concatenated with the sampled tokens.
        context = context.to(self.TokenEmbedding.weight.device)

        for _ in range(max_tokens):
            # make sure the context fits in the sequence length
            context_crop = context[:, -max_seq_size:]

            # get the predictions
            y, _ = self(context_crop)

            # focus only on the last position
            y = y[:, -1, :] # (B, vocab_size)

            # apply softmax to get probabilities
            probs = F.softmax(y, dim=-1) # (B, vocab_size)

            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append the sample to the running sequence
            context = torch.cat((context, next_token), dim=1) # (B, T+1)
        return context

### 9. Training

In [None]:
## HOMEWORK: Implement the estimate_loss function


In [None]:
# Training cell (homework skeleton): builds the model and optimizer and lays out the
# training loop. The steps inside the loop are placeholders for the reader to fill in.
# NOTE(review): as written, the loop raises a NameError on `losses` at the first
# iteration, because nothing in this cell assigns it until the homework is completed.

# Instantiating the Transformer module
model = Transformer(embed_dim,n_heads)

# Pick the GPU when one is available. `device` is a notebook-level global that
# later cells read as well.
if torch.cuda.is_available():
    device = 'cuda'
else: 
    device ='cpu'

model = model.to(device)

## HOMEWORK: Print the number of parameters in the model

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)

# Training Loop
start=time.time()
for epoch in range(n_epoch):
    
    # get a batch of data
    

    # pass through the model (context,target)
    

    # Report on every print_interval-th iteration and on the last one
    if epoch % print_interval == 0 or epoch == n_epoch - 1:
        ## HOMEWORK: Calculate the training and validation loss using the estimate_loss function
        # `losses` is indexed with the keys 'train' and 'val' below; it must be assigned
        # before this print (presumably from estimate_loss - to be implemented)
        print(f"step {epoch}: train loss {losses['train']}, val loss {losses['val']}")

    # calculate the gradients
    

    # backward propagations
    

    # Optimizer step
    

print(f'Training took {time.time()-start} seconds')

In [25]:
# Sample 1000 new tokens from the model, starting from a single token with id 0,
# and time how long the sampling takes
start = time.time()
context = torch.zeros(1, 1, dtype=torch.long, device=device)
response = model.generation(context=context, max_tokens=1000)[0].tolist()
print(f'Inference took {time.time()-start} seconds')

# Decode the sampled token ids back into text
print('---', decode(response), sep='\n')

Inference took 7.255411148071289 seconds
---

And Bust thepwell, me.

MONGERIO:
O, your consaid,
Was on redge on'sble your to:
Twas and mustring, to cosery thee; I called
To Bothan the hramelifter's stage? fill fend think's than. Will in the .

GlOLISABETHERMIO:
To is bid not!

SeCame, good throped sumis shivil, that inray gralacemer o freardis
God heartly brease-pook rit's dees med,
Tell thy looded-teaven you her and not yone, geven the shall you, but son firmed?

HENRIVALRENANE:
But his tisted conders,
Noyle saad sestand mine I trong peds doed Comorld.

ESMOLI:
Frieven to call seeper awere somear
Will didses off ither heavarttent your go?

Lecrupled shon ceives
Evece, a meock of have comched that I slind the
Jal is peopord I'll wan take heaved?
Ahd mancloony name to hive scoces'd bown:
God supos. KING'HALD EDICHANG:
O, metto I have ear our or how aftents in beight.

THAR HIONGABALLA:
Woulk, marry the
I gaintle hmeads becen ful my py fanlesse! I to we thee did;
Undrencentusemough meec