In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


### Hyper-parameters
```blk_size```  - implying how many characters will be taken into consideration for context to predict the next character. Once the character is predicted, then the block moves by 1 char, and the next 8 charcters (including the last predicted character) is taken into context and so on.  
```batch_size``` - number of blocks that will be stacked together.  
```chkpt``` - Value to check progress at particular number of iterations.  
```max_iter``` - number of times the training loop will run.  
```lr``` - learning rate  
```n_embed``` - length of embedded vector  
```n_EDlayers``` - number of encoder and decoders in the Transformer architecture  
```n_head``` - number of heads for multihead attention  
```dropout``` - percentage of nodes in NN dropping out, for regularization

In [2]:
blk_size = 64
batch_size = 128 # 32 blocks stacked together
chkpt = 100 # Checkpoint for every 100 iterations
max_iter = 1000   # maximum number of iterations
lr = 3e-4              # Learning rate
n_embed = 384     # dimension of embedding vector
n_EDlayers = 8 # number of encoders and decoders in the a
n_head = 8
dropout = 0.2

### Data Handling

In [3]:
char = ""
with open('Wizard_of_Oz.txt', 'r', encoding='utf-8') as f:
    txt = f.read()
    chars = sorted(list(set(txt)))
 
vocab_size = len(chars)

In [4]:
# character-level tokenization

str_to_int = {ch : i for i,ch in enumerate(chars)}
int_to_str = {i : ch for i,ch in enumerate(chars)}
enc = lambda x : [str_to_int[c] for c in x]  # encoding char to int
dec = lambda x : ''.join([int_to_str[i] for i in x])  # decoding int to char

# example
print(enc('wizard'))
print(dec(enc('wizard')))

[76, 62, 79, 54, 71, 57]
wizard


In [5]:
# encoding whole text into tensors

data = torch.tensor(enc(txt), dtype=torch.long)
print(data[:100])

tensor([ 1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,
         0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,
         0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36,
        25, 38, 28,  1, 39, 30,  1, 39, 50,  9])


#### Splitting data into train and validation

In [6]:
# splitting data into train and validation
n = int(0.8*len(data))
train_d = data[:n]
val_d = data[n:]

In [7]:
# Splitting the data in blocks
x = train_d[:blk_size]
y = train_d[1:blk_size+1]

# for i in range(blk_size):
#     context = x[:i+1]
#     target = y[i]
#     print('When input is', context, 'target is', target)

In [8]:
def get_batch(part):
    d = train_d if part == 'train' else val_d
    idx = torch.randint(len(d) - blk_size, (batch_size,))
    # print(idx)
    x = torch.stack([d[i:i+blk_size] for i in idx])
    y = torch.stack([d[i+1:i+1+blk_size] for i in idx])
    return x.to(device), y.to(device)

# x, y = get_batch('train')
# print('inputs:', x)
# print('targets:', y)

# Initializing Neural Net

In [9]:
class Head(nn.Module):
    ''' self attention head'''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(blk_size, blk_size))) # for masking to avoid lookahead
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        B, T, C = X.shape
        k = self.key(X) # (B, T, head_size)
        q = self.query(X) # # (B, T, head_size)
        # computing attention score
        wght = q @ k.transpose(-2,-1) * k.shape[-1]**0.5 # Flipping last two dim, and scaling by 1/sq.root
        wght = wght.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wght = F.softmax(wght, dim=-1)
        wght = self.dropout(wght)

        # perform weihted augmentation on values
        v = self.value(X)
        return wght @ v

In [10]:
class MultiHeadAttention(nn.Module):

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(head_size * n_heads, n_embed) 
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        out = torch.cat([h(X) for h in self.heads], dim=-1) # dim -> (B, T, 4*features)
        out = self.dropout(self.proj(out))
        return out

In [11]:
class FeedForward(nn.Module):

    def __init__(self, n_embed):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed),
            nn.ReLU(),
            nn.Linear(4*n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, X):
        return self.network(X)

In [12]:
class Block(nn.Module):

    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head   # how many features does each head capture
        self.ma = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        y = self.ma(x)
        x = self.ln1(x + y)   # Adding and normalizing
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x    

In [13]:
class GPTLangModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, n_embed)   # token embedding lookup table
        self.pos_embed_table = nn.Embedding(blk_size, n_embed)   # positional embedding table
        self.layers = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_EDlayers)])
        self.l_fin = nn.LayerNorm(n_embed)       # Final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)    # Final linear layer after encoders-decoders
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.2)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.2)
    
    def forward(self, idx, targets=None):
        batch, time = idx.shape
        
        # idx and targets are tensors of shape (batch*time)
        tok_emb = self.token_embed(idx)
        pos_emb = self.pos_embed_table(torch.arange(time, device=device))
        x = tok_emb + pos_emb
        x = self.layers(x)
        x = self.l_fin(x) 
        logits = self.lm_head(x)
        
        
        if targets is None:
            loss = None
        else:
            batch, time, channels = logits.shape       # time is a sequential dimension, channels = vocab size (or) number of classes 
            logits = logits.view(batch*time, channels)   # reshaping logits(B,T,C) dimensions to (B*T, C)
            targets = targets.view(batch*time)    
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    
    def generate(self, idx, maxNewTokens):

        for _ in range(maxNewTokens):
            logits, loss = self.forward(idx)
            logits = logits[:, -1, :]    # focusing on last timestep as it is a bigram model (single prev char)
            probs = F.softmax(logits, dim = -1)   # get the probabilities of last dimension
            idxNxt = torch.multinomial(probs, num_samples=1) # sample from those probabilities
            idx = torch.cat((idx, idxNxt), dim=1)
        
        return idx
    
        


model = GPTLangModel(vocab_size)
m = model.to(device)

# context = torch.zeros((1,1), dtype=torch.long, device=device) # initial context is 0
# gen_chars = dec(m.generate(context, maxNewTokens=500)[0].tolist()) # generate first 500 tokens
# print(gen_chars)

In [14]:
# loss function
@torch.no_grad()

def calc_loss():
    out = {}
    
    model.eval()
    
    for i in ['train', 'val']:
        losses = torch.zeros(chkpt)
        
        for j in range(chkpt):
            X, y = get_batch(i)
            logits, loss = model(X, y)
            losses[j] = loss.item()
        
        out[i] = losses.mean()    
    
    model.train()
    
    return out

In [15]:
# Training loop

optim = torch.optim.AdamW(model.parameters(), lr=lr) # defining optimizer

for i in range(max_iter):
    if i % chkpt == 0:
        losses = calc_loss()
        print(f"Epoch: {i} - Train loss: {losses['train']:.4f}, Validation loss: {losses['val']:.4f}")
    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

Epoch: 0 - Train loss: 9.1631, Validation loss: 9.2070
Epoch: 100 - Train loss: 3.2412, Validation loss: 3.2519
Epoch: 200 - Train loss: 3.2101, Validation loss: 3.2251
Epoch: 300 - Train loss: 3.1853, Validation loss: 3.1934
Epoch: 400 - Train loss: 3.1689, Validation loss: 3.1823
Epoch: 500 - Train loss: 3.1569, Validation loss: 3.1670
Epoch: 600 - Train loss: 3.1535, Validation loss: 3.1602
Epoch: 700 - Train loss: 3.1468, Validation loss: 3.1585
Epoch: 800 - Train loss: 3.1472, Validation loss: 3.1556
Epoch: 900 - Train loss: 3.1414, Validation loss: 3.1547
3.160789966583252


In [50]:
m = model.to(device)
context = torch.zeros((1,1), dtype=torch.long, device=device) # initial context is 0
gen_chars = dec(m.generate(context, maxNewTokens=500)[0].tolist()) # generate first 500 tokens
print(gen_chars)

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], t

RuntimeError: The size of tensor a (128) must match the size of tensor b (129) at non-singleton dimension 2

,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [18,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [19,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [20,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [21,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [22,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [23,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexin