In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [2]:
# Hyperparameters
batch_size = 16  # Number of independent sequences processed in parallel
block_size = 32  # Maximum context length for predictions
max_iters = 5000  # Maximum number of iterations
eval_interval = 100  # Evaluation interval
learning_rate = 1e-3  # Learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Device choice (GPU if available, else CPU)
eval_iters = 200  # Number of evaluation iterations
n_embd = 64  # Embedding dimension
n_head = 4  # Number of attention heads
n_layer = 4  # Number of transformer layers
dropout = 0.0  # Dropout rate
torch.manual_seed(1337)  # Random seed for reproducibility


<torch._C.Generator at 0x7907892c3d30>

In [4]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-10-24 03:32:05--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-10-24 03:32:05 (15.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:

# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [18]:
text_length = len(text)
print("Length of dataset in characters:", text_length)


Length of dataset in characters: 1115394


In [19]:
# Display the first 500 characters of the text
print(text[:500])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [21]:
# Data splitting for training and validation
encoded_data = torch.tensor(encode(text), dtype=torch.long)
split_ratio = 0.8  # 80% for training, 20% for validation
split_index = int(split_ratio * len(encoded_data))

train_data = encoded_data[:split_index]  # Training data
val_data = encoded_data[split_index:]    # Validation data


In [22]:
# Data loading function
def get_batch(split):
    # Generate a small batch of input data (x) and target data (y)
    data = train_data if split == 'train' else val_data
    random_indices = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in random_indices])
    y = torch.stack([data[i+1:i+block_size+1] for i in random_indices])
    x, y = x.to(device), y.to(device)
    return x, y


In [23]:
# Data loading function
def get_batch(split):
    # Generate a small batch of input data (x) and target data (y)
    data = train_data if split == 'train' else val_data
    random_indices = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in random_indices])
    y = torch.stack([data[i+1:i+block_size+1] for i in random_indices])
    x, y = x.to(device), y.to(device)
    return x, y


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    """One head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, C)
        q = self.query(x) # (B, T, C)
        # Compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """A simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """A super simple bigram language model."""

    def __init__(self):
        super().__init__()
        # Embedding layers
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # Final layers
        self.ln_f = nn.LayerNorm(n_embd)  # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B, T) tensors of integers
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, _ = self(idx_cond)
            # Focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


In [26]:
# Create a Bigram Language Model and move it to the specified device
model = BigramLanguageModel()
model = model.to(device)

# Print the number of parameters in the model
num_parameters = sum(p.numel() for p in model.parameters()) / 1e6
print(f"Number of parameters: {num_parameters:.2f} M")

# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # Evaluate the loss on the train and val sets at specified intervals
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Compute the loss and perform optimization
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


Number of parameters: 0.21 M
Step 0: Train loss 4.3561, Val loss 4.3605
Step 100: Train loss 2.6413, Val loss 2.6688
Step 200: Train loss 2.5104, Val loss 2.5264
Step 300: Train loss 2.4271, Val loss 2.4613
Step 400: Train loss 2.3627, Val loss 2.3889
Step 500: Train loss 2.3195, Val loss 2.3468
Step 600: Train loss 2.2574, Val loss 2.2943
Step 700: Train loss 2.2093, Val loss 2.2476
Step 800: Train loss 2.1757, Val loss 2.2188
Step 900: Train loss 2.1221, Val loss 2.1850
Step 1000: Train loss 2.0770, Val loss 2.1432
Step 1100: Train loss 2.0513, Val loss 2.1180
Step 1200: Train loss 2.0266, Val loss 2.0978
Step 1300: Train loss 1.9998, Val loss 2.0716
Step 1400: Train loss 1.9702, Val loss 2.0536
Step 1500: Train loss 1.9507, Val loss 2.0450
Step 1600: Train loss 1.9308, Val loss 2.0224
Step 1700: Train loss 1.9133, Val loss 2.0204
Step 1800: Train loss 1.8994, Val loss 2.0070
Step 1900: Train loss 1.8848, Val loss 1.9863
Step 2000: Train loss 1.8695, Val loss 1.9916
Step 2100: Train 

In [27]:
# Generate text from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_sequence = m.generate(context, max_new_tokens=2000)[0].tolist()
decoded_text = decode(generated_sequence)

# Print the generated text
print(decoded_text)



Untitiusue:
Tears that you for behon?

VOlsget:
And all ears my tiscous.

CORIOLANUM:
Well was plucks:.

MO
givant:
Cayer wilt groon! I andam;
Our batten Arman: I loy; what I wear wife?

COMINIUS:
Breyself my lough beriant
Of sgentermames this ats--lick, and, lord'ss,
to-days MinertatiAnd Secrion that
The woat the mare is than no as men tonterreip good or my thou us maste to this whose not ome:
Plhar tonge is
thyself;
I harged I canning and that fortune.
That now set heart!
The trunday, I lovess,
But To art Glood infatutor landemved:
were for in the dranted arm my mile;
Or tower thous has ungry, wife broyen, mistress,
How it adunterens that your gropate:
The niession the grace of love in that he for broth worldom crown:
To are'er notes, buts youlls!
Nune's bout good too notle, Warwient thyself
Briart--sit is to to meath, away for be a unnow this make.

MERCUTiLe:
Now what them crauce is would givest my art
To hath boy your had thy pirciefition
Some me? To begainst arm?

Nay, BIive, mi