<a href="https://colab.research.google.com/github/xprilion/minigpt/blob/main/MiniGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import requests
import os

In [None]:
# --- Hyperparameters ---
batch_size = 32        # How many independent sequences will we process in parallel?
                       # NOTE: If you run out of memory, try reducing this to 16.
block_size = 256       # What is the maximum context length for predictions? (Increased from 128)
max_iters = 5000      # How many training iterations? (Increased from 7000)
eval_interval = 500    # How often to evaluate the model's performance?
learning_rate = 3e-4   # A slightly lower learning rate is often better for transformers
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Use GPU if available
eval_iters = 200       # How many batches to use for evaluation?
n_embd = 256           # Embedding dimension (Increased from 128)
n_head = 8             # Number of self-attention heads (Increased from 4)
n_layer = 8            # Number of transformer blocks (Increased from 6)
dropout = 0.2          # Dropout rate.

In [None]:
# --- 1. The Dataset ---
# We will download the 'tiny_shakespeare' dataset manually from Karpathy's repo.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_path = "input.txt"

if not os.path.exists(file_path):
    print("Downloading Tiny Shakespeare dataset...")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(requests.get(url).text)
else:
    print("Tiny Shakespeare dataset already exists locally.")

# Read the dataset
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Dataset loaded. Length of dataset in characters: {len(text)}")


# Create the vocabulary from the entire text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create a mapping from characters to integers and vice-versa
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Create the training and validation data by splitting the raw text
n = int(0.9 * len(text))
train_text = text[:n]
val_text = text[n:]

# Encode the text splits into torch tensors
train_data = torch.tensor(encode(train_text), dtype=torch.long)
val_data = torch.tensor(encode(val_text), dtype=torch.long)


# --- Dataset Class (remains the same) ---
class TextDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        chunk = self.data[idx:idx + self.block_size + 1]
        x = chunk[:-1]
        y = chunk[1:]
        return x, y

def get_dataloader(data, batch_size, block_size):
    dataset = TextDataset(data, block_size)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)


Downloading Tiny Shakespeare dataset...
Dataset loaded. Length of dataset in characters: 1115394


In [None]:
# --- 2. The GPT Model Components (all remain the same) ---

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [5]:
# --- 3. Training the Model (remains the same) ---

print(f"Running on device: {device}")

model = GPTLanguageModel()
m = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

train_loader = get_dataloader(train_data, batch_size, block_size)
val_loader = get_dataloader(val_data, batch_size, block_size)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        loader = train_loader if split == 'train' else val_loader
        # Use a fresh iterator for evaluation
        data_iter = iter(loader)
        for k in range(eval_iters):
            try:
                X, Y = next(data_iter)
            except StopIteration:
                # This can happen if eval_iters is larger than the number of batches
                break
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses[losses != 0].mean() # Exclude zero losses if StopIteration occurred
    model.train()
    return out

print("Starting training...")
train_iter = iter(train_loader)
for iter_num in range(max_iters):
    if iter_num % eval_interval == 0 or iter_num == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    try:
        xb, yb = next(train_iter)
    except StopIteration:
        train_iter = iter(train_loader)
        xb, yb = next(train_iter)

    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Training finished.")


Running on device: cuda
Starting training...
step 0: train loss 4.3730, val loss 4.3749
step 500: train loss 2.2684, val loss 2.2935
step 1000: train loss 1.8958, val loss 1.9992
step 1500: train loss 1.6860, val loss 1.8364
step 2000: train loss 1.5654, val loss 1.7528
step 2500: train loss 1.4755, val loss 1.6894
step 3000: train loss 1.4207, val loss 1.6326
step 3500: train loss 1.3791, val loss 1.5961
step 4000: train loss 1.3398, val loss 1.5670
step 4500: train loss 1.3142, val loss 1.5479
step 4999: train loss 1.2872, val loss 1.5404
Training finished.


In [6]:

# --- 4. Generate from the Model ---
print("\n--- Generating Text ---")
start_string = "LADY CAPULET: \n Nurse, where's my daughter?"
context = torch.tensor(encode(start_string), dtype=torch.long, device=device).unsqueeze(0)
generated_sequence = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(f"Generated sequence starting with '{start_string}':")
print(generated_sequence)


--- Generating Text ---
Generated sequence starting with 'LADY CAPULET: 
 Nurse, where's my daughter?':
LADY CAPULET: 
 Nurse, where's my daughter? 
My love my birg: what I heard he baniness
All Maria Edwas close
The ext. I thristers of from king.

ROMEO:
Basholing I go my has head, I would to you the year
Friend, content. Your hurself state, doth look, and them
To being the spit and people; &thall greet thee, who
Feen the shabband, tell thee that no morter wastioner, and
tis whilst we dead? A' fall touch often o' the city's face
with her tanchital and thy king wors?

LEONTES:
No, good devill thee, sleep?

Second Murderer:
So long methouth
