# Phase 4, Lesson 2: Training Your GPT

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/suraaj3poudel/Learn-To-Make-GPT-Model/blob/main/phase4_build_your_gpt/02_training_your_gpt.ipynb)

Train your GPT model on real text! 📚

## What You'll Learn

1. Data preparation and tokenization
2. Training loop with AdamW optimizer
3. Gradient accumulation and mixed precision
4. Saving and loading checkpoints
5. Evaluating perplexity

Let's train it!

In [None]:
# Setupimport torchimport torch.nn as nnimport torch.nn.functional as Ffrom torch.utils.data import Dataset, DataLoaderimport numpy as npimport matplotlib.pyplot as plt# Import GPT from previous lesson (you'd normally save it in a separate file)print('✅ Ready to train GPT!')

## 1. Prepare Training Data

We'll use a simple text dataset. In practice, you'd use much more data!

In [None]:
# Sample training data (in practice, use much more!)text_data = """Artificial intelligence and machine learning are transforming technology.Neural networks learn patterns from data through training.Deep learning uses multiple layers to extract features.Transformers revolutionized natural language processing.GPT models generate coherent and contextual text.Attention mechanisms allow models to focus on relevant information.Language models predict the next word in a sequence.Training requires large datasets and computational power.Embeddings represent words as dense vectors.Self-attention computes relationships between all tokens.Fine-tuning adapts pre-trained models to specific tasks.Modern AI systems can understand and generate human language.Text generation creates meaningful and fluent output.Machine translation converts text between languages.Question answering systems extract information from text.""" * 20  # Repeat for more dataprint(f"Training text length: {len(text_data)} characters")print(f"Sample: {text_data[:200]}...")

## 2. Tokenization

Convert text to tokens. We'll use simple character-level tokenization.

In [None]:
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in a text.

    Attributes:
        vocab_size: Number of distinct characters in the vocabulary.
        char_to_idx: Mapping from character to integer id (ids follow sorted
            character order).
        idx_to_char: Inverse mapping from integer id to character.
    """

    def __init__(self, text):
        """Build the vocabulary from every distinct character in ``text``."""
        # Sorting makes the id assignment deterministic for a given text.
        chars = sorted(set(text))
        self.vocab_size = len(chars)

        # Create mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = dict(enumerate(chars))

    def encode(self, text):
        """Return the list of integer ids for the characters of ``text``.

        Raises:
            KeyError: If ``text`` contains a character that is not in the
                vocabulary.
        """
        try:
            return [self.char_to_idx[ch] for ch in text]
        except KeyError as err:
            # Same exception type as a raw dict lookup, but with a message
            # that says what actually went wrong.
            raise KeyError(
                f"character {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None

    def decode(self, indices):
        """Return the string spelled by an iterable of integer ids.

        Each id is passed through ``int()`` so 0-d tensors or NumPy integers
        can be decoded directly, not only Python ints.
        """
        return ''.join(self.idx_to_char[int(i)] for i in indices)


# Create tokenizer
tokenizer = CharTokenizer(text_data)
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Vocabulary: {''.join(list(tokenizer.char_to_idx.keys())[:50])}...")

# Encode data
encoded_data = torch.tensor(tokenizer.encode(text_data), dtype=torch.long)
print(f"\nEncoded data shape: {encoded_data.shape}")
print(f"First 50 tokens: {encoded_data[:50]}")

## 3. Create Dataset

A PyTorch `Dataset` for training.

In [None]:
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a 1-D sequence of token ids.

    Item ``i`` is the pair ``(data[i:i + block_size], data[i + 1:i + block_size + 1])``,
    i.e. an input window and the same window shifted one position ahead.
    """

    def __init__(self, data, block_size):
        """Store the token sequence and the window length.

        Args:
            data: Sliceable sequence of token ids (a 1-D tensor in this lesson).
            block_size: Number of tokens in each input window; must be >= 1.

        Raises:
            ValueError: If ``block_size`` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        # Clamp at zero: a negative value would make len() raise when the
        # data is shorter than one window.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair that starts at position ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n = len(self)
        if idx < 0:
            idx += n
        # Without this check an out-of-range index silently returns a
        # truncated sample, and plain iteration over the dataset never stops.
        if not 0 <= idx < n:
            raise IndexError(f"index {idx} out of range for dataset of size {n}")

        # One extra token so the target can be shifted by one position.
        chunk = self.data[idx:idx + self.block_size + 1]
        x = chunk[:-1]  # Input
        y = chunk[1:]   # Target (shifted by 1)
        return x, y


# Create dataset and dataloader
block_size = 64
batch_size = 32

dataset = TextDataset(encoded_data, block_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print(f"Dataset size: {len(dataset)}")
print(f"Batch size: {batch_size}")
print(f"Number of batches: {len(dataloader)}")

# Test batch
x, y = next(iter(dataloader))
print(f"\nBatch input shape: {x.shape}")
print(f"Batch target shape: {y.shape}")

## 4. Create Model

A small GPT model for our dataset.

In [None]:
# Import GPT class from Lesson 1
# (In practice, you'd import from a separate file)
from types import SimpleNamespace

# Simple config
config = SimpleNamespace(
    vocab_size=tokenizer.vocab_size,
    n_positions=block_size,
    n_embd=128,
    n_layer=4,
    n_head=4,
    dropout=0.1,
)


# We'll use the GPT class from Lesson 1
# For this demo, let's create a simplified version
class SimpleGPT(nn.Module):
    """Small decoder-style language model built from PyTorch encoder layers.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``nn.TransformerEncoderLayer`` blocks under a causal mask,
    layer-normalised and projected to vocabulary logits.
    """

    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout=0.1):
        """Create the embeddings, transformer blocks and output head.

        Args:
            vocab_size: Number of distinct token ids.
            n_embd: Embedding / model width.
            n_head: Attention heads per block.
            n_layer: Number of transformer blocks.
            block_size: Maximum sequence length (size of the position table).
            dropout: Dropout probability for embeddings and blocks.
        """
        super().__init__()
        self.block_size = block_size

        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)

        # Transformer blocks.
        # A ModuleList (not nn.Sequential) because each block must receive the
        # attention-mask keyword arguments, and nn.Sequential.forward only
        # accepts a single input. Parameter names stay ``blocks.<i>.*``.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4 * n_embd,
                dropout=dropout,
                activation='gelu',
                batch_first=True,
            )
            for _ in range(n_layer)
        ])

        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of token ids with shape ``(B, T)``.
            targets: Optional integer tensor with one target id per position
                of ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape ``(B, T, vocab_size)``
            and ``loss`` is the mean cross-entropy, or ``None`` when
            ``targets`` is not given.

        Raises:
            ValueError: If ``T`` exceeds ``block_size``.
        """
        _, T = idx.shape
        if T > self.block_size:
            # The position table only has block_size rows.
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )

        tok_emb = self.tok_emb(idx)
        pos_emb = self.pos_emb(torch.arange(T, device=idx.device))
        x = self.drop(tok_emb + pos_emb)

        # Causal mask: True above the diagonal marks future positions that a
        # token must not attend to.
        mask = torch.triu(torch.ones(T, T, device=idx.device), diagonal=1).bool()

        for block in self.blocks:
            x = block(x, src_mask=mask, is_causal=True)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss


# Create model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SimpleGPT(
    vocab_size=tokenizer.vocab_size,
    n_embd=config.n_embd,
    n_head=config.n_head,
    n_layer=config.n_layer,
    block_size=block_size,
    dropout=config.dropout
).to(device)

n_params = sum(p.numel() for p in model.parameters())
print(f"Model created with {n_params:,} parameters")
print(f"Device: {device}")

## 5. Training Loop

Train the model!

In [None]:
# Training configuration
learning_rate = 3e-4
num_epochs = 10

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop: one optimizer step per batch; the mean batch loss of each
# epoch is recorded in ``losses`` for plotting and checkpointing.
losses = []
model.train()

print("Training...\n")

for epoch in range(num_epochs):
    batch_losses = []

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass (the model returns the loss when targets are given)
        _, loss = model(inputs, targets)

        # Backward pass: clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    avg_loss = sum(batch_losses) / len(dataloader)
    losses.append(avg_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Plot training loss
plt.figure(figsize=(10, 5))
plt.plot(losses, marker='o')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

print("\n✅ Training complete!")

## 6. Text Generation

Generate text from the trained model!

In [None]:
@torch.no_grad()
def generate(model, tokenizer, prompt, max_new_tokens=100, temperature=1.0):
    """Autoregressively extend ``prompt`` with tokens sampled from ``model``.

    Args:
        model: Module with a ``block_size`` attribute whose call returns
            ``(logits, loss)`` for a ``(1, T)`` tensor of token ids.
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Non-empty text to continue.
        max_new_tokens: Number of tokens to append.
        temperature: Divisor applied to the logits before sampling. Values
            below 1 sharpen the distribution; ``0`` selects the most likely
            token at every step (greedy decoding).

    Returns:
        The prompt followed by the generated continuation, as a string.

    Raises:
        ValueError: If ``temperature`` is negative or the prompt encodes to
            no tokens.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")

    prompt_ids = tokenizer.encode(prompt)
    if not prompt_ids:
        # With no context there is no last position to read logits from.
        raise ValueError("prompt must contain at least one token")

    # Follow the model's own device instead of a module-level global, so a
    # model that lives on another device (e.g. one loaded on the CPU) works.
    model_device = next(model.parameters()).device

    # Remember the current mode so sampling in the middle of training does
    # not silently leave dropout disabled afterwards.
    was_training = model.training
    model.eval()
    try:
        # Encode prompt
        idx = torch.tensor([prompt_ids], dtype=torch.long, device=model_device)

        # Generate
        for _ in range(max_new_tokens):
            # Crop to block size (a no-op while the context is still shorter)
            idx_cond = idx[:, -model.block_size:]

            # Forward pass; only the last position predicts the next token
            logits, _ = model(idx_cond)
            logits = logits[:, -1, :]

            if temperature == 0:
                # Greedy decoding avoids dividing the logits by zero
                idx_next = logits.argmax(dim=-1, keepdim=True)
            else:
                # Sample
                probs = F.softmax(logits / temperature, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)

            # Append
            idx = torch.cat([idx, idx_next], dim=1)
    finally:
        model.train(was_training)

    # Decode
    return tokenizer.decode(idx[0].tolist())


# Generate text
prompts = [
    "Artificial intelligence",
    "Machine learning",
    "Neural networks",
]

print("\nGenerated text:\n")
print("=" * 60)

for prompt in prompts:
    generated = generate(model, tokenizer, prompt, max_new_tokens=150, temperature=0.8)
    print(f"Prompt: '{prompt}'")
    print(f"Generated:\n{generated}\n")
    print("-" * 60)

print("\n✅ Generation complete!")

## 7. Save and Load Model

Save your trained model!

In [None]:
# Save checkpoint
# The config is stored as a plain dict rather than a SimpleNamespace so the
# file contains only tensors and builtin containers. Newer PyTorch releases
# default torch.load to weights_only=True, which rejects arbitrary pickled
# objects.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'config': dict(vars(config)),
    'tokenizer_vocab': tokenizer.char_to_idx,
    'losses': losses,
}

torch.save(checkpoint, 'gpt_checkpoint.pt')
print("✅ Model saved to 'gpt_checkpoint.pt'")


# Load checkpoint
def load_model(checkpoint_path, map_location='cpu'):
    """Rebuild a ``SimpleGPT`` from a checkpoint written by the code above.

    Args:
        checkpoint_path: Path of the file produced by ``torch.save``.
        map_location: Passed to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint saved on a GPU can be opened on a CPU-only machine.

    Returns:
        ``(model, checkpoint)``: the model with its weights restored (not
        moved to any device and not switched to eval mode) and the loaded
        checkpoint dict, with ``checkpoint['config']`` exposed as a
        ``SimpleNamespace``.
    """
    from types import SimpleNamespace

    # NOTE: torch.load unpickles data; only open checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)

    # The config is saved as a dict; older checkpoints may hold a namespace.
    # Normalise to attribute access in both cases.
    cfg = checkpoint['config']
    if isinstance(cfg, dict):
        cfg = SimpleNamespace(**cfg)
        checkpoint['config'] = cfg

    # Recreate model
    model = SimpleGPT(
        vocab_size=len(checkpoint['tokenizer_vocab']),
        n_embd=cfg.n_embd,
        n_head=cfg.n_head,
        n_layer=cfg.n_layer,
        block_size=cfg.n_positions,
        dropout=cfg.dropout
    )

    model.load_state_dict(checkpoint['model_state_dict'])
    return model, checkpoint


print("\nTo load:")
print("model, checkpoint = load_model('gpt_checkpoint.pt')")

## Summary

### What We Did:

1. **Prepared data** - Text to tokens
2. **Created dataset** - PyTorch DataLoader
3. **Trained GPT** - Full training loop
4. **Generated text** - Autoregressive sampling
5. **Saved model** - Checkpointing

### Key Insights:

- More data = better results
- Larger models need more compute
- Temperature controls creativity
- Regular checkpointing is essential

### Next Steps:

👉 **Lesson 3**: Build a chat interface with Gradio!

### Improvements to Try:

- Train on larger text corpus
- Increase model size
- Add learning rate scheduling
- Use gradient accumulation
- Implement early stopping

You trained your own GPT! 🚀