# Phase 3, Lesson 2: Building Mini Transformer

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/suraaj3poudel/Learn-To-Make-GPT-Model/blob/main/phase3_mini_transformer/02_building_mini_transformer.ipynb)

Train your Transformer for text generation! 🚀

## What You'll Learn

1. Preparing text data for transformers
2. Training a transformer
3. Generating text with sampling
4. Evaluating and improving the model

Let's make it generate text!

In [None]:
# Setup
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

print('✅ Ready to train a Transformer!')

## 1. Prepare Training Data

We'll train on a simple text corpus so the model learns to generate similar text.

In [None]:
# Simple training corpus.
# One sentence per line; the tokenizer below ignores whitespace, so the line
# breaks only affect readability, not the resulting tokens.
corpus = """Machine learning is the study of algorithms that improve automatically through experience.
Deep learning is a subset of machine learning based on artificial neural networks.
Neural networks are computing systems inspired by biological neural networks.
Transformers are a type of neural network architecture.
Attention mechanisms allow models to focus on relevant parts of input.
Natural language processing uses machine learning for text and speech.
Language models predict the next word in a sequence.
GPT is a transformer-based language model.
Text generation creates human-like text output.
Training requires large datasets and computational resources."""

# Tokenize: lowercase, then keep runs of word characters and standalone periods.
# Hyphenated words ("transformer-based") split into their parts.
tokens = re.findall(r'\w+|\.', corpus.lower())
print(f"Total tokens: {len(tokens)}")
print(f"Sample tokens: {tokens[:20]}")

# Build vocabulary: sorted unique tokens get ids 0..N-1, '<PAD>' gets the last id.
vocab = {word: i for i, word in enumerate(sorted(set(tokens)))}
vocab['<PAD>'] = len(vocab)
vocab_size = len(vocab)
reverse_vocab = {i: word for word, i in vocab.items()}
print(f"\nVocabulary size: {vocab_size}")

# Encode corpus as a flat list of token ids.
encoded = [vocab[token] for token in tokens]
print(f"\nEncoded length: {len(encoded)}")
print(f"First 20 encoded: {encoded[:20]}")

## 2. Create Training Examples

For language modeling, we predict the next token at each position.

In [None]:
def create_training_data(encoded, seq_len):
    """
    Create (input, target) pairs for language modeling.

    Each input is a window of ``seq_len`` consecutive token ids; its target is
    the same window shifted one position to the right, so the target at each
    position is the token that follows the input at that position.

    Args:
        encoded: List of token IDs
        seq_len: Sequence length (must be at least 1)

    Returns:
        inputs, targets: Arrays of shape (num_windows, seq_len). When
        ``encoded`` is too short to hold a single window plus its shifted
        target, both arrays have shape (0, seq_len).

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    # A window starting at `start` needs seq_len + 1 tokens (input + shifted target).
    starts = range(len(encoded) - seq_len)
    inputs = [encoded[start:start + seq_len] for start in starts]
    targets = [encoded[start + 1:start + seq_len + 1] for start in starts]

    if not inputs:
        # Keep a 2-D integer shape so callers can still index/iterate uniformly.
        return np.empty((0, seq_len), dtype=int), np.empty((0, seq_len), dtype=int)

    return np.array(inputs), np.array(targets)


# Create training data
seq_len = 10
X_train, y_train = create_training_data(encoded, seq_len)

print(f"Training examples: {len(X_train)}")
print(f"\nExample:")
print(f"Input:  {X_train[0]}")
print(f"Target: {y_train[0]}")
print(f"\nDecoded:")
print(f"Input:  {' '.join([reverse_vocab[i] for i in X_train[0]])} ")
print(f"Target: {' '.join([reverse_vocab[i] for i in y_train[0]])}")

## 3. Build Simplified Transformer

We reuse the architecture from Lesson 1, simplified for training.

In [None]:
# Reuse classes from Lesson 1 (simplified versions)
import math


def positional_encoding(max_len, d_model):
    """
    Build the sinusoidal positional-encoding table.

    Even columns hold sines and odd columns hold cosines of
    ``pos / 10000 ** (i / d_model)``, where ``i`` is the even column index.

    Args:
        max_len: Number of positions (rows) in the table
        d_model: Embedding width (columns) in the table

    Returns:
        Array of shape (max_len, d_model).
    """
    pe = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            # `i` already steps over the even dimension indices (i == 2k), so
            # the exponent is i / d_model; multiplying by 2 again would count
            # the factor twice and shrink the frequencies far too quickly.
            angle = pos / (10000 ** (i / d_model))
            pe[pos, i] = math.sin(angle)
            if i + 1 < d_model:  # odd d_model has no cosine partner for the last column
                pe[pos, i + 1] = math.cos(angle)
    return pe


class SimpleTransformer:
    """Single-layer, single-head causal self-attention language model (NumPy only)."""

    def __init__(self, vocab_size, d_model, max_len):
        """
        Args:
            vocab_size: Number of distinct token ids
            d_model: Embedding / attention width
            max_len: Longest sequence the positional table supports
        """
        self.vocab_size = vocab_size
        self.d_model = d_model

        # Embedding
        self.embedding = np.random.randn(vocab_size, d_model) * 0.1
        self.pos_encoding = positional_encoding(max_len, d_model)

        # Single attention layer (simplified)
        self.W_q = np.random.randn(d_model, d_model) * 0.1
        self.W_k = np.random.randn(d_model, d_model) * 0.1
        self.W_v = np.random.randn(d_model, d_model) * 0.1

        # Output projection
        self.W_out = np.random.randn(d_model, vocab_size) * 0.1
        self.b_out = np.zeros(vocab_size)

    def attention(self, x):
        """
        Simplified causal self-attention.

        Args:
            x: Array of shape (seq_len, d_model)

        Returns:
            Array of shape (seq_len, d_model): for each position, the
            attention-weighted sum of the value vectors at that position and
            all earlier ones.
        """
        Q = np.dot(x, self.W_q)
        K = np.dot(x, self.W_k)
        V = np.dot(x, self.W_v)

        # Attention scores
        scores = np.dot(Q, K.T) / np.sqrt(self.d_model)

        # Causal mask (can't attend to future): keep the lower triangle,
        # push everything above the diagonal to a very negative score.
        seq_len = len(x)
        allowed = np.tril(np.ones((seq_len, seq_len), dtype=bool))
        scores = np.where(allowed, scores, -1e9)

        # Row-wise softmax (max subtracted for numerical stability)
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

        # Weighted sum
        return np.dot(weights, V)

    def forward(self, token_ids):
        """
        Forward pass.

        Args:
            token_ids: Sequence of token ids, no longer than the positional table

        Returns:
            Logits of shape (len(token_ids), vocab_size).

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        n_tokens = len(token_ids)
        n_positions = self.pos_encoding.shape[0]
        if n_tokens > n_positions:
            # Without this check the addition below fails with an opaque
            # broadcasting error.
            raise ValueError(
                f"sequence length {n_tokens} exceeds max_len {n_positions}"
            )

        # Embedding + positional encoding
        x = self.embedding[token_ids] + self.pos_encoding[:n_tokens]

        # Attention
        x = self.attention(x)

        # Output projection
        logits = np.dot(x, self.W_out) + self.b_out
        return logits

    def softmax(self, x):
        """Return the softmax of a 1-D logit vector (global max subtracted for stability)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()


# Create model
model = SimpleTransformer(vocab_size=vocab_size, d_model=32, max_len=20)

# Test
sample_input = X_train[0]
logits = model.forward(sample_input)
print(f"Input shape: {sample_input.shape}")
print(f"Output logits shape: {logits.shape}")
print("\n✅ Model created!")

## 4. Training Loop

Train the model to predict the next token!

In [None]:
def train_transformer(model, X_train, y_train, epochs=50, lr=0.01):
    """
    Simple training loop.

    Only the output projection (``model.W_out`` and ``model.b_out``) is
    updated; the embedding and attention weights stay at their initial values.

    Args:
        model: Model exposing ``embedding``, ``pos_encoding``, ``attention``,
            ``softmax``, ``W_out`` and ``b_out``
        X_train: Iterable of input token-id sequences
        y_train: Iterable of target token-id sequences, aligned with X_train
        epochs: Number of passes over the training data
        lr: Learning rate

    Returns:
        List with the average cross-entropy loss of each epoch.
    """
    losses = []

    print("Training...")
    for epoch in range(epochs):
        epoch_loss = 0

        for x, y in zip(X_train, y_train):
            # Forward pass. The steps of model.forward are repeated here
            # because the gradient of W_out needs the attention output
            # (hidden states), which forward() does not return.
            hidden = model.attention(model.embedding[x] + model.pos_encoding[:len(x)])
            logits = np.dot(hidden, model.W_out) + model.b_out

            # Compute loss (cross-entropy), averaged over positions
            probs = np.array([model.softmax(row) for row in logits])
            positions = np.arange(len(y))
            loss = -np.log(probs[positions, y] + 1e-10).mean()
            epoch_loss += loss

            # Backward pass (simplified gradient descent).
            # d(loss)/d(logits) = probs - one_hot(target)
            grad_logits = probs.copy()
            grad_logits[positions, y] -= 1

            # Update output weights. The gradient w.r.t. W_out is
            # hidden^T @ grad_logits with shape (d_model, vocab_size); using
            # the logits in place of the hidden states gives a
            # (vocab_size, vocab_size) matrix that cannot be applied to W_out.
            model.W_out -= lr * np.dot(hidden.T, grad_logits) / len(y)
            model.b_out -= lr * grad_logits.sum(axis=0) / len(y)

        avg_loss = epoch_loss / len(X_train)
        losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    return losses


# Train (limited epochs for demo)
losses = train_transformer(model, X_train[:50], y_train[:50], epochs=30, lr=0.001)

# Plot
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

print("\n✅ Training complete!")

## 5. Text Generation

Generate new text by sampling from the model!

In [None]:
def generate_text(model, start_tokens, max_new_tokens=20, temperature=1.0, context_len=10):
    """
    Generate text from the model.

    Args:
        model: Trained transformer
        start_tokens: Sequence of starting token IDs (must not be empty)
        max_new_tokens: How many tokens to generate
        temperature: Sampling temperature (higher = more random); must be > 0
        context_len: How many of the most recent tokens are fed to the model
            at each step (must be at least 1)

    Returns:
        List of token IDs: the starting tokens followed by the generated ones.

    Raises:
        ValueError: If ``start_tokens`` is empty, ``temperature`` is not
            positive, or ``context_len`` is smaller than 1.
    """
    if len(start_tokens) == 0:
        raise ValueError("start_tokens must contain at least one token")
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if context_len < 1:
        raise ValueError(f"context_len must be >= 1, got {context_len}")

    # Copy into a fresh list so the caller's sequence is never mutated and
    # non-list inputs (tuples, arrays) can still be appended to.
    generated = list(start_tokens)

    for _ in range(max_new_tokens):
        # Get predictions from the most recent context_len tokens
        logits = model.forward(np.array(generated[-context_len:]))

        # Get next token prediction (the last position predicts what comes next)
        next_logits = logits[-1] / temperature
        probs = model.softmax(next_logits)

        # Sample from distribution; store a plain int rather than a NumPy scalar
        next_token = int(np.random.choice(len(probs), p=probs))
        generated.append(next_token)

    return generated


# Generate text
start_text = "machine learning"
start_tokens = [vocab[w] for w in start_text.split() if w in vocab]

print("Generating text...\n")
print(f"Prompt: {start_text}")
print("-" * 50)

for temp in [0.5, 1.0, 1.5]:
    generated_tokens = generate_text(model, start_tokens, max_new_tokens=15, temperature=temp)
    generated_text = ' '.join([reverse_vocab[t] for t in generated_tokens])
    print(f"\nTemperature {temp}:")
    print(generated_text)

print("\n✅ Text generation working!")
print("(Quality will improve with more data and training)")

## Summary

### What We Built:

1. **Training data** from a text corpus
2. **Simplified Transformer** for generation
3. **Training loop** with a simplified backward pass
4. **Text generation** with sampling
5. **Temperature** control for creativity

### Key Insights:

- Language modeling = predicting the next token
- Causal masking prevents looking ahead
- Temperature controls randomness
- More data + training = better results

### Next Steps:

👉 **Phase 4**: Build a full GPT model with modern techniques!

You can now train transformers! 🚀