# Phase 2, Lesson 2: Sentiment Analysis with Embeddings

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/suraaj3poudel/Learn-To-Make-GPT-Model/blob/main/phase2_text_and_embeddings/02_sentiment_analysis.ipynb)

Build your first NLP classifier! 🎭

## What You'll Learn

1. Sentiment analysis task
2. Using embeddings for classification
3. Building a neural network text classifier
4. Evaluating your model

Let's build something useful!

In [None]:
# Setupimport numpy as npimport matplotlib.pyplot as pltfrom collections import Counterimport reprint('✅ Ready to build a sentiment analyzer!')

## 1. The Task: Sentiment Analysis

**Goal**: Classify text as POSITIVE or NEGATIVE.

Examples:

- "I love this movie!" → POSITIVE ✅
- "This is terrible" → NEGATIVE ❌
- "Best day ever!" → POSITIVE ✅

This is a **classification** problem!

In [None]:
# Toy sentiment dataset: each entry is a (review text, label) pair, where the
# label is the string "positive" or "negative". The set is balanced (7 of each)
# and deliberately tiny so training runs instantly.
reviews = [
    ("I love this product", "positive"),
    ("This is amazing", "positive"),
    ("Best purchase ever", "positive"),
    ("Absolutely wonderful", "positive"),
    ("Great quality", "positive"),
    ("Terrible experience", "negative"),
    ("Waste of money", "negative"),
    ("Very disappointed", "negative"),
    ("Poor quality", "negative"),
    ("Do not buy this", "negative"),
    ("Pretty good", "positive"),
    ("Not bad", "positive"),
    ("Could be better", "negative"),
    ("Not worth it", "negative"),
]

# Report the dataset size, then preview the first five entries.
print(f"Dataset size: {len(reviews)} reviews", "\nSample reviews:", sep="\n")
for review, sentiment in reviews[:5]:
    print(f"  '{review}' → {sentiment}")

## 2. Prepare Data: Tokenization & Vocabulary

Same as before: build a vocabulary from the reviews, then encode each review as a list of word indices.

In [None]:
# One compiled pattern shared by vocabulary building and encoding, so both
# always split text into words in exactly the same way.
_WORD_RE = re.compile(r'\w+')

# Build vocabulary from all reviews (lower-cased, punctuation dropped)
all_words = []
for review, _ in reviews:
    all_words.extend(_WORD_RE.findall(review.lower()))

# Create vocabulary: sorted unique words -> consecutive integer ids
vocab = {word: i for i, word in enumerate(sorted(set(all_words)))}

# Add special tokens
vocab['<UNK>'] = len(vocab)  # Unknown words

# Measure the size only after <UNK> has been added, so vocab_size always
# equals len(vocab) and anything sized by it has a row for the <UNK> id.
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")
print(f"\nVocabulary: {list(vocab.keys())[:15]}...")


# Encode reviews
def encode_review(review, vocab):
    """Convert a review string into a list of vocabulary indices.

    The text is lower-cased and split into runs of word characters, the
    same way the vocabulary was built.

    Args:
        review: Raw review text.
        vocab: Mapping from word to integer id; must contain '<UNK>'.

    Returns:
        A list with one id per word, in order. Words missing from ``vocab``
        map to the '<UNK>' id. Text with no word characters yields ``[]``.
    """
    return [
        vocab.get(word, vocab['<UNK>'])
        for word in _WORD_RE.findall(review.lower())
    ]


# Test encoding
test_review = "I love this product"
encoded = encode_review(test_review, vocab)
print(f"\nEncoded '{test_review}': {encoded}")

## 3. Create Embeddings

Each word gets its own vector representation. The vectors start out random and are learned during training.

In [None]:
# Demo embedding table: one small random vector per vocabulary entry.
embedding_dim = 10  # kept small because the dataset is tiny
np.random.seed(42)  # fixed seed so the notebook output is reproducible

# Row i of the matrix is the vector for the word whose vocabulary id is i.
embeddings = 0.1 * np.random.randn(len(vocab), embedding_dim)

print(
    f"Embedding matrix shape: {embeddings.shape}",
    f"Each word is represented by a {embedding_dim}-dimensional vector",
    sep="\n",
)

# Example: look up the vector of a single word via its vocabulary id.
word = 'love'
if word in vocab:
    word_idx = vocab[word]
    word_vector = embeddings[word_idx]
    print(f"\n'{word}' embedding: {word_vector}")

## 4. Build the Classifier

Simple architecture:

1. **Embedding layer**: Words → Vectors
2. **Averaging**: Average all word vectors
3. **Dense layer**: Make prediction

In [None]:
class SentimentClassifier:
    """Bag-of-embeddings binary sentiment classifier.

    A review is represented by the mean of its word embeddings; a single
    logistic unit (weights ``W``, bias ``b``) maps that mean vector to the
    probability that the review is positive.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Randomly initialise the embedding table and the logistic unit.

        Args:
            vocab_size: Number of rows in the embedding table (one per
                vocabulary id).
            embedding_dim: Length of each word vector.
        """
        # Embedding layer
        self.embeddings = np.random.randn(vocab_size, embedding_dim) * 0.1

        # Classification layer
        self.W = np.random.randn(embedding_dim, 1) * 0.1
        self.b = np.zeros(1)

    def sigmoid(self, x):
        """Logistic function; the input is clipped so ``np.exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, word_indices):
        """Compute P(positive) for one encoded review.

        Args:
            word_indices: Sequence of vocabulary ids for the review.

        Returns:
            A ``(prob, avg_vec)`` tuple: the positive-class probability and
            the averaged embedding that produced it (needed for training).
        """
        if np.size(word_indices) == 0:
            # No words at all (e.g. "" or "!!!"): the mean of an empty slice
            # would be NaN, so fall back to a zero vector. The prediction is
            # then driven by the bias alone.
            avg_vec = np.zeros(self.embeddings.shape[1])
        else:
            # Get embeddings for all words
            word_vecs = self.embeddings[word_indices]

            # Average word vectors (simple but effective!)
            avg_vec = np.mean(word_vecs, axis=0)

        # Classification
        logit = np.dot(avg_vec, self.W) + self.b
        prob = self.sigmoid(logit)

        return prob[0], avg_vec

    def predict(self, word_indices):
        """Return "positive" if P(positive) > 0.5, otherwise "negative"."""
        prob, _ = self.forward(word_indices)
        return "positive" if prob > 0.5 else "negative"


# Create model
model = SentimentClassifier(len(vocab), embedding_dim)

# Test prediction (before training)
test_indices = encode_review("I love this", vocab)
prediction = model.predict(test_indices)
print(f"Prediction (before training): {prediction}")
print("(Random guess - we haven't trained yet!)")

## 5. Training the Model

Use gradient descent to learn good embeddings and weights!

In [None]:
def train_model(model, reviews, epochs=200, lr=0.1):
    """Train ``model`` in place with per-example gradient descent.

    Args:
        model: Classifier exposing ``forward``, ``W``, ``b`` and
            ``embeddings`` (updated in place).
        reviews: Iterable of ``(text, sentiment)`` pairs; the sentiment
            "positive" is the 1 class, anything else is the 0 class.
        epochs: Number of full passes over the data.
        lr: Learning rate.

    Returns:
        List with the mean binary cross-entropy loss of each epoch.

    Raises:
        ValueError: If no review contains at least one word.
    """
    # Encode text and labels once, up front: positive=1, negative=0.
    # Reviews with no tokens are skipped; they have no embeddings to update.
    examples = []
    for review, sentiment in reviews:
        indices = encode_review(review, vocab)
        if not indices:
            continue
        examples.append((indices, 1.0 if sentiment == "positive" else 0.0))

    if not examples:
        raise ValueError("train_model needs at least one review containing a word")

    losses = []

    print("Training...")
    for epoch in range(epochs):
        epoch_loss = 0.0

        for indices, label in examples:
            # Forward pass
            prob, avg_vec = model.forward(indices)

            # Compute loss (binary cross-entropy); epsilon guards log(0)
            loss = -label * np.log(prob + 1e-10) - (1 - label) * np.log(1 - prob + 1e-10)
            epoch_loss += loss

            # Backward pass: for sigmoid + cross-entropy, dLoss/dlogit = prob - label
            error = prob - label

            # Gradient for each word vector. It must use W as it was during
            # the forward pass, so compute it before W is updated. Each word
            # contributes 1/len(indices) to the average. Flattening W keeps
            # the result 1-D so it matches an embedding row.
            grad_embed = error * model.W.ravel() / len(indices)

            # Update weights
            model.W -= lr * np.outer(avg_vec, error)
            model.b -= lr * error

            # Update embeddings; a word repeated in the review is updated
            # once per occurrence.
            for idx in indices:
                model.embeddings[idx] -= lr * grad_embed

        losses.append(epoch_loss / len(examples))

        if (epoch + 1) % 40 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {losses[-1]:.4f}")

    return losses


# Train!
losses = train_model(model, reviews, epochs=200, lr=0.1)

# Plot loss
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

print("\n✅ Training complete!")

## 6. Test the Trained Model

Let's see how well it works!

In [None]:
# Evaluate on the training set itself. This is a sanity check that the model
# has fit the data, not a measure of how well it generalises.
correct = 0
for review, true_label in reviews:
    indices = encode_review(review, vocab)
    pred_label = model.predict(indices)
    is_correct = pred_label == true_label
    mark = '✅' if is_correct else '❌'
    print(f"'{review}' → Predicted: {pred_label}, True: {true_label} {mark}")
    # Adding a bool counts it as 0 or 1.
    correct += is_correct

# Share of correctly classified reviews, as a percentage.
accuracy = correct / len(reviews) * 100
print(f"\nAccuracy: {accuracy:.1f}%")

## 7. Try Your Own Reviews!

Test the model on new text!

In [None]:
# Test on new reviews (not in training data)
new_reviews = [
    "This is fantastic",
    "Horrible product",
    "Really great",
    "Very bad",
    "Absolutely love it",
]

print("Testing on new reviews:\n")
for review in new_reviews:
    # Words never seen during training are encoded as the <UNK> id.
    indices = encode_review(review, vocab)
    prediction = model.predict(indices)
    prob, _ = model.forward(indices)  # prob is P(positive)

    # Report confidence in the class that was actually predicted: for a
    # NEGATIVE prediction that is 1 - P(positive), not P(positive) itself.
    confidence = prob if prediction == "positive" else 1 - prob

    print(f"'{review}'")
    print(f"  → {prediction.upper()} (confidence: {confidence:.2%})\n")

## Summary

### What We Built:

1. **Sentiment classifier** using word embeddings
2. **Embedding layer** that learns meaningful word vectors
3. **Simple averaging** to combine word vectors
4. **Binary classifier** for positive/negative

### Key Insights:

- Embeddings make words meaningful to neural networks
- Averaging word vectors is simple but effective
- The model learns which words indicate positive/negative

### Next Steps:

👉 **Lesson 3**: Learn about **attention** - a more sophisticated way to combine word vectors!

Great work! You built your first NLP classifier! 🎉