# S07 - Word Embeddings & Neural Networks
## Exercises

### Exercise 1 (Easy)
Load pre-trained word embeddings (e.g. Word2Vec or GloVe) and find similar words.

In [9]:
import gensim.downloader as api

# Fetch the pre-trained "glove-wiki-gigaword-100" vectors through gensim's
# downloader API ("google-news-300" is the word2vec alternative).
model = api.load("glove-wiki-gigaword-100")

# Query the five entries most similar to 'king'; each result is consumed
# below as a (word, score) pair.
similar_words = model.most_similar("king", topn=5)

print("Top 5 words similar to 'king':")
for neighbour in similar_words:
    print("{}: {:.4f}".format(*neighbour))


Top 5 words similar to 'king':
prince: 0.7682
queen: 0.7508
son: 0.7021
brother: 0.6986
monarch: 0.6978


### Exercise 2 (Easy)
Perform word analogy: king - man + woman = ?

In [10]:
# Word analogies via vector arithmetic:
#   king  - man    + woman = ?
#   paris - france + spain = ?
import gensim.downloader as api

# Same pre-trained vectors as in Exercise 1.
model = api.load("glove-wiki-gigaword-100")

# `positive` terms are added and `negative` terms subtracted before the
# nearest-neighbour lookup.
result1 = model.most_similar(
    positive=["king", "woman"], negative=["man"], topn=5
)
result2 = model.most_similar(
    positive=["paris", "spain"], negative=["france"], topn=5
)

# Print both analogies with one shared loop; the second heading carries the
# leading blank line that separates the two result lists.
for heading, neighbours in (
    ("king - man + woman:", result1),
    ("\nparis - france + spain:", result2),
):
    print(heading)
    for word, score in neighbours:
        print(f"{word}: {score:.4f}")

king - man + woman:
queen: 0.7699
monarch: 0.6843
throne: 0.6756
daughter: 0.6595
princess: 0.6521

paris - france + spain:
madrid: 0.8061
aires: 0.7141
buenos: 0.6975
prohertrib: 0.6854
rome: 0.6849


### Exercise 3 (Medium)
Train your own Word2Vec model on a custom corpus.

In [12]:
from gensim.models import Word2Vec

# Toy corpus: one list of tokens per sentence.
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "ran", "in", "the", "park"],
    ["cats", "and", "dogs", "are", "pets"],
    ["the", "cat", "chased", "the", "dog"],
    ["pets", "need", "food", "and", "water"]
]

# Train Word2Vec model (vector_size=50, window=3, min_count=1).
#
# Reproducibility: a fixed `seed` together with a single worker thread makes
# repeated runs produce the same vectors. With several workers the order in
# which threads process the data varies between runs, so the similarity
# lists below would change each time the cell is executed. (Across separate
# interpreter launches gensim additionally requires PYTHONHASHSEED to be
# fixed for bit-identical results.)
#
# NOTE: this rebinds `model`, which the earlier exercises use for the
# pre-trained vectors; the trained vectors are accessed through `model.wv`.
model = Word2Vec(
    sentences=corpus,
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,   # single thread -> deterministic training order
    seed=42,     # fixed RNG seed for weight init and sampling
    sg=0  # 0 = CBOW, 1 = Skip-gram
)

# Get word vector
cat_vector = model.wv["cat"]
print("Vector size for 'cat':", len(cat_vector))

# Find similar words for both query words, then print them with one loop
# instead of two copy-pasted ones. With a corpus this small the scores are
# expected to be close to noise.
similar_to_cat = model.wv.most_similar("cat", topn=3)
similar_to_dog = model.wv.most_similar("dog", topn=3)

for query, neighbours in (("cat", similar_to_cat), ("dog", similar_to_dog)):
    print(f"\nTop 3 words similar to '{query}':")
    for word, score in neighbours:
        print(f"{word}: {score:.4f}")

Vector size for 'cat': 50

Top 3 words similar to 'cat':
and: 0.1656
in: 0.1538
need: 0.1366

Top 3 words similar to 'dog':
park: 0.1901
need: 0.0449
chased: -0.0101


### Exercise 4 (Medium)
Build a simple neural network for text classification using embeddings.

In [None]:
import torch
import torch.nn as nn

class TextClassifier(nn.Module):
    """Bag-of-embeddings text classifier.

    Each token id is mapped to a learned embedding, the embeddings of a
    sequence are averaged (mean pooling), and a single linear layer maps the
    pooled vector to class logits.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding table).
        embed_dim: Dimensionality of each token embedding.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Lookup table: token id -> embed_dim-dimensional vector.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Classifier head applied to the pooled sentence vector.
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Float tensor of unnormalised logits with shape (batch, num_classes).
        """
        embedded = self.embedding(x)    # (batch, seq_len, embed_dim)
        pooled = embedded.mean(dim=1)   # average over the sequence axis
        return self.fc(pooled)          # (batch, num_classes)


# Test with dummy data
torch.manual_seed(0)  # fixed seed so the smoke test is repeatable
dummy_batch = torch.randint(0, 100, (4, 7))  # 4 sequences of 7 token ids
classifier = TextClassifier(vocab_size=100, embed_dim=16, num_classes=3)
dummy_logits = classifier(dummy_batch)
assert dummy_logits.shape == (4, 3)
print("Logits shape:", tuple(dummy_logits.shape))


### Exercise 5 (Hard)
Implement the Skip-gram model from scratch (forward pass only).

*Research: Skip-gram predicts the context words given the center word.*

In [None]:
import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Skip-gram scoring model (forward pass only).

    Keeps two separate embedding tables, one used when a word acts as the
    center word and one used when it acts as a context word, and scores a
    (center, context) pair by the dot product of the two vectors.

    Args:
        vocab_size: Number of distinct word ids.
        embed_dim: Dimensionality of both embedding tables.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Two embedding matrices: center and context
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context):
        """Return dot-product scores between center and context words.

        Args:
            center: Integer tensor of center-word ids, shape (batch,).
            context: Integer tensor of context-word ids, either shape
                (batch,) for one context word per center word, or
                (batch, num_context) for several per center word.

        Returns:
            Float tensor of raw scores with the same shape as `context`.
        """
        center_vecs = self.center_embeddings(center)      # (batch, embed_dim)
        context_vecs = self.context_embeddings(context)   # (batch[, num_context], embed_dim)
        # When each center word has several context words, add an axis so the
        # center vector broadcasts against every one of its context vectors.
        if context_vecs.dim() == center_vecs.dim() + 1:
            center_vecs = center_vecs.unsqueeze(-2)
        # Dot product = elementwise product summed over the embedding axis.
        return (center_vecs * context_vecs).sum(dim=-1)
