In [1]:
# Embedding

In [3]:
import torch
import torch.nn as nn

# Step 1: Define vocabulary
# Index 0 is reserved for the padding token and index 1 for unknown
# (out-of-vocabulary) tokens; real words start at index 2.
vocab = ['<pad>', '<unk>', 'i', 'love', 'nlp', 'hello', 'world', 'machine', 'learning']
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# A duplicated token would collapse two indices into one dictionary entry and
# leave rows of the embedding table unreachable, so fail loudly instead.
assert len(word_to_idx) == len(vocab), "vocab contains duplicate tokens"

# Step 2: Define embedding layer
vocab_size = len(vocab)
embedding_dim = 8

# nn.Embedding draws its initial weights from the global RNG; seed it so the
# table (and everything printed from it) is identical after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# padding_idx pins the '<pad>' row to zeros and excludes it from gradient
# updates, so padded positions never contribute to a model's output.
embedding_layer = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=word_to_idx['<pad>'],
)

# Step 3: Tokenizer
def tokenize(sentence):
    """Split a sentence into lower-cased, whitespace-separated tokens.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of lower-case token strings. Runs of whitespace act as a
        single separator, so an empty or all-whitespace sentence yields [].
    """
    lowered = sentence.lower()
    return lowered.split()

def sentence_to_indices(sentence):
    """Convert a sentence into a tensor of vocabulary indices.

    The sentence is tokenized with ``tokenize`` and every token is looked up
    in ``word_to_idx``; tokens missing from the vocabulary fall back to the
    index of ``'<unk>'``.

    Args:
        sentence: The text to convert.

    Returns:
        A 1-D ``torch.long`` tensor holding one index per token.
    """
    ids = []
    for tok in tokenize(sentence):
        # The fallback is resolved for every token, matching dict.get's
        # eagerly evaluated default argument.
        unk_id = word_to_idx['<unk>']
        ids.append(word_to_idx.get(tok, unk_id))
    return torch.tensor(ids, dtype=torch.long)

# Step 4: Embed any input sentence
def embed_sentence(sentence):
    """Look up the embedding vector of every token in a sentence.

    Args:
        sentence: The text to embed.

    Returns:
        The output of ``embedding_layer`` applied to the sentence's token
        indices: one ``embedding_dim``-sized row per token.
    """
    return embedding_layer(sentence_to_indices(sentence))

# === 🚀 TEST ===
# "am" and "vedh" are not in the vocabulary, so both map to the <unk> index
# and therefore share the same embedding row.
input_sentence = "i am vedh"
embedded = embed_sentence(input_sentence)
word_indices = sentence_to_indices(input_sentence).tolist()

print(f"\nInput sentence: '{input_sentence}'")
print("Word indices:", word_indices)
print("Embedding shape:", embedded.shape)
print("Embeddings:\n", embedded)



Input sentence: 'i am vedh'
Word indices: [2, 1, 1]
Embedding shape: torch.Size([3, 8])
Embeddings:
 tensor([[-1.2928, -0.1193,  0.1560, -1.1775, -0.2485, -0.1636,  0.7119,  0.6143],
        [-1.0409,  0.6119,  0.5584, -2.1784,  0.7289,  0.7069,  1.1904,  0.6467],
        [-1.0409,  0.6119,  0.5584, -2.1784,  0.7289,  0.7069,  1.1904,  0.6467]],
       grad_fn=<EmbeddingBackward0>)
