<a href="https://colab.research.google.com/github/twishi03/Graph/blob/main/Embedding_positional_encoding_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Embeddings capture the meaning of words
#Positional encoding adds position info to embeddings
#Combining them gives context-aware representations

In [1]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel. TODO(review): pin a torch version for reproducibility.
%pip install -q torch



In [2]:
import torch
import torch.nn as nn

In [None]:
#Define the Vocabulary and Embedding Layer
#Next, we will define the vocabulary size and the embedding dimension, and create the embedding layer:

In [3]:
# Seed PyTorch's global RNG before any weights are created: nn.Embedding
# initialises its table randomly, so without a seed every run prints different
# vectors. (Outputs saved from an earlier, unseeded run will differ until the
# notebook is re-executed.)
SEED = 42
torch.manual_seed(SEED)

# Define the vocabulary size and embedding dimension
vocab_size = 10  # For simplicity, we will use a small vocabulary
embedding_dim = 4  # Size of each embedding vector

# Create the embedding layer: a (vocab_size, embedding_dim) lookup table
embedding_layer = nn.Embedding(vocab_size, embedding_dim)

In [None]:
#Vocabulary Size: We set a small vocabulary size (10) for this example. In practice, this would be the total number of unique tokens in your dataset.
#Embedding Dimension: We choose an embedding dimension of 4, meaning each token will be represented by a 4-dimensional vector.
#Embedding Layer: We create an instance of nn.Embedding, which initializes the embedding weights randomly.

In [None]:
#Tokenization and Mapping to IDs
# Next, we will tokenize the input sentence and map each token to a unique ID. For simplicity, we will define the mapping manually:

In [4]:
# Example input sentence
sentence = "The cat sat on the mat."

# Manual tokenization (punctuation dropped). Matching is case-sensitive, so
# "The" and "the" are treated as two different vocabulary entries.
tokens = ["The", "cat", "sat", "on", "the", "mat"]

# Build the vocabulary from the unique tokens in first-seen order. Deduplicating
# first keeps the IDs contiguous (0..n-1) even if a token occurs more than once.
token_to_id = {token: idx for idx, token in enumerate(dict.fromkeys(tokens))}
print("Token to ID mapping:", token_to_id)

# Convert tokens to IDs by looking each one up, instead of spelling out every lookup by hand
token_ids = [token_to_id[token] for token in tokens]
print("Token IDs:", token_ids)

Token to ID mapping: {'The': 0, 'cat': 1, 'sat': 2, 'on': 3, 'the': 4, 'mat': 5}
Token IDs: [0, 1, 2, 3, 4, 5]


In [None]:
#Convert Token IDs to Embeddings
# Now, we will convert the token IDs into their corresponding embeddings using the embedding layer:

In [5]:
# Convert token IDs to a tensor of integer indices. The dtype is set explicitly
# because nn.Embedding only accepts integer index tensors, and an empty list
# would otherwise be inferred as a float tensor.
input_tensor = torch.tensor(token_ids, dtype=torch.long)

# Get the embeddings for the input tokens: one row of the table per token ID
embeddings = embedding_layer(input_tensor)
print("Embeddings:\n", embeddings)

Embeddings:
 tensor([[-0.1895, -0.7090, -0.4066, -0.1553],
        [-0.5835, -0.4718,  0.3449, -0.0829],
        [-1.9861,  0.6218,  0.3677,  0.6417],
        [ 0.3010, -0.0173,  1.2684, -0.6471],
        [-0.9190,  1.3044,  0.4334, -0.4524],
        [ 0.7899,  0.0160, -1.3633, -1.1432]], grad_fn=<EmbeddingBackward0>)


In [None]:
#Print Words with Their Corresponding Embeddings
# After retrieving the embeddings, you can use the token_to_id mapping to print each token alongside its embedding vector. Here’s the additional code to achieve that:

In [6]:
# Print the tokens with their corresponding embeddings
print("Tokens with their Embeddings:")
# token_id is a vocabulary ID, so look it up in the embedding table itself.
# Indexing the per-position `embeddings` tensor with a vocabulary ID is only
# correct when every ID happens to equal the token's position in the sentence.
for token, token_id in token_to_id.items():
    print(f"{token}: {embedding_layer.weight[token_id].detach().numpy()}")

Tokens with their Embeddings:
The: [-0.18949166 -0.7090116  -0.4065933  -0.15532453]
cat: [-0.5835363  -0.47175115  0.34488687 -0.08287223]
sat: [-1.9861022   0.6218238   0.36767727  0.6417273 ]
on: [ 0.3010073  -0.01734046  1.268361   -0.64714956]
the: [-0.9189837   1.3043665   0.43335357 -0.45244074]
mat: [ 0.78989875  0.01604908 -1.3633478  -1.1431677 ]


In [None]:
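#Define Positional Encoding Function
# Next, we will create a function to generate positional encodings based on the sinusoidal formulas we discussed earlier:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))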

In [7]:
def positional_encoding(seq_len, d_model):
    """Build a table of sinusoidal positional encodings.

    Even columns hold sin(pos * w_i) and odd columns hold cos(pos * w_i),
    where w_i = 10000 ** (-2i / d_model) for the i-th sin/cos pair.

    Args:
        seq_len: Number of positions (rows) to encode.
        d_model: Width of each encoding vector (columns). May be odd, in
            which case the last column is a sine with no paired cosine.

    Returns:
        A float tensor of shape (seq_len, d_model).
    """
    # Create a tensor for positional encodings
    pos_enc = torch.zeros(seq_len, d_model)

    # Column vector of positions 0..seq_len-1, shape (seq_len, 1)
    positions = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
    # One frequency per sin/cos pair, computed as exp(-2i * ln(10000) / d_model);
    # shape (ceil(d_model / 2),)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))

    # Broadcast to shape (seq_len, ceil(d_model / 2))
    angles = positions * div_term

    pos_enc[:, 0::2] = torch.sin(angles)  # Apply sine to even indices
    # For odd d_model there is one fewer odd column than even column, so drop
    # the unpaired last frequency to avoid a shape mismatch.
    pos_enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # Apply cosine to odd indices

    return pos_enc

In [None]:
# Generate Positional Encodings
# Now, we will generate the positional encodings for our input sentence.

In [8]:
# One encoding row per token in the sentence, each as wide as a token embedding
seq_len, d_model = len(tokens), embedding_dim

# Build the positional-encoding table for this sequence and display it
positional_encodings = positional_encoding(seq_len=seq_len, d_model=d_model)
print("Positional Encodings:\n", positional_encodings)

Positional Encodings:
 tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992],
        [-0.9589,  0.2837,  0.0500,  0.9988]])


In [None]:
# Adding Positional Encodings to Input Embeddings
# Finally, we will add the positional encodings to the input embeddings we created earlier.

In [9]:
# Element-wise sum of the position table and the token embeddings
encoded_embeddings = positional_encodings + embeddings
print("Encoded Embeddings (with Positional Encoding):\n", encoded_embeddings)

Encoded Embeddings (with Positional Encoding):
 tensor([[-0.1895,  0.2910, -0.4066,  0.8447],
        [ 0.2579,  0.0686,  0.3549,  0.9171],
        [-1.0768,  0.2057,  0.3877,  1.6415],
        [ 0.4421, -1.0073,  1.2984,  0.3524],
        [-1.6758,  0.6507,  0.4733,  0.5468],
        [-0.1690,  0.2997, -1.3134, -0.1444]], grad_fn=<AddBackward0>)
