In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: four short sentences that the rest of the notebook tokenizes.
# The trailing spaces inside the first three strings are kept as written.
texts = [
    "This is the first document. ",
    "This document is the second document. ",
    "And this is the third one. ",
    "Is this the first document?",
]

##### Tokenize the text: split your text into individual words or tokens.
##### Create a vocabulary: build a vocabulary mapping each unique word/token to an integer index.
##### Convert text to a sequence: replace each word/token in the text with its corresponding integer index based on the vocabulary.

In [3]:
# Step 1: tokenize the text.
# Fit a Keras Tokenizer on the corpus, then keep its word -> integer index
# mapping around for inspection and for sizing the vocabulary later on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

In [4]:
# Show the word -> integer index mapping taken from the fitted tokenizer.
word_index

{'this': 1,
 'is': 2,
 'the': 3,
 'document': 4,
 'first': 5,
 'second': 6,
 'and': 7,
 'third': 8,
 'one': 9}

#step2: convert text to sequences based on token number. Tokenize the text using Tokenizer and convert it into sequences of integers,
#pad the sequences to ensure they all have the same length. Then create an embedding matrix using the Embedding layer,
#where each word index in the sequences is mapped to a dense vector representation.


In [14]:
# Step 2: turn each text into a list of integer word indices using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts)

In [15]:
# Show the integer sequences, one list per input text.
sequences

[[1, 2, 3, 5, 4], [1, 4, 2, 3, 6, 4], [7, 1, 2, 3, 8, 9], [2, 1, 3, 5, 4]]

#then pad the sequences to ensure they all have the same length.
#then create an embedding matrix using the Embedding layer, where each word index in the sequence is mapped to a dense vector representation.

In [16]:
# Step 3: pad sequences (optional).
# Bring every sequence to a common length so the batch is rectangular.

# The longest sequence sets the target length, so nothing gets truncated.
max_sequence_length = max(len(seq) for seq in sequences)

# padding='post' puts the filler zeros after the real tokens.
sequences_padded = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)

In [17]:
# Show the padded batch; sequences shorter than the longest one end in zeros.
sequences_padded

array([[1, 2, 3, 5, 4, 0],
       [1, 4, 2, 3, 6, 4],
       [7, 1, 2, 3, 8, 9],
       [2, 1, 3, 5, 4, 0]])

In [18]:
# Step 4: apply an Embedding layer.
vocab_size = len(word_index) + 1  # +1 because index 0 is used for padding
embedding_dim = 10  # dimensionality of each dense embedding vector

# Seed the weight initializer so that Restart & Run All reproduces the same
# vectors; without a seed the layer is re-initialised randomly on every run.
# RandomUniform is what the layer's default 'uniform' initializer refers to.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.RandomUniform(seed=42),
)

# NOTE: despite its name, this is not the layer's weight matrix (that is
# embedding_layer.embeddings, one row per vocabulary index). It is the result
# of looking up every token index in sequences_padded, i.e. one
# embedding_dim-sized vector per token position of every sample.
embedding_matrix = embedding_layer(sequences_padded)

In [19]:
# Show the vocabulary size: the number of distinct words plus one for the padding index.
vocab_size

10

In [20]:
# Show the embedded batch returned by the Embedding layer for the padded sequences.
embedding_matrix

<tf.Tensor: shape=(4, 6, 10), dtype=float32, numpy=
array([[[ 0.00567038,  0.02052544, -0.02147003, -0.00859245,
         -0.02565751,  0.00093352, -0.0476921 ,  0.03640959,
         -0.02409015,  0.04215823],
        [-0.03412654,  0.02689386, -0.02375519,  0.00173729,
         -0.03407858, -0.0347609 , -0.03391661, -0.02012723,
          0.03678167, -0.017207  ],
        [ 0.04921539,  0.04874707, -0.03589689, -0.02438277,
         -0.01193593,  0.03276572, -0.01988528,  0.00199729,
          0.04116702, -0.01148437],
        [-0.04121898, -0.04724805, -0.03039035,  0.00681549,
         -0.01322696,  0.04899949,  0.02516169, -0.04047086,
         -0.04650396, -0.01550307],
        [ 0.04312699, -0.02032094,  0.04697779, -0.04523547,
          0.00775957, -0.0157824 ,  0.00147123,  0.01459128,
          0.04694228,  0.01379362],
        [ 0.01754178, -0.03100899,  0.00863298, -0.00800971,
          0.03541431, -0.0290244 , -0.01782984, -0.00624729,
          0.04921286, -0.0074969 ]],

In [21]:
# Print the shape of the embedded batch.
print(embedding_matrix.shape)  # Output: (num_samples, max_sequence_length, embedding_dim)

(4, 6, 10)


Dimension 1 (4): This dimension corresponds to the number of samples in your input data. In this case, you have 4 samples (or sentences/documents). Dimension 2 (6): This dimension represents the length of the sequences after padding. Since you've padded the sequences to a length of 6, this dimension is 6. Dimension 3 (10): This dimension is the dimensionality of the dense embedding vectors. You've chosen an embedding dimension of 10, so each word is represented by a dense vector of length 10.