## Word Embedding using Embedding Layer in Keras

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
# Toy corpus: seven short sentences that the rest of the notebook encodes.
sentences = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Vocabulary size: upper bound on the integer index assigned to a word.
voc_size = 10_000

### One-Hot Representation

In [4]:
# Encode every sentence as a list of integer word indices (one per word),
# each index being smaller than voc_size.
onehot_repr = list(map(lambda text: one_hot(text, voc_size), sentences))
print(onehot_repr)

[[2756, 1860, 2135, 8876], [2756, 1860, 2135, 7730], [2756, 8439, 2135, 5413], [6156, 3145, 653, 316, 5456], [6156, 3145, 653, 316, 1469], [2403, 2756, 8972, 2135, 3205], [9315, 927, 2705, 316]]


### Word Embedding Representation

In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [6]:
import numpy as np

In [7]:
# Left-pad ("pre") every encoded sentence with zeros up to a common length
# so that the whole batch forms a rectangular integer array.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding="pre",
)
print(embedded_docs)

[[   0    0    0    0 2756 1860 2135 8876]
 [   0    0    0    0 2756 1860 2135 7730]
 [   0    0    0    0 2756 8439 2135 5413]
 [   0    0    0 6156 3145  653  316 5456]
 [   0    0    0 6156 3145  653  316 1469]
 [   0    0    0 2403 2756 8972 2135 3205]
 [   0    0    0    0 9315  927 2705  316]]


In [8]:
# Embedding dimension: the number of features (vector components)
# used to represent each word.
dim = 15

In [9]:
# Single-layer model: an embedding table with voc_size rows and dim columns,
# applied to integer sequences of length sent_length.
model = Sequential(
    [
        Embedding(input_dim=voc_size, output_dim=dim, input_length=sent_length),
    ]
)
model.compile(loss="mse", optimizer="adam")

In [10]:
# Show each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 15)             150000    
Total params: 150,000
Trainable params: 150,000
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Run the padded index sequences through the embedding layer (no training
# has been run at this point). Every integer index is replaced by a
# `dim`-length vector, so identical indices (e.g. the leading 0 padding)
# produce identical rows.
model.predict(embedded_docs)

array([[[ 4.27546017e-02,  5.76422364e-03, -1.18412003e-02,
         -3.68241668e-02, -4.07806151e-02, -1.01029277e-02,
         -7.42722303e-04, -3.99689451e-02, -5.76803833e-03,
         -2.86594629e-02,  4.28852700e-02, -8.90409946e-03,
          2.92495638e-03,  3.13137285e-02, -5.15395403e-03],
        [ 4.27546017e-02,  5.76422364e-03, -1.18412003e-02,
         -3.68241668e-02, -4.07806151e-02, -1.01029277e-02,
         -7.42722303e-04, -3.99689451e-02, -5.76803833e-03,
         -2.86594629e-02,  4.28852700e-02, -8.90409946e-03,
          2.92495638e-03,  3.13137285e-02, -5.15395403e-03],
        [ 4.27546017e-02,  5.76422364e-03, -1.18412003e-02,
         -3.68241668e-02, -4.07806151e-02, -1.01029277e-02,
         -7.42722303e-04, -3.99689451e-02, -5.76803833e-03,
         -2.86594629e-02,  4.28852700e-02, -8.90409946e-03,
          2.92495638e-03,  3.13137285e-02, -5.15395403e-03],
        [ 4.27546017e-02,  5.76422364e-03, -1.18412003e-02,
         -3.68241668e-02, -4.07806151

In [12]:
# Padded integer encoding of the first sentence ("the glass of milk").
embedded_docs[0]

array([   0,    0,    0,    0, 2756, 1860, 2135, 8876])

In [13]:
# Embedding output for the first sentence: one row per position in the
# padded sequence, each row holding `dim` user-defined features.
model.predict(embedded_docs)[0]

array([[ 0.0427546 ,  0.00576422, -0.0118412 , -0.03682417, -0.04078062,
        -0.01010293, -0.00074272, -0.03996895, -0.00576804, -0.02865946,
         0.04288527, -0.0089041 ,  0.00292496,  0.03131373, -0.00515395],
       [ 0.0427546 ,  0.00576422, -0.0118412 , -0.03682417, -0.04078062,
        -0.01010293, -0.00074272, -0.03996895, -0.00576804, -0.02865946,
         0.04288527, -0.0089041 ,  0.00292496,  0.03131373, -0.00515395],
       [ 0.0427546 ,  0.00576422, -0.0118412 , -0.03682417, -0.04078062,
        -0.01010293, -0.00074272, -0.03996895, -0.00576804, -0.02865946,
         0.04288527, -0.0089041 ,  0.00292496,  0.03131373, -0.00515395],
       [ 0.0427546 ,  0.00576422, -0.0118412 , -0.03682417, -0.04078062,
        -0.01010293, -0.00074272, -0.03996895, -0.00576804, -0.02865946,
         0.04288527, -0.0089041 ,  0.00292496,  0.03131373, -0.00515395],
       [-0.00861138, -0.01501808,  0.02713568,  0.00891563, -0.03758206,
         0.03794751,  0.0037056 , -0.02273541, 