## Word Embedding Technique using Embedding Layer in Keras

In [1]:
## Requires TensorFlow > 2.0, which provides Keras as `tensorflow.keras`

In [3]:
from tensorflow.keras.preprocessing.text import one_hot

In [15]:
# Toy corpus: six short English sentences used throughout the notebook.
sent = [
    "I am a good boy",
    "I am a bad boy",
    "You are good girl",
    "I like mango",
    "I am from Kolkata",
    "I love technology",
]

In [16]:
# Display the list to confirm the sentences were stored as written.
sent

['I am a good boy',
 'I am a bad boy',
 'You are good girl',
 'I like mango',
 'I am from Kolkata',
 'I love technology']

In [17]:
## Vocabulary size
# Passed as the vocabulary-size argument to one_hot() and as the first
# argument of the Embedding layer in the cells below.

voc_size = 10000

## One Hot representation 

In [18]:
# Encode every sentence as a list of integers, one integer per word.
# NOTE(review): despite its name, one_hot() yields integer word indices rather
# than one-hot vectors; the Keras docs also describe its default hash function
# as not stable across runs, so exact values may differ on re-execution — confirm.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
onehot_repr

[[9792, 9209, 8358, 2812, 4624],
 [9792, 9209, 8358, 3386, 4624],
 [1405, 9530, 2812, 2605],
 [9792, 1977, 2030],
 [9792, 9209, 2950, 462],
 [9792, 7857, 264]]

## Word Embedding Representation

In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences  ## same number of words in every sentence
from tensorflow.keras.models import Sequential
import numpy as np 

In [25]:
# Padding is an important step for the word-embedding representation: every
# encoded sentence is brought to the same length so the batch forms a
# rectangular array that can be fed to the Embedding layer.
# padding='pre' inserts zeros at the front of short sentences;
# padding='post' would append them at the end instead.
sent_len = 10  # common sentence length (in words) after padding
# NOTE: despite its name, `embedding` holds the padded index sequences, not
# embedding vectors; the name is kept because later cells refer to it.
embedding = pad_sequences(onehot_repr, padding='pre', maxlen=sent_len)
embedding

array([[   0,    0,    0,    0,    0, 9792, 9209, 8358, 2812, 4624],
       [   0,    0,    0,    0,    0, 9792, 9209, 8358, 3386, 4624],
       [   0,    0,    0,    0,    0,    0, 1405, 9530, 2812, 2605],
       [   0,    0,    0,    0,    0,    0,    0, 9792, 1977, 2030],
       [   0,    0,    0,    0,    0,    0, 9792, 9209, 2950,  462],
       [   0,    0,    0,    0,    0,    0,    0, 9792, 7857,  264]])

In [27]:
## Define dimension: length of the embedding vector for each word
dim = 10

In [28]:
# Minimal model containing only an Embedding layer, which maps each of the
# `voc_size` possible word indices to a dense vector of length `dim`.
model = Sequential()
# Use the `dim` constant defined in the previous cell instead of repeating the
# literal 10, so the embedding size is configured in one place.
model.add(Embedding(voc_size, dim, input_length=sent_len))
# optimizer='adam' -> Adam optimizer; loss='mse' -> mean squared error.
model.compile(optimizer='adam', loss='mse')

In [29]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 10)            100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Run the padded index sequences through the Embedding layer to look up the
# vector for every word position. No training step appears in the cells shown,
# so these are presumably the layer's initial weights — confirm.
model.predict(embedding)

array([[[ 0.00465871,  0.04092607, -0.01598113, -0.02895813,
         -0.04308413, -0.03094381, -0.01282632, -0.04316006,
         -0.00367897, -0.03783653],
        [ 0.00465871,  0.04092607, -0.01598113, -0.02895813,
         -0.04308413, -0.03094381, -0.01282632, -0.04316006,
         -0.00367897, -0.03783653],
        [ 0.00465871,  0.04092607, -0.01598113, -0.02895813,
         -0.04308413, -0.03094381, -0.01282632, -0.04316006,
         -0.00367897, -0.03783653],
        [ 0.00465871,  0.04092607, -0.01598113, -0.02895813,
         -0.04308413, -0.03094381, -0.01282632, -0.04316006,
         -0.00367897, -0.03783653],
        [ 0.00465871,  0.04092607, -0.01598113, -0.02895813,
         -0.04308413, -0.03094381, -0.01282632, -0.04316006,
         -0.00367897, -0.03783653],
        [-0.04364819,  0.03255229, -0.01496198, -0.02031417,
         -0.03967492,  0.01066654, -0.02896609, -0.01036136,
          0.00958236,  0.03976781],
        [ 0.03196018,  0.00019069,  0.03539642,  0.0

In [31]:
# Padded index sequence of the second sentence (index 1).
embedding[1]

array([   0,    0,    0,    0,    0, 9792, 9209, 8358, 3386, 4624])

In [32]:
# Embedding vectors for the second sentence: one row per position in embedding[1].
model.predict(embedding)[1]

array([[ 0.00465871,  0.04092607, -0.01598113, -0.02895813, -0.04308413,
        -0.03094381, -0.01282632, -0.04316006, -0.00367897, -0.03783653],
       [ 0.00465871,  0.04092607, -0.01598113, -0.02895813, -0.04308413,
        -0.03094381, -0.01282632, -0.04316006, -0.00367897, -0.03783653],
       [ 0.00465871,  0.04092607, -0.01598113, -0.02895813, -0.04308413,
        -0.03094381, -0.01282632, -0.04316006, -0.00367897, -0.03783653],
       [ 0.00465871,  0.04092607, -0.01598113, -0.02895813, -0.04308413,
        -0.03094381, -0.01282632, -0.04316006, -0.00367897, -0.03783653],
       [ 0.00465871,  0.04092607, -0.01598113, -0.02895813, -0.04308413,
        -0.03094381, -0.01282632, -0.04316006, -0.00367897, -0.03783653],
       [-0.04364819,  0.03255229, -0.01496198, -0.02031417, -0.03967492,
         0.01066654, -0.02896609, -0.01036136,  0.00958236,  0.03976781],
       [ 0.03196018,  0.00019069,  0.03539642,  0.04949525,  0.0448291 ,
        -0.02930281, -0.00356598,  0.04215368