<a href="https://colab.research.google.com/github/xuziyue/tensorflow-models/blob/main/3_2_word_embedding_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### A Great Example

Based on the tutorial: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Show the TensorFlow version this notebook is running against.
print(tf.__version__)

2.7.0


In [None]:
# Fix TensorFlow's global seed so the randomly initialised weights are
# repeatable when the notebook is re-run from a fresh kernel.
tf.random.set_seed(42)

# Toy corpus: ten very short review-style documents.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# Class labels aligned with `docs`: 1 for the first five documents (praise),
# 0 for the last five (criticism).
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Integer-encode each document by hashing its words into the vocabulary range
# (index 0 is never assigned to a word, so it is free to act as padding).
# hashing_trick() with 'md5' is used instead of one_hot(): one_hot() hashes with
# Python's built-in hash(), which is not stable across interpreter runs, so the
# encodings would differ after every kernel restart. Tokenisation defaults
# (filters, lower-casing, split on spaces) are the same for both functions.
vocab_size = 50
encoded_docs = [
    tf.keras.preprocessing.text.hashing_trick(d, vocab_size, hash_function='md5')
    for d in docs
]

# Bring every document to exactly max_length indices by appending zeros
# ('post' padding). The longest document has 4 words, so nothing is truncated.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Model: 8-dimensional embedding per token -> mean over the 4 positions ->
# single sigmoid unit for the binary label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 8, input_length=max_length))
# The pooling layer averages all 4 positions of a document, including padded
# ones (no masking is configured on the embedding).
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly rather than wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 8)              400       
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 409
Trainable params: 409
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Shape of the embedding weight matrix held by the first layer
# (one row per vocabulary index, one column per embedding dimension).
model.layers[0].get_weights()[0].shape

(50, 8)

In [None]:
# Call the embedding layer directly on the padded documents and show the shape
# of the result: (documents, positions per document, embedding dimensions).
model.layers[0](padded_docs).shape

TensorShape([10, 4, 8])

In [None]:
# Train silently for 50 epochs on the ten padded documents.
model.fit(x=padded_docs, y=labels, epochs=50, verbose=0)

# Score the model on the very same documents it was trained on, so the number
# reported below is training accuracy, not held-out accuracy.
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
accuracy_pct = accuracy * 100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 80.000001


In [None]:
# Follow the first document through each representation.
# Look its embedding up once and reuse it for both prints below.
first_doc_vectors = model.layers[0](padded_docs)[0]

print(encoded_docs[0])  # hashed word indices of the first document
print(padded_docs[0])   # the same indices, zero-padded at the end
# Each sentence has 4 token positions and each token is stored as an 8-dim
# vector, so a (4, 8) tensor encodes one sentence.
print(first_doc_vectors.shape)
print(first_doc_vectors)

[29, 4]
[29  4  0  0]
(4, 8)
tf.Tensor(
[[-0.02121876  0.04471957  0.09124632  0.0430741   0.09961157 -0.02272778
  -0.09184757 -0.02938485]
 [-0.00388729  0.06070412  0.08556236  0.08647757  0.01273149 -0.00050254
   0.01930864 -0.06581032]
 [-0.08058061  0.03679593  0.038486    0.00696972  0.06237729 -0.04840755
  -0.0351923  -0.08223671]
 [-0.08058061  0.03679593  0.038486    0.00696972  0.06237729 -0.04840755
  -0.0351923  -0.08223671]], shape=(4, 8), dtype=float32)


In [None]:
# Full embedding weight matrix of the first layer: row i is the vector looked
# up for token index i (row 0 is the vector used by the zero-padded positions).
model.layers[0].get_weights()[0]

array([[-8.05806145e-02,  3.67959291e-02,  3.84860039e-02,
         6.96972385e-03,  6.23772852e-02, -4.84075472e-02,
        -3.51922959e-02, -8.22367147e-02],
       [ 4.07399796e-02, -2.88557764e-02,  4.80941683e-03,
        -4.29144502e-02,  2.48646997e-02, -3.98334116e-03,
         3.11442651e-02,  9.40827280e-03],
       [ 2.41012238e-02,  3.56241800e-02, -1.62905231e-02,
        -1.51143670e-02,  3.66087295e-02,  1.55284069e-02,
        -3.89590152e-02,  1.68357156e-02],
       [ 1.49687380e-03, -3.42687257e-02, -9.55693796e-03,
        -3.09592616e-02, -6.61833212e-03,  4.71772067e-02,
         1.94991492e-02, -9.23343748e-03],
       [-3.88728618e-03,  6.07041195e-02,  8.55623633e-02,
         8.64775702e-02,  1.27314851e-02, -5.02541079e-04,
         1.93086397e-02, -6.58103153e-02],
       [-1.75579675e-02, -2.27724202e-02,  2.26706378e-02,
         1.43569969e-02, -4.84803692e-02, -3.42891365e-03,
         5.62459230e-03,  3.24455760e-02],
       [-9.21002775e-02,  4.216606

In [None]:
# Output shape of layer 1 (GlobalAveragePooling1D) applied to the embedding
# output: the position axis is averaged away, leaving one vector per document.
model.layers[1](model.layers[0](padded_docs)).shape

TensorShape([10, 8])

In [None]:
# Output of layer 1 (GlobalAveragePooling1D) for the first document: the
# element-wise mean of that document's embedding rows.
model.layers[1](model.layers[0](padded_docs))[0]

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-0.04656682,  0.04475389,  0.06344517,  0.03587278,  0.05927441,
       -0.03001135, -0.03573088, -0.06491715], dtype=float32)>