<a href="https://colab.research.google.com/github/xuziyue/tensorflow-models/blob/main/3_2_word_embedding_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Record the TensorFlow release so readers can match the environment
# that produced the outputs shown in this notebook.
print(tf.version.VERSION)

2.7.0


In [2]:
# Toy corpus: ten short reviews; `labels` below marks the first five as class 1
# and the last five as class 0.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# Class labels, aligned with `docs` by position.
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
assert len(docs) == len(labels), "every document needs exactly one label"

# Integer-encode the documents with the hashing trick.
# one_hot() hashes words with Python's built-in hash(), which Keras documents as
# not stable across runs, so the same word could receive a different index after
# a kernel restart. hashing_trick() with 'md5' uses the same index range
# [1, vocab_size - 1] but yields identical encodings on every run.
# Distinct words may still collide on one index; index 0 is never assigned to a word.
vocab_size = 50
encoded_docs = [
    tf.keras.preprocessing.text.hashing_trick(d, vocab_size, hash_function='md5')
    for d in docs
]

# Pad documents (with trailing zeros) to a fixed length of 4 word indices.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
assert padded_docs.shape == (len(docs), max_length), "unexpected padded shape"
# Seed TensorFlow's global random generator before any layer is created so the
# randomly initialised weights are the same each time this cell is run.
tf.random.set_seed(42)

# Define the model: embedding lookup -> flatten -> one sigmoid unit.
model = tf.keras.Sequential()
# Every one of the `vocab_size` indices gets a trainable 8-dimensional vector,
# so a padded document becomes a (max_length, 8) matrix.
model.add(tf.keras.layers.Embedding(vocab_size, 8, input_length=max_length))
# Concatenate the per-word vectors into one flat feature vector per document.
model.add(tf.keras.layers.Flatten())
# Single sigmoid output for the binary label.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Compile the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model. summary() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 8)              400       
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [3]:
# Shape of the embedding lookup table held by the first layer:
# one row per vocabulary index, one column per embedding dimension.
embedding_layer = model.layers[0]
embedding_layer.get_weights()[0].shape

(50, 8)

In [4]:
# Run the padded documents through the embedding layer alone and inspect the
# shape of the result: (documents, words per document, embedding dimension).
embedded_docs = model.layers[0](padded_docs)
embedded_docs.shape

TensorShape([10, 4, 8])

In [5]:
# Follow the first document through each stage: raw word indices, then the
# zero-padded index row, then its embedded representation.
print(encoded_docs[0])
print(padded_docs[0])
# Each sentence holds 4 word slots and each word is stored as an 8-dim vector,
# so a (4, 8) tensor encodes one sentence.
first_doc_embedding = model.layers[0](padded_docs)[0]
print(first_doc_embedding.shape)
print(first_doc_embedding)

[19, 42]
[19 42  0  0]
(4, 8)
tf.Tensor(
[[ 0.01160971  0.01026126  0.01112379  0.03022653 -0.02167839 -0.00831026
   0.03004697 -0.00839169]
 [-0.02659056 -0.00979657  0.04716429 -0.00259627  0.02053921  0.04165914
   0.01037589  0.0260329 ]
 [ 0.02251155  0.04319591  0.02293619 -0.04363152 -0.00842453  0.00126964
   0.04676828 -0.04017727]
 [ 0.02251155  0.04319591  0.02293619 -0.04363152 -0.00842453  0.00126964
   0.04676828 -0.04017727]], shape=(4, 8), dtype=float32)


In [6]:
# Train for 50 epochs with progress output silenced.
model.fit(x=padded_docs, y=labels, epochs=50, verbose=0)
# Score the model on the same documents it was trained on; evaluate() returns
# the loss followed by the 'accuracy' metric requested in compile().
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
accuracy_pct = accuracy * 100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 80.000001
