In [1]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

"""
In this section, we will look at how we can learn a word embedding while fitting a neural network on a 
text classification problem.

We will define a small problem where we have 10 text documents, each with a comment about a piece of work a 
student submitted. Each text document is classified as positive “1” or negative “0”. This is a simple sentiment 
analysis problem.

First, we will define the documents and their class labels.
"""

# define documents
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])



Using TensorFlow backend.


In [2]:
"""
Next, we can integer encode each document. This means that as input the Embedding layer will have sequences 
of integers. We could experiment with other more sophisticated bag of word model encoding like counts or TF-IDF.

Keras provides the one_hot() function that creates a hash of each word as an efficient integer encoding. 
We will estimate the vocabulary size of 50, which is much larger than needed to reduce the probability of 
collisions from the hash function.
"""

# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)


[[2, 42], [41, 15], [28, 45], [26, 15], [2], [24], [15, 45], [7, 41], [15, 15], [33, 21, 42, 34]]


In [3]:
"""
The sequences have different lengths and Keras prefers inputs to be vectorized and all inputs to have 
the same length. We will pad all input sequences to have the length of 4. Again, we can do this with a built 
in Keras function, in this case the pad_sequences() function.
"""

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)


[[ 2 42  0  0]
 [41 15  0  0]
 [28 45  0  0]
 [26 15  0  0]
 [ 2  0  0  0]
 [24  0  0  0]
 [15 45  0  0]
 [ 7 41  0  0]
 [15 15  0  0]
 [33 21 42 34]]


In [4]:
"""
We are now ready to define our Embedding layer as part of our neural network model.

The Embedding has a vocabulary of 50 and an input length of 4. We will choose a small embedding 
space of 8 dimensions.

The model is a simple binary classification model. Importantly, the output from the Embedding 
layer will be 4 vectors of 8 dimensions each, one for each word. We flatten this to a one 32-element 
vector to pass on to the Dense output layer.
"""

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [6]:

# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))


Accuracy: 89.999998
