In [26]:
import numpy as np

In [39]:
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Flatten, Input
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder

In [28]:
docs = ['king is man',
       'a king marrys queen',
       'queen marrys king',
       'some unrelated works',
       'queen is woman']

In [29]:
vocab_size = 50

`keras.preprocessing.text.one_hot` does not do one hot encoding. It actually transform a list of words into a list of integer numbers. It converts each word into an integer number that does not exceed the vocabulary size (That is it indexes each word)

In [30]:
xx = one_hot('king queen man woman unrelated', vocab_size)
xx

[8, 47, 15, 27, 11]

In [31]:
oh = np.array(xx).reshape(-1,1)
oh

array([[ 8],
       [47],
       [15],
       [27],
       [11]])

In [32]:
enc = OneHotEncoder()
enc.fit(np.array(range(vocab_size)).reshape(-1,1))
oh_enc = enc.transform(oh).toarray()
print('shape', oh_enc.shape)
print(oh_enc)

shape (5, 50)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]


In [33]:
encoded_docs = [one_hot(d, vocab_size) for d in docs]
encoded_docs

[[8, 41, 15], [29, 8, 15, 47], [47, 15, 8], [10, 11, 12], [47, 41, 27]]

In [34]:
max_length = 24
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs

array([[ 8, 41, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [29,  8, 15, 47,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [47, 15,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [10, 11, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [47, 41, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

## Create training data

In [35]:
tuples = np.empty((0,2))
for padded_doc  in padded_docs:
    length = len(padded_doc)
    for i in range(length):
        if padded_doc[i] != 0:
            if i < length-1 and padded_doc[i+1] != 0:
                tuples = np.append(tuples, [[padded_doc[i], padded_doc[i+1]]], axis=0)
                if i < length-2 and padded_doc[i+2] != 0:
                    tuples = np.append(tuples, [[padded_doc[i], padded_doc[i+2]]], axis=0)
                    
            if i > 0:
                tuples = np.append(tuples, [[padded_doc[i], padded_doc[i-1]]], axis=0)
                if i > 1:
                    tuples = np.append(tuples, [[padded_doc[i], padded_doc[i-2]]], axis=0)
                    
print(tuples.shape)
print(tuples)
                    
                    

(34, 2)
[[ 8. 41.]
 [ 8. 15.]
 [41. 15.]
 [41.  8.]
 [15. 41.]
 [15.  8.]
 [29.  8.]
 [29. 15.]
 [ 8. 15.]
 [ 8. 47.]
 [ 8. 29.]
 [15. 47.]
 [15.  8.]
 [15. 29.]
 [47. 15.]
 [47.  8.]
 [47. 15.]
 [47.  8.]
 [15.  8.]
 [15. 47.]
 [ 8. 15.]
 [ 8. 47.]
 [10. 11.]
 [10. 12.]
 [11. 12.]
 [11. 10.]
 [12. 11.]
 [12. 10.]
 [47. 41.]
 [47. 27.]
 [41. 27.]
 [41. 47.]
 [27. 41.]
 [27. 47.]]


In [36]:
onehotlabels_x = enc.transform(tuples[:, 0].reshape(-1,1)).toarray()

print(onehotlabels_x.shape)

(34, 50)


In [37]:
onehotlabels_y = enc.transform(tuples[:, 1].reshape(-1,1)).toarray()
print(onehotlabels_y.shape)

(34, 50)


In [60]:
input_ = Input(shape=[50])
h = Dense(2, activation='relu')(input_)
x = Dense(50, activation='softmax')(h)

model = Model(inputs=input_, outputs=x)

model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['acc'])

In [61]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 50)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 102       
_________________________________________________________________
dense_16 (Dense)             (None, 50)                150       
Total params: 252
Trainable params: 252
Non-trainable params: 0
_________________________________________________________________


In [62]:
epochs = 50

model.fit(onehotlabels_x, onehotlabels_y, epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x11a6a4dd8>

In [63]:
model2 = Model(inputs=input_, outputs=h)

model2.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['acc'])

test = model2.predict(oh_enc)
print(test)

[[1.5775301  1.4147451 ]
 [1.4600008  1.44418   ]
 [1.1103778  1.4671073 ]
 [0.9585242  1.1078153 ]
 [1.3889623  0.00233436]]


In [64]:
print(test[0,:] - test[2,:] + test[3,:] - test[2,:])

[ 0.3152988  -0.41165423]
