## Import Stuff

In [None]:
from numpy import array
from numpy.linalg import norm

from tensorflow.keras.layers import Embedding
from keras import Model
from keras.layers import Dense
from keras.layers import Flatten
# Keep this after the tensorflow.keras import above: the later import decides
# which Embedding class the name is bound to.
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot

## Override Keras Embedding Class

Similarity is measured by cosine similarity.

In [None]:
class Embedding2(Embedding):
  """Embedding layer extended with a cosine-similarity helper for its vectors."""

  def measure_similarity(self, a, b):
    """Return the cosine similarity of two embedding vectors.

    Args:
      a: NumPy array holding one embedding vector (1-D in this notebook).
      b: NumPy array holding another embedding vector of the same length.

    Returns:
      (a . b) / (||a|| * ||b||). For 1-D inputs this lies in [-1, 1] up to
      floating-point rounding. If either vector has zero norm the cosine is
      undefined; 0 is returned instead of nan.
    """
    dot = a @ b.T
    scale = norm(a) * norm(b)
    if scale == 0:
      # A zero-norm operand makes the dot product itself zero, so returning it
      # yields 0 with the usual dtype and avoids the 0/0 -> nan division.
      return dot
    return dot / scale

## Classification model to train on text data

This code is partly copy-pasted from the internet and is only used to train my embedding layer.

In [None]:
# Training data: each short review is paired with its class label
# (1 for the first five phrases, 0 for the last five).
labelled_docs = [
    ('Well done!', 1),
    ('Good work', 1),
    ('Great effort', 1),
    ('nice work', 1),
    ('Excellent!', 1),
    ('Weak', 0),
    ('Poor effort!', 0),
    ('not good', 0),
    ('poor work', 0),
    ('Could have done better.', 0),
]

# Split the pairs back into the two parallel sequences used by the later cells.
docs = [text for text, _ in labelled_docs]
labels = array([label for _, label in labelled_docs])

# integer encode the documents
# A fitted Tokenizer gives every distinct word its own index (1, 2, ...).
# one_hot() hashes words into the index range instead, so different words can
# collide (e.g. 'done' and 'not' were both encoded as 3) and would then be
# forced to share a single embedding vector.
vocab_size = 50  # rows in the embedding table; must exceed the largest word index
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
print(encoded_docs)

# pad documents to a max length of 4 words
# (the longest document has 4 words; shorter ones are filled with trailing 0s)
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# define the model
# Embedding2 maps each word index to an 8-dimensional vector, Flatten joins the
# max_length word vectors of a document, and one sigmoid unit outputs the class
# probability. The embedding layer is named 'embed' so it can be looked up later.
model = Sequential()
model.add(Embedding2(vocab_size, 8, input_length=max_length, name='embed'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Train silently for 50 epochs on the padded documents.
model.fit(x=padded_docs, y=labels, epochs=50, verbose=0)

# Score the model on the same data it was trained on and report the accuracy
# as a percentage.
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

[[16, 3], [32, 46], [6, 35], [25, 46], [2], [5], [43, 35], [3, 32], [43, 46], [9, 18, 3, 7]]
[[16  3  0  0]
 [32 46  0  0]
 [ 6 35  0  0]
 [25 46  0  0]
 [ 2  0  0  0]
 [ 5  0  0  0]
 [43 35  0  0]
 [ 3 32  0  0]
 [43 46  0  0]
 [ 9 18  3  7]]
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding2)           (None, 4, 8)              400       
_________________________________________________________________
flatten_11 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998


## Get output of Embedding Layer

In [None]:
# Build a sub-model that reuses the trained model's input but stops at the
# 'embed' layer, so calling it returns word vectors instead of the class score.
embed_layer = model.get_layer('embed')
embedding_model = Model(inputs=model.input, outputs=embed_layer.output)

# One embedding vector per word position of every padded document.
embedding_output = embedding_model(padded_docs)

## Measuring similarity

In [None]:
# Source phrases of the words compared below.
well_done, good_work, weak = docs[0], docs[1], docs[5]

# embedding_output is indexed [document][word position]; pick one vector per word.
well_embedding, done_embedding = embedding_output[0][0], embedding_output[0][1]
good_embedding, work_embedding = embedding_output[1][0], embedding_output[1][1]
weak_embedding = embedding_output[5][0]

# The similarity helper is a method of the custom embedding layer; fetch it once.
similarity = model.get_layer('embed').measure_similarity

# Each comparison prints the expectation first, then the measured cosine similarity.
a = similarity(good_embedding.numpy(), well_embedding.numpy())
print("Well - Good : Strong positive correlation")
print(a)

b = similarity(well_embedding.numpy(), work_embedding.numpy())
print("Well - Work: I dont know")
print(b)

c = similarity(weak_embedding.numpy(), done_embedding.numpy())
print("Weak - Done: I dont know")
print(c)

d = similarity(weak_embedding.numpy(), good_embedding.numpy())
print("Weak - Good: Strong negative correlation")
print(d)

Well - Good : Strong positive correlation
0.39971587
Well - Work: I dont know
-0.4904759
Weak - Done: I dont know
0.020694826
Weak - Good: Strong negative correlation
-0.5158577
