**Note: The following comes from [Use Word Embedding Layers - Deep Learning Keras](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/).**
# Training Your Own Embeddings

You'll need to make sure that `tensorflow` and `keras` are installed:
```
pip install tensorflow keras
```

### Note:
The following is NOT implementing `word2vec`. It is simply training an embedding layer via a vanilla neural network.

In [1]:
from numpy import array


In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [None]:
# Toy sentiment corpus: seven positive phrases followed by nine negative ones.
docs = [
    # positive
    "Well done!",
    "Good work",
    "Awesome job",
    "Amazing",
    "Great effort",
    "nice work",
    "Excellent!",
    # negative
    "Weak",
    "Poor effort!",
    "not good really bad",
    "poor work",
    "Weak, not well done",
    "Poor job",
    "Weak and terrible",
    "Not very good",
    "Could have done better.",
]
# One label per document, in the same order: 1 = positive, 0 = negative.
labels = array([1] * 7 + [0] * 9)

## Define the Vocab Size

In [None]:
# Number of rows in the embedding table (passed as the first argument to Embedding below).
# It does not have to equal the exact number of unique words, but it must be larger than
# the largest integer id the tokenizer assigns; 50 is a comfortable upper bound for this toy corpus.
vocab_size = 50

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
tokenizer.word_index

In [None]:
from keras.preprocessing.text import text_to_word_sequence

## Integer Encode the Documents

In [None]:
from typing import List
def integer_encode_documents(docs: List[str], tokenizer: Tokenizer)-> List[List[int]]:
    """Convert each document into the list of integer ids of its words.

    Args:
        docs: Raw text documents.
        tokenizer: A fitted Keras ``Tokenizer``; its ``word_index`` supplies
            the word -> integer id mapping.

    Returns:
        One list of integer ids per document, in document order. Words that
        are missing from ``tokenizer.word_index`` are skipped rather than
        raising ``KeyError``, so unseen test documents can be encoded.
    """
    word_index = tokenizer.word_index
    return [
        [word_index[word] for word in text_to_word_sequence(doc) if word in word_index]
        for doc in docs
    ]

In [None]:
def integer_encode_documents(docs, tokenizer):
    """Return ``docs`` encoded as lists of integer word ids by ``tokenizer``.

    Thin wrapper around ``tokenizer.texts_to_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return sequences

In [None]:
# integer encode the documents
encoded_docs = integer_encode_documents(docs, tokenizer)
# The result is a list of lists: one inner list per document, one integer per word.
# Each integer is the id the tokenizer assigned to that word (its value in tokenizer.word_index),
# e.g. 3 means "the word whose id is 3".
# Notice the last document has 4 numbers, since it is a 4 word document: Could have done better.
from pprint import pprint
pprint(encoded_docs)

## Get Max Length of Tokens

In [None]:
def get_max_token_length_per_doc(docs: List[str])-> int:
    """Return the largest whitespace-delimited token count among ``docs``.

    Args:
        docs: Raw text documents (plain strings, not pre-tokenized lists).

    Returns:
        The number of tokens in the longest document, where tokens are the
        pieces produced by ``str.split()``. Returns 0 when ``docs`` is empty.
    """
    return max((len(doc.split()) for doc in docs), default=0)

In [None]:
# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

## Pad Documents to Max Length

In [None]:
# pad documents to a max length of 4 words
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_docs)
# since the max length is 4 words in a document, we pad all the documents to have 4 words, just set index to 0
# if it doesn't have any words

Note: important design consideration - pad zeros after the document, or before?

## Define an Embedding Size
This is the number of dimensions in each word vector, i.e. how many numbers "represent" a word. In `word2vec`, this would typically be 100 or 300.

In [None]:
EMBEDDING_SIZE = 8

# Define Our Deep Learning Model

In [None]:
# define the model
# remember, vocab_size = 50
model = Sequential([
    # looks up an EMBEDDING_SIZE-dimensional vector for every word id in a document
    Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length),
    # for each document, the output of the embedding layer is 4 x 8 matrix
    # (4 since 4 words per document, 8 since size 8 embedding). Flatten makes this a 32 x 1 vector.
    Flatten(),
    # these 32 elements are coalesced into one final output node, a sigmoid
    # that outputs a probability of positive or negative
    Dense(1, activation='sigmoid'),
])

!["ada"](images/architecture.png "Logo Title Text 1")

In [None]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

In [None]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
# NOTE: this scores the model on the same documents it was just trained on,
# so the printed value is training accuracy, not an estimate of generalization.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

### Get Embedding Layer Weights

In [None]:
embedding_layer = model.layers[0]
embedding_layer.get_weights()[0].shape

## Test Prediction

In [None]:
encoded_test_docs = integer_encode_documents(["Awesome work", 
                                              "Really bad, terrible", 
                                              "amazing work"], tokenizer)

# pad test documents
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_test_docs)

In [None]:
prediction = model.predict(padded_test_docs, verbose=0)
prediction

# Using Pre-Trained Embeddings

In [None]:
# Smaller toy corpus for the pre-trained embedding example:
# five positive phrases followed by five negative ones.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]
# class labels, aligned with docs (1 = positive, 0 = negative)
labels = array([1] * 5 + [0] * 5)
# fit a fresh tokenizer on this corpus
t = Tokenizer()
t.fit_on_texts(docs)
# one row per word id, plus one extra row because id 0 is not assigned to any word
vocab_size = 1 + len(t.word_index)
# turn every document into a list of word ids
encoded_docs = t.texts_to_sequences(docs)
print("Encoded docs:\n", encoded_docs)
# right-pad every document with zeros up to 4 ids
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("Padded docs:\n", padded_docs)
# will hold word -> pre-trained vector, filled in by the next cell
embeddings_index = {}

In [None]:
# Load the GloVe vectors into embeddings_index (word -> float32 vector).
# Each line holds a word followed by its vector components, separated by spaces.
# The file is opened with an explicit UTF-8 encoding (it contains non-ASCII tokens,
# so relying on the platform default can fail) and inside a `with` block so the
# handle is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

## Pre-Load the Weight Matrix

In [None]:
# Build the initial weight matrix for the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer id is i.
embedding_matrix = zeros((vocab_size, 100))
for word, word_id in t.word_index.items():
    pretrained_vector = embeddings_index.get(word)
    if pretrained_vector is None:
        # no pre-trained vector for this word: its row stays all zeros
        continue
    embedding_matrix[word_id] = pretrained_vector

In [None]:
# define model
model = Sequential()
# The Embedding layer starts from the pre-trained matrix and is frozen
# (trainable=False), so only the Dense layer is learned.
# input_length reuses max_length (the width padded_docs was padded to) instead of
# repeating the literal 4, so the two cannot drift apart.
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
# flatten the (max_length x 100) embedding output into a single vector per document
model.add(Flatten())
# single sigmoid unit: probability that the document is positive
model.add(Dense(1, activation='sigmoid'))

!["ada"](images/architecture2.png "Logo Title Text 1")

In [None]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would also emit a stray "None"
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model (on the training documents themselves, so this is training accuracy)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [None]:
model.predict(padded_docs)

In [None]:
encoded_docs

In [None]:
from typing import List
def integer_encode_documents(docs: List[str])-> List[List[int]]:
    """Encode ``docs`` with the tokenizer ``t`` that the current model was trained with.

    The model's embedding rows are indexed by the ids in ``t.word_index``, so
    documents fed to ``model.predict`` must use those same ids. Hash-based
    encoding (``one_hot``) assigns unrelated ids and would make the model look
    up the wrong word vectors.

    Args:
        docs: Raw text documents.

    Returns:
        One list of integer word ids per document, as produced by
        ``t.texts_to_sequences``.
    """
    return t.texts_to_sequences(docs)

In [None]:
encoded_test_docs = integer_encode_documents(["Good work"])

# pad test documents
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_test_docs)
model.predict(padded_test_docs)

## Word Embeddings with Amazon Toy Reviews Dataset

In [None]:
import numpy as np

# Maximum number of reviews to take from each class.
NUM_SAMPLES = 5000

# Read one review per line; `with` closes each file as soon as it has been read.
with open("good_amazon_toy_reviews.txt") as good_file:
    good_reviews = good_file.readlines()
with open("poor_amazon_toy_reviews.txt") as bad_file:
    bad_reviews = bad_file.readlines()

sampled_good_reviews = good_reviews[:NUM_SAMPLES]
sampled_bad_reviews = bad_reviews[:NUM_SAMPLES]

docs = sampled_good_reviews + sampled_bad_reviews
# Build labels from the actual sample sizes so they stay aligned with docs
# even if a file contains fewer than NUM_SAMPLES lines (1 = good, 0 = poor).
labels = np.concatenate([np.ones(len(sampled_good_reviews)), np.zeros(len(sampled_bad_reviews))])

## Remove Stopwords Using Spacy

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Rebuild every review as a space-joined string of its non-stopword tokens.
# nlp.pipe processes the texts as a stream in batches, which is considerably
# faster than calling nlp(doc) once per review; results come back in input order.
stopwords_removed_docs = [
    " ".join(token.text for token in parsed_doc if not token.is_stop)
    for parsed_doc in nlp.pipe(docs)
]

## Tokenize the Text
I'm just using perhaps the most basic tokenization possible from Keras. Read [the documentation for more options](https://keras.io/preprocessing/text/). The most notable options:
* `num_words`: the maximum number of words to keep, based on word frequency.
* `oov_token`: adds an `OOB` (out of bag) or `OOV` (out of vocabulary) token to the `word_index`; it is used in place of any word that is not in the vocabulary.

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(stopwords_removed_docs)

In [None]:
def integer_encode_documents(docs, tokenizer):
    """Encode ``docs`` as lists of integer word ids using the given fitted ``tokenizer``.

    Delegates directly to ``tokenizer.texts_to_sequences``.
    """
    encoded = tokenizer.texts_to_sequences(docs)
    return encoded

In [None]:
# integer encode the documents
encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)
# encoded_docs is a list of lists: one inner list per review, one integer word id per token.
# max_length still holds the value 4 from the toy GloVe example above; reusing it here
# would cut every review down to 4 tokens. Recompute it for this corpus so that no
# review is truncated (lower it deliberately if the longest review makes the model too large).
max_length = max(len(doc) for doc in encoded_docs)
# right-pad every review with zeros up to max_length ids
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Number of rows for the embedding table: 1.3x the number of entries in the
# tokenizer's word_index, i.e. deliberately larger than the vocabulary.
# NOTE: the printed figure is therefore the embedding table size, not the
# actual count of unique tokens (that is len(tokenizer.word_index)).
vocab_size = int(len(tokenizer.word_index) * 1.3)
print(f"Vocab size is {vocab_size} unique tokens.")

In [None]:
EMBEDDING_SIZE = 50

In [None]:
from keras.utils.vis_utils import plot_model

# Define and Compile the Model

In [None]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
model.add(Flatten())
# for each document, the output of the embedding layer is a
# max_length x EMBEDDING_SIZE matrix (one EMBEDDING_SIZE-dimensional vector per token).
# Flatten turns it into a single vector of max_length * EMBEDDING_SIZE values.
model.add(Dense(1, activation='sigmoid'))
# the flattened values are coalesced into one final output node, a sigmoid
# that outputs a probability of positive or negative
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would also emit a stray "None"
model.summary()
# also save a diagram of the architecture, including layer shapes, to model.png
plot_model(model, to_file='model.png', show_shapes=True)


## Fit the Model

In [None]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=1)
# evaluate the model
# NOTE: evaluation uses the same reviews the model was trained on, so the
# printed value is training accuracy; no held-out split is made in this notebook.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=1)
print('Accuracy: %f' % (accuracy*100))

In [None]:
final_embeddings = model.layers[0].get_weights()[0]

In [None]:
final_embeddings.shape

In [None]:
# Keep exactly one embedding row per entry of tokenizer.word_index, in word_index order.
# Row i of the embedding matrix belongs to word id i, and word ids start at 1
# (row 0 is the padding id), so the slice must start at 1; slicing from 0 would pair
# every word with the vector of the previous id.
# The rows are re-read from the model so re-running this cell gives the same result.
# NOTE(review): the tokenizer was created with num_words=5000, so ids at or above that
# limit presumably never appear in the training data and keep their initial weights — confirm.
final_embeddings = model.layers[0].get_weights()[0][1:len(tokenizer.word_index) + 1]

In [None]:
# Map each token to an embedding by position: the i-th key of tokenizer.word_index
# is paired with the i-th row of final_embeddings.
embeddings_dict = dict(zip(tokenizer.word_index, final_embeddings))

# Find Similarities of the new Embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# pandas is not imported in any earlier cell shown in this notebook, so import it here.
import pandas as pd

# Pairwise cosine similarity between all embedding rows, labelled on both axes
# by the tokens of tokenizer.word_index (in the same order as the rows).
vocabulary = list(tokenizer.word_index)
similarities = pd.DataFrame(cosine_similarity(final_embeddings),
                            index=vocabulary, columns=vocabulary)

In [None]:
# unstack matrix into table: one row per (row word, column word) pair.
# The axis names are cleared first so reset_index() produces plain positional column names.
similarity_table = similarities.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()
# rename columns
similarity_table.columns = ["word1", "word2", "similarity"]
similarity_table.shape

# Drop pairs with similarity >= 0.99; this removes every word paired with itself
# (cosine similarity 1) along with any other near-identical pairs.
similarity_table = similarity_table[similarity_table["similarity"] < 0.99]
similarity_table.shape

In [None]:
# Show the 30 most similar word pairs.
# drop_duplicates keeps only the first row for each distinct similarity value —
# presumably to collapse mirrored (a, b) / (b, a) pairs, which share the same score;
# note it would also drop unrelated pairs that happen to have an identical score.
similarity_table.sort_values(by="similarity", ascending=False).drop_duplicates(
    subset="similarity", keep="first").head(30)