#### - [KDNuggets CBOW](https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html)
#### - [Checkout for Skip-gram](https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa)
#### - [Original Paper](https://arxiv.org/pdf/1301.3781.pdf)
#### - [Keras Embedding Layer](https://keras.io/layers/embeddings/#embedding)
#### - [Glove Vectors](https://nlp.stanford.edu/projects/glove/)

In [26]:
import nltk
nltk.download("popular")

from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

import numpy as np
from matplotlib import pyplot as plt
import re

from nltk.corpus import stopwords

import keras
from keras.layers import Dense, Activation, Input, Dropout
from keras.models import Model

f = open('../Datasets/sherlock.txt')
data = f.read()
f.close()

data = data[3433:50000]

data = data.lower()
data = re.sub('[^A-Za-z]+', ' ', data)
data = data.split()

stop_words = stopwords.words('english')
data = [word for word in data if word not in stop_words] 

#data=data[:100000]

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index

# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]
print(wids)
vocab_size = len(word2id)
embed_size = 100
window_size = 1 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/rishabkapoor/nltk_data...
[nltk_data]    

In [27]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    j=1
    for corpus_index,words in enumerate(corpus):
        if(j==1 or j==2 or j==3 or j==4 or j==5):
            print("words is ",words)
        sentence_length = len(words)
        if(j==1 or j==2 or j==3 or j==4 or j==5):
          print("sentence length is",sentence_length)
        #print(sentence)
        for index, word in enumerate(words):
            if(j==1 or j==2 or j==3 or j==4 or j==5):
              print("index is",index)
            if(j==1 or j==2 or j==3 or j==4 or j==5):
              print("word is ",word)
            context_words = []
            label_word   = []            
            start = corpus_index - window_size
            end = corpus_index + window_size + 1
            
            context_words.append([corpus[i][0] 
                                 for i in range(start, end) 
                                 if i>=start and i < end 
                                 and i != corpus_index and i>=0 and i<len(corpus)-1])
            if(j==1 or j==2 or j==3 or j==4 or j==5):
              print("Context words are:",context_words)
            label_word.append(word)
            if(j==1 or j==2 or j==3 or j==4 or j==5):
              print("label words are:",label_word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            if(j==1 or j==2 or j==3):
              print("x is ",x)
            y = np_utils.to_categorical(label_word, vocab_size)
            if(j==1 or j==2 or j==3):
              print("y is ",y)
            j=j+1
            yield (x, y)

In [28]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2, 100)            185300    
_________________________________________________________________
lambda_5 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1853)              187153    
Total params: 372,453
Trainable params: 372,453
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % len(data) == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

words is  [304]
sentence length is 1
index is 0
word is  304
Context words are: [[193]]
label words are: [304]
x is  [[  0 193]]
y is  [[0. 0. 0. ... 0. 0. 0.]]
words is  [193]
sentence length is 1
index is 0
word is  193
Context words are: [[304, 305]]
label words are: [193]
x is  [[304 305]]
y is  [[0. 0. 0. ... 0. 0. 0.]]
words is  [305]
sentence length is 1
index is 0
word is  305
Context words are: [[193, 194]]
label words are: [305]
x is  [[193 194]]
y is  [[0. 0. 0. ... 0. 0. 0.]]
words is  [194]
sentence length is 1
index is 0
word is  194
Context words are: [[305, 306]]
label words are: [194]
words is  [306]
sentence length is 1
index is 0
word is  306
Context words are: [[194, 609]]
label words are: [306]


SystemError: <class 'range'> returned a result with an error set
Apply node that caused the error: Elemwise{Composite{(i0 - ((i1 * i2) / (i3 + sqrt(clip(i4, i5, i6)))))}}[(0, 0)](embedding_5/embeddings, InplaceDimShuffle{x,x}.0, AdvancedIncSubtensor1{inplace,inc}.0, TensorConstant{(1, 1) of 1e-07}, Elemwise{Composite{((i0 * i1) + (i2 * sqr(i3)))}}[(0, 1)].0, TensorConstant{(1, 1) of 0.0}, TensorConstant{(1, 1) of inf})
Toposort index: 69
Inputs types: [TensorType(float32, matrix), TensorType(float32, (True, True)), TensorType(float32, matrix), TensorType(float32, (True, True)), TensorType(float32, matrix), TensorType(float32, (True, True)), TensorType(float32, (True, True))]
Inputs shapes: [(1853, 100), (1, 1), (1853, 100), (1, 1), (1853, 100), (1, 1), (1, 1)]
Inputs strides: [(400, 4), (4, 4), (400, 4), (4, 4), (400, 4), (4, 4), (4, 4)]
Inputs values: ['not shown', array([[0.001]], dtype=float32), 'not shown', array([[1.e-07]], dtype=float32), 'not shown', array([[0.]], dtype=float32), array([[inf]], dtype=float32)]
Outputs clients: [['output']]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [None]:
import pandas as pd
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['year','took','doctor','medicine']}

similar_words