### Option 2: Training our own word2vec model

In [1]:
# Toy training corpus: each inner list is one sentence that is already split into tokens.
# NOTE(review): "Data Science" is a single token containing a space, and the last
# sentence ends with an empty-string token -- confirm that both are intended.
tokenized_sentences = [
    ["Hello", "This", "is", "python", "training", "by", "aman"],
    ["Hello", "This", "is", "Java", "training", "by", "aman"],
    ["Hello", "This", "is", "Data Science", "training", "by", "unfold", "data", "science"],
    ["Hello", "This", "is", "programming", "training", ""],
]

In [2]:
# Fit a Word2Vec model on the toy corpus defined above.
import warnings
from gensim.models import Word2Vec

# Suppress all warnings from here on (this also hides library deprecation warnings).
warnings.filterwarnings(action='ignore')
# min_count=1 keeps words that occur only once in the corpus.
mymodel = Word2Vec(sentences=tokenized_sentences, min_count=1)

In [3]:
# Print a summary of the trained model
print(mymodel)

Word2Vec(vocab=14, size=100, alpha=0.025)


In [5]:
# Summarize the vocabulary as a plain list of words.
# gensim >= 4.0 exposes the vocabulary mapping as `wv.key_to_index`; the
# `wv.vocab` attribute used by gensim 3.x raises AttributeError there.
# Prefer the new attribute and fall back to the old one so the cell runs on both.
words = list(
    mymodel.wv.key_to_index
    if hasattr(mymodel.wv, 'key_to_index')
    else mymodel.wv.vocab
)

In [6]:
# Display the vocabulary words collected in `words`
print(words)

['Hello', 'This', 'is', 'python', 'training', 'by', 'aman', 'Java', 'Data Science', 'unfold', 'data', 'science', 'programming', '']


In [7]:
# Access the word vector for one word, "training".
# Index the KeyedVectors object (`.wv`) rather than the model itself: subscripting
# the Word2Vec model directly is deprecated in gensim 3.x and removed in gensim 4.x.
print(mymodel.wv['training'])

[ 1.7652006e-03  4.2132447e-03  2.7331195e-03  1.2549865e-03
 -4.2115119e-03  4.3915445e-03  3.3015842e-03  4.5203082e-03
 -9.0714358e-04  4.2450163e-03 -4.7139265e-04  7.7937159e-04
 -4.6052318e-03  7.7136169e-04  3.5639168e-03  3.5037040e-03
 -1.1604426e-03 -3.9611515e-03 -2.8210229e-03 -1.5761516e-03
  1.1480581e-03 -4.8129722e-03  1.7730580e-03  1.8463003e-03
  9.2621770e-04 -3.4174565e-03  4.3649445e-03 -1.2241554e-03
  2.6112152e-03 -1.4875482e-03  4.6623074e-03 -2.9707763e-03
 -4.2576655e-03  4.6593715e-03 -2.4806480e-03 -6.2086200e-04
  1.5933268e-03 -4.1002943e-03 -8.1407739e-04  4.9769813e-03
 -4.0604390e-04 -4.7034146e-03  7.6764665e-04  2.4228627e-03
  4.8451624e-03  4.8018512e-03 -1.6798589e-03  2.7910552e-03
  4.6154265e-03  3.7186374e-03 -3.9215353e-03 -4.2517749e-03
  1.3932454e-03 -1.0507343e-03  2.9044992e-03 -2.3074129e-03
  1.0726132e-03 -3.2585831e-03 -3.4364329e-03  1.5551604e-03
 -8.6220043e-06 -3.6030198e-03 -1.3437431e-03  3.4774707e-03
  1.2288950e-04 -4.46871

In [11]:
# Find the words most similar to "data" (lowercase, exactly as it appears in the corpus).
# Query through the KeyedVectors object (`.wv`): `Word2Vec.most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.x.
mymodel.wv.most_similar("data")

[('Data Science', 0.21103496849536896),
 ('', 0.16730409860610962),
 ('is', 0.14476194977760315),
 ('science', 0.13703784346580505),
 ('Java', 0.12803393602371216),
 ('training', 0.08047150075435638),
 ('aman', 0.07607199996709824),
 ('by', 0.06819257140159607),
 ('Hello', 0.052879899740219116),
 ('This', 0.02246938645839691)]

### Option 3: Create an embedding model using the Keras Embedding layer

In [13]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [14]:
# Six short example sentences used as the training documents.
sent = [
    "Hello, how are you",
    "how are you",
    "how are you doing",
    "I am Doing great",
    "I am doing good",
    "I am good",
]

In [15]:
# Binary class label for each sentence in `sent`, in the same order:
# 1 for the first three sentences, 0 for the last three.
sent_labels = array([1, 1, 1] + [0, 0, 0])

In [16]:
# Integer-encode every sentence: one_hot maps each word to an index below
# my_vocab_size by hashing it.
# NOTE(review): hashing can give different words the same index (in the recorded
# output below, "you" and "great" are both encoded as 17).
my_vocab_size = 30
encoded_sent = list(map(lambda text: one_hot(text, my_vocab_size), sent))
print(encoded_sent)

[[14, 16, 7, 17], [16, 7, 17], [16, 7, 17, 2], [18, 29, 2, 17], [18, 29, 2, 27], [18, 29, 27]]


In [17]:
# Bring every encoded sentence to a fixed length of 5 by inserting zeros at the
# front ('pre' padding).
length = 5
padded_sent = pad_sequences(
    sequences=encoded_sent,
    maxlen=length,
    padding='pre',
)
print(padded_sent)

[[ 0 14 16  7 17]
 [ 0  0 16  7 17]
 [ 0 16  7 17  2]
 [ 0 18 29  2 17]
 [ 0 18 29  2 27]
 [ 0  0 18 29 27]]


In [46]:
# Build the classifier as a stack of three layers.
# NOTE: this rebinds `mymodel`, which until now referred to the Word2Vec model.
mymodel = Sequential([
    # Learn an 8-dimensional vector for each of the my_vocab_size word indices.
    Embedding(my_vocab_size, 8, input_length=length),
    # Flatten the per-word vectors of a sentence into one feature vector.
    Flatten(),
    # Single sigmoid unit for the binary class label.
    Dense(1, activation='sigmoid'),
])

In [21]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
mymodel.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train on the padded sentences and their labels for 30 epochs.
# The call is left as the last expression so the returned History object is displayed.
mymodel.fit(x=padded_sent, y=sent_labels, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x209964da6a0>

In [25]:
# Evaluate on the same data the model was fitted on, so this is training
# accuracy, not a held-out estimate.
modelloss, modelaccuracy = mymodel.evaluate(
    x=padded_sent,
    y=sent_labels,
    verbose=0,
)
# Report the accuracy as a percentage.
print('Accuracy:%f' % (100 * modelaccuracy))

Accuracy:100.000000


### The prediction part

In [26]:
# Two sentences to classify with the trained model.
mysent_to_predict = [
    "how are you suman",
    "I am good",
]

In [27]:
# Integer-encode the sentences to predict.
# The hash space must be the one used for the training sentences, otherwise the
# indices would not line up with the rows the Embedding layer learned. Derive
# it from my_vocab_size instead of repeating the literal 30.
vocab_size = my_vocab_size
encoded = [one_hot(d, vocab_size) for d in mysent_to_predict]
print(encoded)

[[16, 7, 17, 12], [18, 29, 27]]


In [29]:
# Pad the encoded sentences with leading zeros to the model's input length.
# The padded length must equal the `length` passed to the Embedding layer as
# input_length, so reuse that value instead of repeating the literal 5.
max_length = length
mypadded = pad_sequences(encoded, maxlen=max_length, padding='pre')
print(mypadded)

[[ 0 16  7 17 12]
 [ 0  0 18 29 27]]


In [44]:
import numpy as np
# Turn the sigmoid outputs into hard class labels: values above 0.5 become 1,
# everything else becomes 0.
predict = np.greater(mymodel.predict(mypadded), 0.5).astype("int32")

In [45]:
# Display the predicted class labels (one row per input sentence)
predict

array([[1],
       [0]])