In [1]:
# Silence all warnings for the rest of the notebook.
# A single 'ignore' filter is enough: it is inserted at the front of the filter
# list and matches every warning, so a preceding 'always' filter never took effect.
import warnings
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline mode so graphs are displayed below the corresponding cell.
%matplotlib inline  
# Apply the 'fivethirtyeight' matplotlib style sheet to all figures.
style.use('fivethirtyeight')
# Then configure seaborn with the 'whitegrid' theme and short colour codes.
# NOTE(review): seaborn's set() presumably overrides part of the matplotlib style
# chosen on the previous line -- confirm which look is actually intended.
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#stop-words
from nltk.corpus import stopwords
# English stop-word set, built through the `stopwords` corpus reader imported above.
stop_words = set(stopwords.words('english'))

# tokenizing
from nltk import word_tokenize,sent_tokenize

#keras

from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Flatten ,Embedding,Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [7]:
# Three toy documents (a tongue twister) used as the whole corpus.
sample_text_1, sample_text_2, sample_text_3 = (
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
)

# The corpus is the ordered list of documents; no_docs is its size.
corp = [sample_text_1, sample_text_2, sample_text_3]
no_docs = len(corp)

In [8]:
# Size of the hashing space used by one_hot; also the number of rows the
# embedding table needs, so it must be the single source of truth.
vocab_size=50

# Integer-encode every document. one_hot hashes each word into the vocab_size
# space, so distinct words can collide on the same index.
encod_corp=[]
for i,doc in enumerate(corp):
    # Encode once and reuse the result for both storage and display.
    encoded_doc=one_hot(doc,vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [46, 30, 21, 21, 29, 14]
The encoding for document 2  is :  [45, 15, 21, 29, 14, 10, 21, 21, 41]
The encoding for document 3  is :  [6, 44, 30, 27, 46, 14, 24, 19, 15, 41, 14, 46]


In [9]:
# Length of the longest document, needed to pad every document to a common
# length before creating embeddings.
# The length is taken from the Keras-encoded sequences themselves rather than
# from nltk.word_tokenize: the two tokenizers may split text differently
# (e.g. around punctuation), and a smaller NLTK count would make pad_sequences
# truncate encoded documents. This also removes the need for the 'punkt' data.
maxlen=max((len(seq) for seq in encod_corp),default=0)
print("The maximum number of words in any document is : ",maxlen)

The maximum number of words in any document is :  12


In [10]:
# The embedding layer needs equal-length inputs, so shorter documents are
# padded at the end ('post') with zeros up to maxlen.
pad_corp = pad_sequences(
    encod_corp,
    maxlen=maxlen,
    padding='post',
    value=0.0,
)
padded_count = len(pad_corp)
print("No of padded documents: ", padded_count)

No of padded documents:  3


In [11]:
# Show each padded sequence, numbering documents from 1.
for doc_number, padded_doc in enumerate(pad_corp, start=1):
    print("The padded encoding for document", doc_number, " is : ", padded_doc)

The padded encoding for document 1  is :  [46 30 21 21 29 14  0  0  0  0  0  0]
The padded encoding for document 2  is :  [45 15 21 29 14 10 21 21 41  0  0  0]
The padded encoding for document 3  is :  [ 6 44 30 27 46 14 24 19 15 41 14 46]


In [12]:
# specifying the input shape
# NOTE(review): this tensor is not referenced by any later cell visible here
# (the model below builds its own `word_input`), and the name `input` shadows
# Python's built-in input() for the rest of the session -- confirm whether this
# cell is still needed.
input=Input(shape=(no_docs,maxlen),dtype='float64')

In [13]:
# Model input: one document per row, each row holding `maxlen` word indices
# (12 for this corpus). Indices are integers, so the input is declared int32
# rather than a floating-point dtype.
word_input=Input(shape=(maxlen,),dtype='int32')

# Embedding lookup: vocab_size rows, one 8-dimensional vector per word index.
# The sequence length is already fixed by word_input's shape, so the deprecated
# `input_length` argument is not passed.
word_embedding=Embedding(input_dim=vocab_size,output_dim=8)(word_input)

# Flatten (batch, maxlen, 8) into (batch, maxlen * 8).
word_vec=Flatten()(word_embedding)

# Wrap input -> flattened embeddings in a Keras model.
embed_model=Model([word_input],word_vec)

In [14]:
# Compile the model; optimizer, loss and metric can be tuned as usual.
# `learning_rate` is the supported keyword -- the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
embed_model.compile(optimizer=Adam(learning_rate=1e-3),loss='binary_crossentropy',metrics=['acc'])

In [15]:
# Summary of the model. summary() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
embed_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 12, 8)             400       
_________________________________________________________________
flatten (Flatten)            (None, 96)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Run the padded documents through the model to look up one vector per word slot.
# NOTE(review): no fit() call appears in the cells visible here, so these are
# presumably the layer's initial (untrained) weights -- confirm that is intended.
embeddings=embed_model.predict(pad_corp) # finally getting the embeddings.

In [17]:
# Report the flattened output shape, then show the first document's row.
emb_shape = embeddings.shape
print("Shape of embeddings : ", emb_shape)
first_doc_row = embeddings[0]
print(first_doc_row)

Shape of embeddings :  (3, 96)
[-0.04831005  0.00818304 -0.01519667  0.00417691  0.0302623   0.03631613
 -0.037296   -0.03661499  0.02009379  0.04430156 -0.00048839 -0.01056299
 -0.02379319  0.03943712  0.00181495 -0.00461334  0.01345823  0.04482469
  0.01656128 -0.03393574  0.03245096 -0.0285756   0.01699558  0.04166646
  0.01345823  0.04482469  0.01656128 -0.03393574  0.03245096 -0.0285756
  0.01699558  0.04166646 -0.00875279 -0.01546627  0.04310519 -0.01682319
 -0.04134415  0.00662705 -0.04181617 -0.02410871 -0.00555418 -0.03757465
  0.02760709 -0.04612336  0.01748511  0.03318616  0.01062608  0.02237599
 -0.00038352 -0.00951105 -0.02464424 -0.0131023  -0.00691489 -0.00609851
  0.04988987 -0.04905727 -0.00038352 -0.00951105 -0.02464424 -0.0131023
 -0.00691489 -0.00609851  0.04988987 -0.04905727 -0.00038352 -0.00951105
 -0.02464424 -0.0131023  -0.00691489 -0.00609851  0.04988987 -0.04905727
 -0.00038352 -0.00951105 -0.02464424 -0.0131023  -0.00691489 -0.00609851
  0.04988987 -0.049057

In [18]:
# Undo the Flatten step: regroup each row into maxlen vectors of 8 values,
# i.e. one embedding vector per word position.
per_word_shape = (-1, maxlen, 8)
embeddings = embeddings.reshape(per_word_shape)
print("Shape of embeddings : ", embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 12, 8)
[[[-0.04831005  0.00818304 -0.01519667  0.00417691  0.0302623
    0.03631613 -0.037296   -0.03661499]
  [ 0.02009379  0.04430156 -0.00048839 -0.01056299 -0.02379319
    0.03943712  0.00181495 -0.00461334]
  [ 0.01345823  0.04482469  0.01656128 -0.03393574  0.03245096
   -0.0285756   0.01699558  0.04166646]
  [ 0.01345823  0.04482469  0.01656128 -0.03393574  0.03245096
   -0.0285756   0.01699558  0.04166646]
  [-0.00875279 -0.01546627  0.04310519 -0.01682319 -0.04134415
    0.00662705 -0.04181617 -0.02410871]
  [-0.00555418 -0.03757465  0.02760709 -0.04612336  0.01748511
    0.03318616  0.01062608  0.02237599]
  [-0.00038352 -0.00951105 -0.02464424 -0.0131023  -0.00691489
   -0.00609851  0.04988987 -0.04905727]
  [-0.00038352 -0.00951105 -0.02464424 -0.0131023  -0.00691489
   -0.00609851  0.04988987 -0.04905727]
  [-0.00038352 -0.00951105 -0.02464424 -0.0131023  -0.00691489
   -0.00609851  0.04988987 -0.04905727]
  [-0.00038352 -0.00951105 -0.02464424 -

In [19]:
# Walk every document and every word slot, printing that slot's embedding vector.
for doc_idx, doc_vectors in enumerate(embeddings):
    for word_idx, word_vector in enumerate(doc_vectors):
        print("The encoding for ", word_idx+1, "th word", "in", doc_idx+1, "th document is : \n\n", word_vector)

The encoding for  1 th word in 1 th document is : 

 [-0.04831005  0.00818304 -0.01519667  0.00417691  0.0302623   0.03631613
 -0.037296   -0.03661499]
The encoding for  2 th word in 1 th document is : 

 [ 0.02009379  0.04430156 -0.00048839 -0.01056299 -0.02379319  0.03943712
  0.00181495 -0.00461334]
The encoding for  3 th word in 1 th document is : 

 [ 0.01345823  0.04482469  0.01656128 -0.03393574  0.03245096 -0.0285756
  0.01699558  0.04166646]
The encoding for  4 th word in 1 th document is : 

 [ 0.01345823  0.04482469  0.01656128 -0.03393574  0.03245096 -0.0285756
  0.01699558  0.04166646]
The encoding for  5 th word in 1 th document is : 

 [-0.00875279 -0.01546627  0.04310519 -0.01682319 -0.04134415  0.00662705
 -0.04181617 -0.02410871]
The encoding for  6 th word in 1 th document is : 

 [-0.00555418 -0.03757465  0.02760709 -0.04612336  0.01748511  0.03318616
  0.01062608  0.02237599]
The encoding for  7 th word in 1 th document is : 

 [-0.00038352 -0.00951105 -0.02464424 