In [56]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.preprocessing import text,sequence
from keras.utils import np_utils,pad_sequences

In [57]:
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""

dl_data=data.split()

In [58]:
#Buliding Vocabulary
tokenizer=text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id=tokenizer.word_index

word2id['PAD']=0
id2word={v:k for k,v in word2id.items() }
wids=[[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(word2id)
embed_size=100
window_size=2
print("Vocabulary size:", vocab_size)
print("Some words and unique identifire:", list(word2id.items())[:10])

Vocabulary size: 75
Some words and unique identifire: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [59]:
#Creating context word pairs

def generating_context_word_pairs(corpus,window_size,vocab_size):
  context_length=window_size*2
  for words in corpus:
    sentence_length=len(words)
    for index,word in enumerate(words):
      context_words=[]
      label_word=[]
      start=index-window_size
      end=index+window_size+1
      context_words.append([words[i] for i in range(start,end) if 0 <= i< sentence_length and i!=index])
      label_word.append(word)

      x=pad_sequences(context_words,maxlen=context_length)
      y=np_utils.to_categorical(label_word,vocab_size)
      yield(x,y)
      

In [60]:
#model buliding
import keras.backend as K
model=keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size,output_dim=embed_size,input_length=window_size*2),
    keras.layers.Lambda(lambda x:K.mean(x,axis=1),output_shape=embed_size),
    keras.layers.Dense(vocab_size,activation='softmax')
])
model.compile(loss='categorical_crossentropy',optimizer='rmsprop')
print(model.summary())

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 4, 100)            7500      
                                                                 
 lambda_10 (Lambda)          (None, 100)               0         
                                                                 
 dense_10 (Dense)            (None, 75)                7575      
                                                                 
Total params: 15,075
Trainable params: 15,075
Non-trainable params: 0
_________________________________________________________________
None


In [61]:
#Training the model
for epoch in range(1,6):
  loss=0
  i=0
  for x,y in generating_context_word_pairs(corpus=wids,window_size=window_size,vocab_size=vocab_size):
    loss+=model.train_on_batch(x,y)
    i+=1

  print("Epoch: ",epoch,"\tLoss:",loss)


Epoch:  1 	Loss: 433.69831895828247
Epoch:  2 	Loss: 428.9176354408264
Epoch:  3 	Loss: 425.36208391189575
Epoch:  4 	Loss: 422.0327968597412
Epoch:  5 	Loss: 419.6004343032837


In [64]:
weights=model.get_weights()[0]
weights.shape

(75, 100)

In [67]:
pd.DataFrame(weights, index=list(id2word.values())).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
learning,-0.00802,0.04538,-0.016031,0.03139,-0.029912,-0.046721,0.042185,0.035678,-0.04842,-0.042112,...,-0.071132,-0.022237,0.06252,-0.031422,-0.042875,0.051797,-0.026774,-0.020674,0.011435,-0.060332
deep,-0.013481,-0.027462,0.026784,0.064938,-0.026745,-0.005874,-0.030811,0.063556,-0.013238,-0.046902,...,-0.013907,-0.038321,-0.027823,0.028284,-0.013567,0.033972,-0.06028,0.013138,-0.014835,-0.055082
networks,-0.026907,0.021181,-0.039048,0.028921,0.022194,-0.025694,0.002333,-0.028814,-0.060204,-0.04408,...,0.020891,-0.022191,0.065067,0.03407,-0.018249,0.055904,0.038199,-0.029505,0.018243,0.025018
neural,0.044706,0.045275,0.004538,-0.024471,-0.047302,0.018649,-0.027069,0.029778,0.028577,0.049445,...,-0.012275,0.015505,-0.003788,-0.010143,-0.045596,-0.032164,-0.022637,0.00535,0.007236,-0.016424
and,-0.03961,0.016444,0.024884,0.046667,-0.034156,-0.03669,-0.049969,0.019135,-0.004311,0.01854,...,-0.020094,-0.023815,0.009976,0.043448,-0.02372,-0.016987,0.030974,-0.032734,-0.03632,-0.017206


In [69]:
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix=euclidean_distances(weights)
print(distance_matrix.shape)

(75, 75)


In [71]:
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
for search_term in ['deep','learning']}

similar_words

{'deep': ['inspection', 'to', 'part', 'some', 'representation'],
 'learning': ['deep', 'networks', 'supervised', 'game', 'in']}