In [1]:
import numpy as np
from keras.layers import Dense,LSTM,Activation,Input,Embedding,Dropout,Bidirectional
from keras.models import Model
import keras
import tensorflow as tf
from tensorflow.keras import regularizers

! pip install keras_self_attention
from keras_self_attention import SeqSelfAttention

Using TensorFlow backend.


Collecting keras_self_attention
  Downloading https://files.pythonhosted.org/packages/44/3e/eb1a7c7545eede073ceda2f5d78442b6cad33b5b750d7f0742866907c34b/keras-self-attention-0.42.0.tar.gz
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.42.0-cp36-none-any.whl size=17296 sha256=04af2f02611d8af4b1bc969e23d7481a72489f4673a9560d823d16fac25add35
  Stored in directory: /root/.cache/pip/wheels/7b/05/a0/99c0cf60d383f0494e10eca2b238ea98faca9a1fe03cac2894
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.42.0


In [0]:
# Read the whole corpus, lowercase it and split it on whitespace into a token list.
# The context manager closes the file handle as soon as the text has been read.
with open('/content/drive/My Drive/Colab Notebooks/sherlock.txt','r',encoding="utf8") as corpus_file:
    data = corpus_file.read().lower().split()

In [6]:
# Strip every token down to its letters: a character is kept only when it is
# alphanumeric and not a digit. Tokens that become empty stay in the list.
for pos, token in enumerate(data):
    data[pos] = ''.join(ch for ch in token if ch.isalnum() and not ch.isdigit())
len(data)

107406

In [0]:
def read_glove_vecs(glove_file):
    """Load a whitespace-separated embedding file into lookup tables.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank lines are skipped instead of raising IndexError.
    If a word appears more than once, the last vector read wins.

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded embedding text file.

    Returns
    -------
    words_to_index : dict
        Word -> 1-based index, assigned in sorted word order.
    index_to_words : dict
        1-based index -> word (inverse of ``words_to_index``).
    word_to_vec_map : dict
        Word -> ``np.float64`` array with the vector components.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # blank line: nothing to parse
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The index tables only need the parsed words, so they are built after the
    # file has been closed. Indices start at 1, following sorted word order.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [0]:
# Load the GloVe lookup tables: word -> index, index -> word, word -> vector.
(
    word_to_index,
    index_to_word,
    word_to_vec_map,
) = read_glove_vecs('/content/drive/My Drive/Colab Notebooks/glove.6B.100d.txt')

In [9]:
# Keep only the tokens that have a GloVe embedding.
# One filtering pass is enough: nothing is removed from a list while it is being
# iterated (which skips the element after each removal), there is no quadratic
# list.remove, and no bare except that could hide unrelated errors.
data = [w for w in data if w in word_to_index]

len(data)

100319

In [10]:
# Distinct tokens of the corpus, as returned by np.unique.
unique_word = np.unique(data)
print('unique_words:-', len(unique_word))

unique_words:- 7654


In [0]:
# Vocabulary: token -> its GloVe vector divided by 3.7
dic = {token: word_to_vec_map[token] / 3.7 for token in unique_word}

In [0]:
# Two-way lookup between a token and its position in unique_word.
word_to_ix = {token: pos for pos, token in enumerate(unique_word)}
ix_to_word = dict(enumerate(unique_word))

In [13]:
words_length = 10
n_windows = len(data) - words_length
# prev_word[k] is the k-th run of words_length consecutive tokens;
# next_word[k] is the token that immediately follows that run.
prev_word = [data[start:start + words_length] for start in range(n_windows)]
next_word = [data[start + words_length] for start in range(n_windows)]
print('No of samples : ',len(prev_word))  # data size

No of samples :  100309


In [0]:
def sentence_to_indices(prev_word,next_word,word_to_index,word_length):
    """Convert token windows and their target tokens into index arrays.

    Parameters
    ----------
    prev_word : sequence of token sequences, one context window per sample.
    next_word : sequence of tokens, the target for each window.
    word_to_index : mapping from token to numeric index.
    word_length : number of columns allocated per window.

    Returns
    -------
    X : float array of shape (len(prev_word), word_length) holding the window
        indices; columns beyond a shorter window stay 0.
    Y : float array of shape (len(prev_word), 1) holding the target indices.
    """
    n_samples = len(prev_word)
    X = np.zeros((n_samples, word_length))
    Y = np.zeros((n_samples, 1))
    for row, context in enumerate(prev_word):
        for col, token in enumerate(context):
            X[row, col] = word_to_index[token]
        Y[row, 0] = word_to_index[next_word[row]]

    return X, Y

In [0]:
# Encode every window / target pair with the corpus-level vocabulary indices.
X_data, Y_data = sentence_to_indices(
    prev_word=prev_word,
    next_word=next_word,
    word_to_index=word_to_ix,
    word_length=words_length,
)

In [0]:
# One-hot encode the targets: row k gets a single True at column Y_data[k, 0].
# The target column is indexed explicitly instead of calling int() on a
# 1-element array (deprecated by NumPy), and a single fancy-index assignment
# replaces the per-row Python loop.
Y_onehot = np.zeros((len(Y_data), len(unique_word)), dtype=bool)
Y_onehot[np.arange(len(Y_data)), Y_data[:, 0].astype(int)] = True

In [0]:
# train test split: the first `num` samples train the model, the rest are held out
num = 90000
X_train, X_test = X_data[:num], X_data[num:]
Y_train, Y_test = Y_onehot[:num], Y_onehot[num:]

In [0]:
# Next-word classifier graph:
# embedding -> bidirectional LSTM -> self-attention -> LSTM -> dropout
# -> L2-regularised dense -> dense + softmax over the vocabulary.
vocab_size = unique_word.shape[0]

x_input = Input(shape=(words_length,))
embedded = Embedding(input_dim=vocab_size + 1, output_dim=100)(x_input)
encoded = Bidirectional(LSTM(128, return_sequences=True))(embedded)
attended = SeqSelfAttention(attention_activation='sigmoid')(encoded)
sequence_state = LSTM(128)(attended)
dropped = Dropout(0.2)(sequence_state)
hidden = Dense(
    int(vocab_size / 2),
    activation='relu',
    kernel_regularizer=regularizers.l2(0.01),
)(dropped)
logits = Dense(vocab_size)(hidden)
# `x` is the output tensor that the Model constructor cell reads.
x = Activation('softmax')(logits)

In [0]:
# Wrap the graph between the input tensor and the softmax output into a Model.
model = Model(inputs=x_input, outputs=x)

In [20]:
# Print the layer-by-layer table (layer type, output shape, parameter count).
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 100)           765500    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 256)           234496    
_________________________________________________________________
seq_self_attention_1 (SeqSel (None, 10, 256)           16449     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3827)              4936

In [21]:
# Copy the vocabulary vectors in `dic` into the embedding layer's weight matrix.
# Rows are addressed through word_to_ix, so each vector lands on the index that
# the model inputs use for that word; this does not depend on the iteration
# order of `dic` happening to match those indices.
for layer in model.layers:
    if 'embedding' in layer.name:
        weights = layer.get_weights()
        embedding_matrix = weights[0]
        for word, row in word_to_ix.items():
            embedding_matrix[row] = dic[word]
        weights[0] = embedding_matrix
        layer.set_weights(weights)
        print('done')

done


In [0]:
# Freeze every layer whose name contains 'embedding'.
embedding_layers = [layer for layer in model.layers if 'embedding' in layer.name]
for layer in embedding_layers:
    layer.trainable = False

In [23]:
# List each layer together with its trainable flag.
for layer in model.layers:
    print(f"{layer.name} {layer.trainable}")

input_1 False
embedding_1 False
bidirectional_1 True
seq_self_attention_1 True
lstm_2 True
dropout_1 True
dense_1 True
dense_2 True
activation_1 True


In [0]:
# The loss and the optimizer both come from the same `keras` package that
# builds the model, instead of mixing a tf.keras loss into a keras model.
los = keras.losses.CategoricalCrossentropy()
# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=los, optimizer=opt, metrics=['accuracy'])

In [29]:
# Train on the training split; the cell displays the returned History object.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=1024,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f91ffde5e48>

In [0]:
# Save the model to Google Drive as an HDF5 (.h5) file.
model.save(filepath='/content/drive/My Drive/Colab Notebooks/next_word.h5')

In [30]:
# Seed window for text generation. The row is copied because indexing X_test
# returns a view: any later in-place edit of `x` would otherwise overwrite the
# test data itself.
x = X_test[900].copy()
t=np.zeros((1,41))  # not read by any visible cell; kept in case other code uses it
for w in x:
    # x stores the indices as floats; cast to int for the dictionary lookup
    print(ix_to_word[int(w)]," ")

is  
something  
distinctly  
novel  
about  
some  
of  
the  
features  
if  


In [33]:
#predicting next 30 words
for _ in range(30):
    # Feed the current window as a batch of one; -1 lets the window length be
    # whatever `x` holds instead of a hard-coded 10.
    probs = model.predict(np.reshape(x, (1, -1)))
    y = int(np.argmax(probs))
    print(ix_to_word[y]," ")
    # Slide the window: drop the oldest index and append the prediction.
    # np.append builds a new array, so the array `x` was taken from is never
    # modified, and no inner loop reuses the outer loop variable.
    x = np.append(x[1:], y)

lord  
st  
simon  
i  
have  
given  
you  
a  
better  
good  
i  
understand  
that  
i  
have  
been  
able  
to  
tell  
you  
i  
knew  
that  
i  
had  
done  
to  
be  
able  
to  
