In [1]:
from collections import Counter, OrderedDict
import numpy as np
import string
import re
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, Embedding
from keras.layers import LSTM, CuDNNGRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def data(path='dataset.txt'):
    """Load the word/definition dataset and build the vocabulary.

    Every line of the file is expected to have the form
    ``word<TAB>definition one; definition two``. Each definition becomes one
    training sentence wrapped in ``<start>``/``<end>`` markers and is paired
    with the word it defines.

    Cleaning applied to each definition: bracketed asides such as
    ``(formal)`` or ``[archaic]`` are removed, punctuation is stripped, the
    text is lower-cased and then split on single spaces (empty tokens are
    discarded).

    Args:
        path: Location of the tab-separated dataset file. Defaults to
            ``'dataset.txt'`` in the current working directory.

    Returns:
        A tuple ``(sentences, targets, word2idx, idx2word)``:

        * ``sentences`` - list of token lists, each starting with
          ``'<start>'`` and ending with ``'<end>'``.
        * ``targets`` - list of defined words, aligned with ``sentences``.
        * ``word2idx`` - ``OrderedDict`` mapping token -> index. Order is
          ``'<start>'`` (0), ``'<end>'`` (1), definition tokens by
          descending frequency, then target words not seen in any
          definition, also by descending frequency.
        * ``idx2word`` - plain ``dict`` with the inverse mapping.
    """
    punc_table = str.maketrans('', '', string.punctuation)
    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences such as "\(" which newer Python versions warn about.
    bracketed = re.compile(r"[\(\[].*?[\)\]]")

    sentences = []
    targets = []
    with open(path, 'r') as dataset_file:
        for line in dataset_file:
            fields = line.strip('\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line without a definition column.
                continue
            headword = fields[0]
            for definition in fields[1].split('; '):
                cleaned = bracketed.sub("", definition).translate(punc_table).lower()
                # Dropping empty tokens makes any explicit trimming of
                # doubled/trailing spaces unnecessary, and it also means a
                # definition that is empty after cleaning no longer raises
                # IndexError; it simply yields ['<start>', '<end>'].
                tokens = [token for token in cleaned.split(' ') if token]
                sentences.append(['<start>'] + tokens + ['<end>'])
                targets.append(headword)

    # Insertion order of `vocab` defines the indices. The two markers are
    # pinned to the front; setdefault() keeps the first position of any token
    # that shows up again later (e.g. a target that is also a definition word).
    # NOTE(review): '<start>' therefore gets index 0, which is presumably also
    # the value pad_sequences pads with by default - confirm this is intended.
    vocab = OrderedDict.fromkeys(['<start>', '<end>'])
    all_tokens = (token for sentence in sentences for token in sentence)
    for token, _count in Counter(all_tokens).most_common():
        vocab.setdefault(token)
    for target, _count in Counter(targets).most_common():
        vocab.setdefault(target)

    word2idx = OrderedDict((token, index) for index, token in enumerate(vocab))
    idx2word = {index: token for token, index in word2idx.items()}

    return sentences, targets, word2idx, idx2word

In [3]:
def sent2idx(sentences, targets, word2idx):
    """Translate tokenised sentences and target words into vocabulary indices.

    Args:
        sentences: Iterable of token sequences.
        targets: Iterable of target words.
        word2idx: Mapping from token to integer index; every token and
            target must be a key (a missing one raises ``KeyError``).

    Returns:
        A tuple ``(encoded_sentences, encoded_targets)``: a list of index
        lists, one per sentence, and a flat list of target indices.
    """
    encoded_sentences = [
        [word2idx[token] for token in tokens]
        for tokens in sentences
    ]
    encoded_targets = [word2idx[target] for target in targets]
    return encoded_sentences, encoded_targets

In [4]:
# Training configuration.
SEQUENCE_LENGTH = 55  # NOTE(review): not referenced by any visible cell
BATCH_SIZE = 100
EPOCHS = 100

# Load the dataset, then replace tokens/targets by their vocabulary indices.
sentences, targets, word2idx, idx2word = data()
sentences, targets = sent2idx(sentences, targets, word2idx)
# Pin the one-hot width to the vocabulary size. Without num_classes the width
# is inferred as max(target index) + 1, which only matches the model's
# Dense(len(word2idx)) output when the very last vocabulary entry happens to
# be a target word.
targets = np_utils.to_categorical(targets, num_classes=len(word2idx))
# Number of full batches in one pass over the data.
batches = len(sentences) // BATCH_SIZE
print('Sentences:',len(sentences))
print('Vocab Size:',len(word2idx))
print('Batches:',batches)

Sentences: 3281
Vocab Size: 5817
Batches: 32


In [5]:
# Definition -> word classifier: embed each token index, run two stacked GRU
# layers (the first returns the full sequence so the second can consume it),
# and finish with a softmax over the whole vocabulary.
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=512),
    CuDNNGRU(512, return_sequences=True),
    Dropout(0.2),
    CuDNNGRU(512),
    Dropout(0.2),
    Dense(len(word2idx), activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 512)         2978304   
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, None, 512)         1575936   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 512)               1575936   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5817)              2984121   
Total params: 9,114,297
Trainable params: 9,114,297
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Store the architecture as JSON next to the weight checkpoints.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

# Write a checkpoint whenever the monitored training loss reaches a new
# minimum; the epoch number and loss value are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [7]:
# Disabled alternative batcher: the code below lives inside a string literal,
# so `generator2` is never actually defined. Training uses `generator` instead.
# It walks the data in order in fixed-size slices, with the last slice taking
# everything that remains, and stops after a single pass over the data.
'''
def generator2(inputs, targets, batch_size):
    
    batches = len(inputs) // batch_size
    for i in range(batches):
        if i == batches-1:
            batch_inputs = pad_sequences(inputs[i*batch_size:])
            batch_targets = targets[i*batch_size:]
        else:
            batch_inputs = pad_sequences(inputs[i*batch_size: (i+1)*batch_size])
            batch_targets = targets[i*batch_size: (i+1)*batch_size]
            
        yield batch_inputs, batch_targets
'''

In [11]:
def generator(inputs, targets, batch_size):
    """Yield random training batches forever.

    Each batch is an independent random draw (without replacement inside the
    batch) of up to ``batch_size`` examples. The selected sentences are passed
    to ``pad_sequences`` so the batch can be returned as one array.

    Args:
        inputs: Sequence of index lists; the lists may differ in length.
        targets: Array-like aligned with ``inputs`` (one row per sentence).
        batch_size: Maximum number of examples per batch.

    Yields:
        ``(batch_inputs, batch_targets)`` - the padded input array and the
        matching target rows.

    Raises:
        ValueError: On the first ``next()`` call if ``inputs`` is empty.
    """
    # Keep the sentences as a plain list. Converting ragged lists with
    # np.array() depends on implicit object-array creation, which recent
    # NumPy releases reject with a ValueError.
    inputs = list(inputs)
    # Fancy indexing below needs an ndarray; a plain list would fail.
    targets = np.asarray(targets)
    if not inputs:
        raise ValueError('generator() needs at least one input sequence')

    while True:
        perm = np.random.permutation(len(inputs))[:batch_size]
        batch_inputs = pad_sequences([inputs[i] for i in perm])
        batch_targets = targets[perm]

        yield batch_inputs, batch_targets

In [12]:
input_ = np.array(targets)  # NOTE(review): not used by any visible cell
# Train from the random-batch generator. `steps_per_epoch` is the Keras 2
# keyword for "batches drawn per epoch"; the previous `samples_per_epoch`
# is the legacy Keras 1 spelling, which Keras 2 only accepts through a
# deprecation shim.
model.fit_generator(
    generator=generator(sentences, targets, BATCH_SIZE),
    steps_per_epoch=batches,
    epochs=EPOCHS,
    callbacks=callbacks_list,
)

  


Epoch 1/100

Epoch 00001: loss improved from inf to 8.56523, saving model to weights-improvement-01-8.5652.hdf5
Epoch 2/100

Epoch 00002: loss improved from 8.56523 to 8.16107, saving model to weights-improvement-02-8.1611.hdf5
Epoch 3/100

Epoch 00003: loss improved from 8.16107 to 7.82354, saving model to weights-improvement-03-7.8235.hdf5
Epoch 4/100

Epoch 00004: loss improved from 7.82354 to 7.35065, saving model to weights-improvement-04-7.3506.hdf5
Epoch 5/100

Epoch 00005: loss improved from 7.35065 to 6.79990, saving model to weights-improvement-05-6.7999.hdf5
Epoch 6/100

Epoch 00006: loss improved from 6.79990 to 6.25660, saving model to weights-improvement-06-6.2566.hdf5
Epoch 7/100

Epoch 00007: loss improved from 6.25660 to 5.55539, saving model to weights-improvement-07-5.5554.hdf5
Epoch 8/100

Epoch 00008: loss improved from 5.55539 to 4.82983, saving model to weights-improvement-08-4.8298.hdf5
Epoch 9/100

Epoch 00009: loss improved from 4.82983 to 4.18283, saving mode


Epoch 00048: loss improved from 0.06127 to 0.05871, saving model to weights-improvement-48-0.0587.hdf5
Epoch 49/100

Epoch 00049: loss did not improve
Epoch 50/100

Epoch 00050: loss improved from 0.05871 to 0.05338, saving model to weights-improvement-50-0.0534.hdf5
Epoch 51/100

Epoch 00051: loss did not improve
Epoch 52/100

Epoch 00052: loss did not improve
Epoch 53/100

Epoch 00053: loss did not improve
Epoch 54/100

Epoch 00054: loss did not improve
Epoch 55/100

Epoch 00055: loss did not improve
Epoch 56/100

Epoch 00056: loss improved from 0.05338 to 0.05016, saving model to weights-improvement-56-0.0502.hdf5
Epoch 57/100

Epoch 00057: loss did not improve
Epoch 58/100

Epoch 00058: loss improved from 0.05016 to 0.04785, saving model to weights-improvement-58-0.0479.hdf5
Epoch 59/100

Epoch 00059: loss improved from 0.04785 to 0.04740, saving model to weights-improvement-59-0.0474.hdf5
Epoch 60/100

Epoch 00060: loss did not improve
Epoch 61/100

Epoch 00061: loss improved fro

<keras.callbacks.History at 0x7f223a8f1358>

In [13]:
# Persist the trained model to disk.
# NOTE(review): despite the file name, Keras `model.save` presumably stores the
# full model rather than just the weights - confirm against the Keras docs.
model.save('weights.h5')

In [48]:
# Reverse-dictionary query: feed a definition and read back the most likely word.
definition = 'examine closely at something'
# Normalise the query the same way data() normalises training definitions
# (strip punctuation, lower-case, split on spaces, drop empty tokens) so that
# e.g. "Examine closely!" maps onto the same vocabulary entries.
words = [
    word
    for word in definition.translate(str.maketrans('', '', string.punctuation)).lower().split(' ')
    if word
]
# Raises KeyError for any word that never appeared in the training data.
idxs = [word2idx[word] for word in words]

# Wrap the sentence in the start/end markers via the vocabulary instead of
# hard-coding their indices, and shape it as a batch of one.
idxs = np.array([[word2idx['<start>']] + idxs + [word2idx['<end>']]])
prediction = model.predict(idxs, verbose=0)
# prediction has one row per batch item; take the arg-max of that single row.
index = int(np.argmax(prediction[0]))
meaning = idx2word[index]

In [49]:
# Display the predicted word for the query definition.
# NOTE(review): the recorded output ('must') looks unrelated to the query,
# even though the training loss was very low - worth checking generalisation.
meaning

'must'