# Language Modeling with Keras

In [1]:
import itertools
import os
import string

import h5py
import numpy as np
import keras.backend as K
from keras.layers import Input, Embedding, merge, Flatten, Reshape, Lambda, LSTM
from keras.models import Model, model_from_json

Using TensorFlow backend.


In [2]:

# Load the training corpus: one sentence (taken from Wikipedia) per line.
with open('training_data.txt') as f:
    content = f.readlines()

# Strip surrounding whitespace (including the trailing `\n`) and remove ASCII
# punctuation. A character filter is used instead of
# `str.translate(None, string.punctuation)` because that call signature only
# exists for Python 2 byte strings.
punctuation = set(string.punctuation)
sentences = [''.join(ch for ch in x.strip() if ch not in punctuation) for x in content]

# Lower-case and tokenise on whitespace. `split()` without an argument drops the
# empty tokens that `split(' ')` yields for repeated spaces and blank lines, so
# the empty string never ends up in the vocabulary as a "word".
lemma = lambda x: x.strip().lower().split()
sentences_lemmatized = [lemma(sentence) for sentence in sentences]

# Vocabulary. The empty string is the padding token: it is always part of
# `words` and always owns index 0, so padding can never be confused with a real
# word. The remaining words are sorted so that the word <-> index mapping is
# identical on every run; set iteration order is not stable across kernels,
# which would make previously saved embedding weights meaningless.
pad_token = ''
words = set(itertools.chain.from_iterable(sentences_lemmatized))
words.add(pad_token)
# e.g. set(['', 'boy', 'fed', 'ate', 'cat', 'kicked', 'hat'])

# Lookup tables for converting words to integers and vice versa.
idx2word = [pad_token] + sorted(words - {pad_token})
word2idx = dict((word, i) for i, word in enumerate(idx2word))

# Convert every sentence to a list of word indices.
to_idx = lambda x: [word2idx[word] for word in x]
sentences_idx = [to_idx(sentence) for sentence in sentences_lemmatized]

# Maximum number of words per sentence.
max_len = 100
pad_idx = word2idx[pad_token]

# Sentences longer than max_len are dropped (and reported); shorter ones are
# left-padded with the padding index so every row has exactly max_len entries.
too_long = [i for i, sentence in enumerate(sentences_idx) if len(sentence) > max_len]
if too_long:
    print('dropped {} sentence(s) longer than {} words, at positions {}'.format(
        len(too_long), max_len, too_long))
sentences_idx = [
    [pad_idx] * (max_len - len(sentence)) + sentence
    for sentence in sentences_idx
    if len(sentence) <= max_len
]

# reshape keeps the array two-dimensional even when no sentence survives.
sentences_array = np.asarray(sentences_idx, dtype='int32').reshape(-1, max_len)

1748


In [3]:
# Number of sentences kept after length filtering, then the vocabulary size.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(sentences_idx))
print(len(word2idx))

3148
3727


## Create and Train The Model

In [25]:
n_words = len(words)
n_embed_dims = 150
weights_path = 'model.h5'

# Network: word indices -> embedding vectors -> final LSTM output.
input_sentence = Input(shape=(max_len,), dtype='int32')
input_embedding = Embedding(n_words, n_embed_dims)(input_sentence)
# The training targets below are rows of sentences_array, i.e. max_len values
# per sample, so the LSTM width has to equal max_len (previously a literal 100).
output = LSTM(max_len)(input_embedding)

model = Model(inputs=[input_sentence], outputs=[output])
# NOTE(review): the targets given to fit() are raw word indices of the following
# sentence, while the loss is binary cross-entropy on the LSTM output. That
# pairing does not look like a standard language-model objective (per-token
# softmax with sparse categorical cross-entropy would be) - confirm it is intended.
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# Warm-start from an earlier run when saved weights are available. The file is
# only written by the save cells further down, so on a fresh run it does not
# exist yet and training simply starts from the random initialisation.
if os.path.exists(weights_path):
    model.load_weights(weights_path)

# Train on consecutive sentence pairs: sentence k is the input and the padded
# index vector of sentence k+1 is the target.
# NOTE(review): verbose=3 is kept as found; the recorded output shows only the
# "Epoch i/n" headers and no loss. The documented levels are 0, 1 and 2.
model.fit([sentences_array[:-1]], [sentences_array[1:]], epochs=10, verbose=3)

# layers[0] is the input layer, layers[1] the Embedding layer; its first (and
# only) weight array holds one row per vocabulary index.
embeddings = model.layers[1].get_weights()

# Print the embedding vectors of the first two vocabulary entries.
for i in range(2):
    print('{}: {}'.format(idx2word[i], embeddings[0][i]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
: [ 0.22768036 -0.30545023  0.15748617  0.12116787  0.25562724 -0.14322321
  0.46510711 -0.82245916 -0.00748514 -0.1725014   0.19422317 -0.3540535
  0.04533504 -0.1709715   0.23732799 -0.02119593 -0.01821132 -0.15042126
 -0.20996323 -0.09170409 -0.09941154  0.1342641   0.080317    0.27862796
 -0.22305863 -0.27003989 -0.09139743  0.05239067 -0.09230861  0.39692953
 -0.31664157  0.0885366  -0.00908646 -0.08516192  0.02227044 -0.40142402
 -0.09521891 -0.26441506  0.06257357  0.13762209  0.06840464  0.24984865
  0.23637372 -0.00488208  0.17689338  0.06371383 -0.04009209 -0.26894838
 -0.05601483 -0.23654029 -0.23629937  0.18170148 -0.77354193 -0.6420067
 -0.04488938 -0.17026472 -0.20397688  0.2029203   0.14102919 -0.06707714
 -0.51093739 -0.01170623 -0.07719655  0.01513869 -0.14947674 -0.06247335
 -0.13333124  0.15981765  0.10708557 -0.21898629 -0.15272275 -0.03815776
 -0.29000893 

In [8]:
# Checkpoint the current weights so a later run can resume from them.
model.save_weights(filepath='model.h5')

In [11]:
# Continue training the existing model: 40 more epochs in mini-batches of 10,
# again pairing every sentence with the one that follows it.
inputs = sentences_array[:-1]
targets = sentences_array[1:]
model.fit([inputs], [targets], epochs=40, batch_size=10, verbose=3)

# Re-read the embedding matrix after the additional training.
embeddings = model.layers[1].get_weights()

# Show the embedding vectors of the first two vocabulary entries.
for word_idx in (0, 1):
    print('{}: {}'.format(idx2word[word_idx], embeddings[0][word_idx]))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
: [ 0.11421335 -0.34195298  0.14875346  0.05392836  0.12991706 -0.08618486
  0.3999809  -0.76869971  0.08086126 -0.1251906   0.19270097 -0.33295441
  0.01580019 -0.107577    0.22398964  0.01513089  0.01916151 -0.10624117
 -0.21605955 -0.08518127 -0.09194651  0.07388883  0.09137119  0.34587136
 -0.14142758 -0.31441748 -0.11817735  0.06454242 -0.01044466  0.35414076
 -0.29594794  0.06419433  0.0310028  -0.05063775 -0.04320068 -0.39009914
 -0.03575946 -0.3181591   0.00331741  0.05502221 -0.01571499  0.28125241
  0.2991271   0.

In [13]:
# Marker that the preceding training cell has finished.
# Function-call form so the cell also parses on Python 3.
print('done')

done


In [24]:
# Encode a short prompt the same way as the training rows: look up each word's
# index and left-pad with 0 up to max_len. The pad width and the reshape are
# derived from max_len and the prompt length instead of the literals 97 / 100,
# so the cell keeps working when either of them changes.
prompt = ['i', 'love', 'you']
inp = [0] * (max_len - len(prompt)) + [word2idx[word] for word in prompt]
model.predict(np.array(inp).reshape(1, max_len))

array([[-0.        ,  0.        , -0.        ,  0.        ,  0.00316837,
        -0.        ,  0.        ,  0.        , -0.        , -0.        ,
         0.08428954,  0.28899291,  0.        ,  0.        ,  0.        ,
        -0.31553316,  0.03101788,  0.        ,  0.        , -0.        ,
        -0.        ,  0.        ,  0.13992527, -0.11859532,  0.        ,
         0.        ,  0.        ,  0.01655224, -0.        ,  0.03247942,
        -0.        ,  0.01039564, -0.        ,  0.        ,  0.19620955,
         0.        ,  0.        ,  0.18380126,  0.12647456, -0.        ,
         0.        ,  0.29543972,  0.59864187,  0.29531297,  0.28910464,
        -0.        ,  0.65704238,  0.83109629,  0.65468234,  0.56219828,
        -0.        ,  0.46520701,  0.54404628,  1.        ,  0.70375729,
         0.84097189,  1.        ,  1.        ,  1.        , -0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        , -0.        ,  1. 

## Exporting the Model
This cell exports the model architecture to a JSON file and the trained weights to an HDF5 file. Later experiments can then reload the model from these two files instead of training it again.

In [26]:

# Architecture: dump the model description as a JSON string into model.json.
model_json = model.to_json()
with open("model.json", mode="w") as json_file:
    json_file.write(model_json)

# Weights: stored separately in HDF5 format.
model.save_weights(filepath="model.h5")
 

## Importing the Model
This is how you import the model from a json file and the weights so that you don't need to train it every time.

In [None]:

# Rebuild the architecture from its JSON description. The context manager
# closes the file even if reading fails. `model_from_json` comes from
# keras.models and must be imported in the notebook's import cell.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Restore the trained weights into the freshly built model.
# NOTE(review): the loaded model is not compiled here; call
# loaded_model.compile(...) before training or evaluating it.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")