# Language Modeling with Keras

In [17]:
# Handles Importing all the necessary libraries
import itertools
import os
import string

import h5py
import keras
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from keras.layers import Input, Embedding, merge, Flatten, Reshape, Lambda, LSTM, Dropout
from keras.layers.recurrent import SimpleRNN
from keras.models import Model, model_from_json
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MinMaxScaler
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) applied to every plot in this notebook.
rcParams['figure.figsize'] = 15, 6

### Prepares the Data Set to be used

In [5]:
# Load the raw training text, one sentence per line (the file name suggests
# the text of "Angela's Ashes").
with open('ignored_assets/lang-train-data-angelas-ashes.txt') as f:
    content = f.readlines()

# Strip surrounding whitespace (including the trailing `\n`) and drop every
# ASCII punctuation character. A character filter is used instead of the
# Python 2-only `str.translate(None, deletechars)` so the cell runs unchanged
# on both Python 2 and Python 3.
punctuation_chars = frozenset(string.punctuation)
sentences = [''.join(ch for ch in x.strip() if ch not in punctuation_chars)
             for x in content]

# Tokenise: lower-case each sentence and split it on single spaces.
lemma = lambda x: x.strip().lower().split(' ')
sentences_lemmatized = [lemma(sentence) for sentence in sentences]
words = set(itertools.chain.from_iterable(sentences_lemmatized))
# set(['boy', 'fed', 'ate', 'cat', 'kicked', 'hat'])

# Index 0 is also used as the left-padding value further down, so it must not
# belong to a real word. The empty token '' (produced by blank lines and by
# doubled spaces) is pinned to index 0; every other word keeps the set's
# iteration order.
# NOTE: set iteration order is not guaranteed to be stable across interpreter
# sessions, so saved weights are only meaningful together with this mapping.
PAD_TOKEN = ''
PAD_IDX = 0
words.add(PAD_TOKEN)

# Lookup tables for converting words to integers and vice versa.
idx2word = [PAD_TOKEN] + [word for word in words if word != PAD_TOKEN]
word2idx = dict((word, idx) for idx, word in enumerate(idx2word))

# Convert every tokenised sentence into a list of word indices.
to_idx = lambda x: [word2idx[word] for word in x]
sentences_idx = [to_idx(sentence) for sentence in sentences_lemmatized]

# Maximum number of tokens per sentence.
max_len = 40
# Positions (highest first) of the sentences dropped for exceeding max_len.
indices_removed = [i for i in range(len(sentences_idx) - 1, -1, -1)
                   if len(sentences_idx[i]) > max_len]
# Keep the remaining sentences and left-pad them to exactly max_len entries.
# Built in a single pass instead of repeated `pop(i)` / list concatenation.
sentences_idx = [[PAD_IDX] * (max_len - len(sentence)) + sentence
                 for sentence in sentences_idx if len(sentence) <= max_len]
print("Indexes Removed: {}".format(len(indices_removed)))
print("Number of Sentences: {}".format(len(sentences_idx)))
print("Vocabulary: {}".format(len(word2idx)))
sentences_array = np.asarray(sentences_idx, dtype='int32')

# Each sentence is the input and the sentence that follows it is the target.
dataX = sentences_array[:-1].copy()
dataY = sentences_array[1:].copy()

# Scale the target indices into [0, 1] (per column, as MinMaxScaler does).
scaler = MinMaxScaler(feature_range=(0, 1))
dataY = scaler.fit_transform(dataY)
# Copy of the targets with an extra leading axis of size 1.
new_dataY = dataY.reshape((1,) + dataY.shape)


Indexes Removed: 323
Number of Sentences: 7441
Vocabulary: 6463




## Create and Train The Model

In [18]:
n_words = len(words)
n_embed_dims = 150
weights_path = 'model.h5'

# Embedding -> SimpleRNN network: takes a sentence of max_len word indices and
# emits a vector of max_len values.
input_sentence = Input(shape=(max_len,), dtype='int32')
input_embedding = Embedding(n_words, n_embed_dims)(input_sentence)
output = SimpleRNN(max_len)(input_embedding)

def custom_loss(y_true, y_pred):
    """Return the element-wise square root of the absolute error.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of model predictions.

    Returns:
        Tensor holding sqrt(|y_true - y_pred|) for every element.

    NOTE(review): the gradient of sqrt(|d|) is unbounded as d -> 0; consider
    adding K.epsilon() inside the square root if training ever yields NaNs.
    """
    return K.sqrt(K.abs(y_true - y_pred))

model = Model(inputs=[input_sentence], outputs=[output])
model.compile(optimizer=keras.optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0), loss=custom_loss)
# Resume from an earlier checkpoint when one exists. On a fresh checkout there
# is no weights file yet, so training simply starts from the initial weights
# instead of crashing.
if os.path.exists(weights_path):
    model.load_weights(weights_path)


# Fit the model to predict the (scaled) following sentence from each sentence.
model.fit(dataX, dataY, epochs=2, verbose=3)
embeddings = model.layers[1].get_weights()

# Print the embedding vector of the first two vocabulary entries.
for i in range(2):
    print('{}: {}'.format(idx2word[i], embeddings[0][i]))

print("Evaluation of the Model")
print(model.evaluate(dataX, dataY))
print("Saving the Weights")
model.save_weights(weights_path)

Epoch 1/2
Epoch 2/2
: [ 0.01451545  0.02909766 -0.01125624 -0.00711956  0.0109189  -0.00510339
 -0.01580079  0.00141494  0.01698543  0.00884955  0.00315764 -0.0104336
  0.01034257  0.00877095 -0.01010862 -0.02148722 -0.00934169 -0.00330972
 -0.00472692  0.00025703  0.0112626   0.01483593 -0.00788761  0.02443505
 -0.01868114 -0.01152818 -0.03485267  0.03709104 -0.00661962  0.01231158
  0.00197059  0.01476801 -0.01198881  0.00303477  0.00891626 -0.00378854
 -0.00682382  0.03451822  0.01305496 -0.01244588  0.00304619 -0.03807345
 -0.00813417  0.02238745 -0.00720909  0.02882003 -0.01264686  0.01116628
  0.00513404 -0.02124971 -0.00999493  0.01704622  0.00024498 -0.0195619
 -0.00863702 -0.03711805 -0.00237693  0.0025817  -0.02479479 -0.0092837
  0.02818282  0.0244569  -0.00997254  0.00492533 -0.00369392  0.02984793
 -0.01286931 -0.01907092 -0.03457579  0.00827978  0.01107627  0.00301307
 -0.00487189 -0.01417233 -0.00283344 -0.00413371  0.0072307  -0.00080106
  0.01267176 -0.00801167  0.0185

#### Extra Training For The Model

In [19]:
# Train for 30 more epochs with mini-batches of 16, report the loss measured
# on the same data, then checkpoint the weights.
model.fit(dataX, dataY, batch_size=16, epochs=30, verbose=2)
training_loss = model.evaluate(dataX, dataY)
print("  {}".format(training_loss))
model.save_weights('model.h5')


Epoch 1/30
19s - loss: 0.2247
Epoch 2/30
19s - loss: 0.2210
Epoch 3/30
19s - loss: 0.2180
Epoch 4/30
19s - loss: 0.2159
Epoch 5/30
19s - loss: 0.2141
Epoch 6/30
19s - loss: 0.2126
Epoch 7/30
19s - loss: 0.2113
Epoch 8/30
19s - loss: 0.2102
Epoch 9/30
19s - loss: 0.2092
Epoch 10/30
19s - loss: 0.2083
Epoch 11/30
19s - loss: 0.2076
Epoch 12/30
19s - loss: 0.2069
Epoch 13/30
19s - loss: 0.2062
Epoch 14/30
19s - loss: 0.2056
Epoch 15/30
19s - loss: 0.2051
Epoch 16/30
19s - loss: 0.2045
Epoch 17/30
19s - loss: 0.2041
Epoch 18/30
19s - loss: 0.2036
Epoch 19/30
19s - loss: 0.2032
Epoch 20/30
19s - loss: 0.2028
Epoch 21/30
19s - loss: 0.2023
Epoch 22/30
19s - loss: 0.2021
Epoch 23/30
19s - loss: 0.2019
Epoch 24/30
19s - loss: 0.2013
Epoch 25/30
19s - loss: 0.2011
Epoch 26/30
19s - loss: 0.2008
Epoch 27/30
19s - loss: 0.2004
Epoch 28/30
19s - loss: 0.2002
Epoch 29/30
19s - loss: 0.2000
Epoch 30/30
19s - loss: 0.1997


## Further Training the Model

In [28]:
# Four additional epochs with a larger batch size.
model.fit(dataX, dataY, batch_size=64, epochs=4, verbose=3)
embeddings = model.layers[1].get_weights()

# Show the embedding vector of the first two vocabulary entries.
embedding_matrix = embeddings[0]
for row in (0, 1):
    print('{}: {}'.format(idx2word[row], embedding_matrix[row]))

# Report the loss on the training data and checkpoint the weights.
evaluation = model.evaluate(dataX, dataY)
print('Evaluation: {}'.format(evaluation))
model.save_weights('model.h5')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
: [ -7.52860755e-02  -4.54015285e-02  -4.58094329e-02   8.84827878e-03
  -8.48692900e-04  -3.17476653e-02  -1.13650709e-02   6.56244531e-02
   6.53840974e-03   6.65657148e-02  -2.31543127e-02  -3.21851298e-02
  -5.29217646e-02   1.71483925e-03   1.58685390e-02   1.47633646e-02
  -3.10055520e-02   2.67946278e-03  -4.41819243e-03   1.17825922e-02
   3.26833390e-02   6.82228524e-03  -2.40664147e-02   1.81059614e-02
   4.85693328e-02   4.03854847e-02   5.97351231e-02   2.43128533e-03
  -2.56353430e-02   1.97669603e-02  -8.07724521e-03   6.48374995e-03
  -6.76449202e-03   3.24921831e-02   8.01629853e-03   9.09802562e-04
  -1.93910580e-02  -2.17047911e-02  -4.88672033e-02  -4.92293611e-02
  -1.37611022e-02   4.78589945e-02   1.62587315e-02  -1.04369866e-02
  -3.82489036e-03  -7.70846556e-04   6.44029975e-02   6.63364481e-04
   3.35666910e-02  -5.23372591e-02   6.23806678e-02  -1.97675209e-02
  -1.34685580e-02  -2.47143134e-02  -3.12603428e-03   3.02405

### Attempt to test the model with some custom inputs

In [24]:
# Build a single left-padded input sentence from a short prompt.
prompt_words = ['i', 'love', 'you']
inp = [0] * (max_len - len(prompt_words)) + [word2idx[word] for word in prompt_words]
# The model input has shape (batch, max_len). Reshape with max_len rather than
# a literal so the array always matches the configured sentence length.
model.predict(np.array(inp).reshape(1, max_len))

array([[-0.        ,  0.        , -0.        ,  0.        ,  0.00316837,
        -0.        ,  0.        ,  0.        , -0.        , -0.        ,
         0.08428954,  0.28899291,  0.        ,  0.        ,  0.        ,
        -0.31553316,  0.03101788,  0.        ,  0.        , -0.        ,
        -0.        ,  0.        ,  0.13992527, -0.11859532,  0.        ,
         0.        ,  0.        ,  0.01655224, -0.        ,  0.03247942,
        -0.        ,  0.01039564, -0.        ,  0.        ,  0.19620955,
         0.        ,  0.        ,  0.18380126,  0.12647456, -0.        ,
         0.        ,  0.29543972,  0.59864187,  0.29531297,  0.28910464,
        -0.        ,  0.65704238,  0.83109629,  0.65468234,  0.56219828,
        -0.        ,  0.46520701,  0.54404628,  1.        ,  0.70375729,
         0.84097189,  1.        ,  1.        ,  1.        , -0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        , -0.        ,  1. 

## Exporting the Model
This is how you export the model architecture to a JSON file so that it can be imported later; the model's weights are then exported separately to an HDF5 file. Later on, in other experiments, you can effectively reload both instead of retraining the model.

In [26]:

# Write the model architecture to disk as a JSON document.
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)

# Write the trained weights to disk in HDF5 format.
model.save_weights("model.h5")
 

## Importing the Model
This is how you import the model from a json file and the weights so that you don't need to train it every time.

In [None]:

# Read the JSON architecture description and rebuild the model from it.
# The `with` block closes the file even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")