# Language Modeling with Keras

In [7]:
# Handles Importing all the necessary libraries
import itertools
import h5py
import numpy as np
import string
import keras
from keras.layers import Input, Embedding, merge, Flatten, Reshape, Lambda, LSTM, Dropout
from keras.layers.recurrent import SimpleRNN
import keras.backend as K
from keras.models import Model
import sklearn
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 15, 6

### Prepares the Data Set to be used

In [33]:
#imports a massive dataset of sentences found on wikipedia
with open('ignored_assets/lang-train-data-angelas-ashes.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line,
# also strips the punctuation
sentences = [x.strip().translate(None, string.punctuation) for x in content] 

lemma = lambda x: x.strip().lower().split(' ')
sentences_lemmatized = [lemma(sentence) for sentence in sentences]
words = set(itertools.chain(*sentences_lemmatized))
# set(['boy', 'fed', 'ate', 'cat', 'kicked', 'hat'])

# dictionaries for converting words to integers and vice versa
word2idx = dict((v, i) for i, v in enumerate(words))
idx2word = list(words)

# convert the sentences a numpy array
to_idx = lambda x: [word2idx[word] for word in x]
sentences_idx = [to_idx(sentence) for sentence in sentences_lemmatized]
# Sets the maximum word length of each sentence
max_len = 40
# a list of all the indices I remove that are longer than max_len
indices_removed = []
# If the sentence is too long, good by
for i in range(len(sentences_idx)-1, -1, -1):
    if len(sentences_idx[i]) > max_len:
        indices_removed = indices_removed + [i]
        sentences_idx.pop(i)
    elif len(sentences_idx[i]) < max_len:
        sentences_idx[i] = sentences_idx[i] + [0] * (max_len - len(sentences_idx[i]));

print "Indexes Removed: {}".format(len(indices_removed))
print "Number of Sentences: {}".format(len(sentences_idx))
print "Vocabulary: {}".format(len(word2idx))
sentences_array = np.asarray(sentences_idx, dtype='int32')

# Prepares the datasets as input and output
dataX = np.array(sentences_array[:len(sentences_array)-1]).copy()
dataY = np.array(sentences_array[1:]).copy()

# scales the output
scaler = MinMaxScaler(feature_range=(-1, 1))
dataY = scaler.fit_transform(dataY)
# reshapes the dataY
new_dataY = dataY.reshape((1,) + dataY.shape)


Indexes Removed: 323
Number of Sentences: 7441
Vocabulary: 6463


## Create and Train The Model

In [27]:
n_words = len(words)
n_embed_dims = 150

# put together a model to predict
input_sentence = Input(shape=(max_len,), dtype='int32')
input_embedding = Embedding(n_words, n_embed_dims)(input_sentence)
output = SimpleRNN(max_len)(input_embedding)

def custom_loss(y_true, y_pred):
    return K.sqrt(K.abs(y_true - y_pred))

model = Model(inputs=[input_sentence], outputs=[output])
model.compile(optimizer=keras.optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0), 
            loss='mae')
model.load_weights('model.h5')


# fit the model to predict what color each person is
model.fit(dataX, dataY, epochs=2, verbose=3)
embeddings = model.layers[1].get_weights()

# print out the embedding vector associated with each word
for i in range(2):
    print('{}: {}'.format(idx2word[i], embeddings[0][i]))
    
print "Evaluation of the Model"
print model.evaluate(dataX, dataY)
print "Saving the Weights"
model.save_weights('model.h5')

Epoch 1/2
Epoch 2/2
: [  4.00357693e-02  -1.41882775e-02  -1.69515628e-02   2.72160443e-03
  -5.19732479e-03  -1.00261415e-03  -1.50458759e-03  -8.53088452e-04
   3.68796661e-02  -1.40176332e-02   5.93608897e-03  -9.12672877e-02
   5.74352918e-04  -1.37870992e-03  -3.51639086e-04  -3.04035237e-03
  -2.64175492e-03   7.65870034e-04  -6.68601692e-03   7.05517596e-03
  -1.28488708e-02   7.99107831e-04  -3.54190677e-04   2.08475478e-02
  -8.44246184e-04  -6.08783308e-03   6.53023235e-05  -7.27649487e-04
  -1.68303653e-04   2.02140328e-03   5.04109485e-04  -3.82608129e-03
   3.13007794e-02  -5.59197040e-03  -2.94063194e-03   1.26788300e-03
   6.46054521e-02   5.48614655e-03  -6.92775520e-03  -4.59323497e-03
  -6.52802782e-03  -2.57798261e-03  -5.29891849e-02   2.96514574e-02
   4.69198544e-03  -1.22453077e-02  -8.46719649e-03  -1.40683842e-03
  -2.48018210e-03   4.01063962e-03  -3.40918475e-03  -1.44209759e-03
   2.53453050e-02  -4.23171977e-03   3.36540528e-02   8.69504921e-03
   1.4365701

#### Extra Training For The Model

In [30]:
model.fit(dataX, dataY, epochs=50, batch_size=16, verbose=2)
print "  {}".format(model.evaluate(dataX, dataY))
print "saving the weights"
model.save_weights('model.h5')

Epoch 1/50
22s - loss: 0.0907
Epoch 2/50
22s - loss: 0.0907
Epoch 3/50
22s - loss: 0.0906
Epoch 4/50
21s - loss: 0.0906
Epoch 5/50
21s - loss: 0.0905
Epoch 6/50
22s - loss: 0.0905
Epoch 7/50
22s - loss: 0.0904
Epoch 8/50
21s - loss: 0.0904
Epoch 9/50
21s - loss: 0.0903
Epoch 10/50
22s - loss: 0.0903
Epoch 11/50
21s - loss: 0.0903
Epoch 12/50
22s - loss: 0.0902
Epoch 13/50
21s - loss: 0.0901
Epoch 14/50
21s - loss: 0.0901
Epoch 15/50
22s - loss: 0.0900
Epoch 16/50
21s - loss: 0.0900
Epoch 17/50
22s - loss: 0.0900
Epoch 18/50
22s - loss: 0.0899
Epoch 19/50
22s - loss: 0.0899
Epoch 20/50
23s - loss: 0.0898
Epoch 21/50
23s - loss: 0.0898
Epoch 22/50
22s - loss: 0.0897
Epoch 23/50
22s - loss: 0.0897
Epoch 24/50
22s - loss: 0.0897
Epoch 25/50
22s - loss: 0.0896
Epoch 26/50
22s - loss: 0.0896
Epoch 27/50
22s - loss: 0.0895
Epoch 28/50
22s - loss: 0.0895
Epoch 29/50
22s - loss: 0.0894
Epoch 30/50
22s - loss: 0.0894
Epoch 31/50
22s - loss: 0.0893
Epoch 32/50
22s - loss: 0.0893
Epoch 33/50
22s -

## Further Training the Model

In [28]:
model.fit(dataX, dataY, epochs=4, batch_size=64, verbose=3)
embeddings = model.layers[1].get_weights()

# print out the embedding vector associated with each word
for i in range(2):
    print('{}: {}'.format(idx2word[i], embeddings[0][i]))

print 'Evaluation: {}'.format(model.evaluate(dataX, dataY))
model.save_weights('model.h5')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
: [ -7.52860755e-02  -4.54015285e-02  -4.58094329e-02   8.84827878e-03
  -8.48692900e-04  -3.17476653e-02  -1.13650709e-02   6.56244531e-02
   6.53840974e-03   6.65657148e-02  -2.31543127e-02  -3.21851298e-02
  -5.29217646e-02   1.71483925e-03   1.58685390e-02   1.47633646e-02
  -3.10055520e-02   2.67946278e-03  -4.41819243e-03   1.17825922e-02
   3.26833390e-02   6.82228524e-03  -2.40664147e-02   1.81059614e-02
   4.85693328e-02   4.03854847e-02   5.97351231e-02   2.43128533e-03
  -2.56353430e-02   1.97669603e-02  -8.07724521e-03   6.48374995e-03
  -6.76449202e-03   3.24921831e-02   8.01629853e-03   9.09802562e-04
  -1.93910580e-02  -2.17047911e-02  -4.88672033e-02  -4.92293611e-02
  -1.37611022e-02   4.78589945e-02   1.62587315e-02  -1.04369866e-02
  -3.82489036e-03  -7.70846556e-04   6.44029975e-02   6.63364481e-04
   3.35666910e-02  -5.23372591e-02   6.23806678e-02  -1.97675209e-02
  -1.34685580e-02  -2.47143134e-02  -3.12603428e-03   3.02405

### Attempt to test the model with some custom inputs

In [24]:
inp = [0] * (max_len - 3) + [word2idx['i'], word2idx['love'], word2idx['you']]
model.predict(np.array(inp).reshape(1, 100))

array([[-0.        ,  0.        , -0.        ,  0.        ,  0.00316837,
        -0.        ,  0.        ,  0.        , -0.        , -0.        ,
         0.08428954,  0.28899291,  0.        ,  0.        ,  0.        ,
        -0.31553316,  0.03101788,  0.        ,  0.        , -0.        ,
        -0.        ,  0.        ,  0.13992527, -0.11859532,  0.        ,
         0.        ,  0.        ,  0.01655224, -0.        ,  0.03247942,
        -0.        ,  0.01039564, -0.        ,  0.        ,  0.19620955,
         0.        ,  0.        ,  0.18380126,  0.12647456, -0.        ,
         0.        ,  0.29543972,  0.59864187,  0.29531297,  0.28910464,
        -0.        ,  0.65704238,  0.83109629,  0.65468234,  0.56219828,
        -0.        ,  0.46520701,  0.54404628,  1.        ,  0.70375729,
         0.84097189,  1.        ,  1.        ,  1.        , -0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        , -0.        ,  1. 

## Exporting the Model
This is how you export the model into a json file in order to be imported later. Then you export the model's weights. Later on in other experiments, you could effectively 

In [6]:

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
 

## Importing the Model
This is how you import the model from a json file and the weights so that you don't need to train it every time.

In [None]:

# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")