# Language Modeling with Keras

In [133]:
# Handles Importing all the necessary libraries
import itertools
import h5py
import numpy as np
import string
import keras
from keras.layers import Input, Embedding, merge, Flatten, Reshape, Lambda, LSTM, Dropout, Dense
from keras.layers.recurrent import SimpleRNN
import keras.backend as K
from keras.models import Model
import sklearn
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 15, 6

### Prepares the Data Set to be used

In [131]:
convos = []
# imports datasets
with open('ignored_assets/lang-train-data-simple.txt') as f:
    content = f.readlines()

# takes an array of sentences and splits it into groupings of sentences
def split_into_convo(arr, elem):
    array = arr[:]
    indiv_convos = []
    for i in range(len(arr)-1, -1, -1):
        if(array[i][0] == elem):
            indiv_convos += [array[i+1:]]
            array = array[:i]
    indiv_convos += [array]
    return indiv_convos

# you may also want to remove whitespace characters like `\n` at the end of each line,
# also strips the punctuation
sentences = [x.strip().translate(None, string.punctuation) for x in content] 

lemma = lambda x: x.strip().lower().split(' ')
sentences_lemmatized = [lemma(sentence) for sentence in sentences]
words = set(itertools.chain(*sentences_lemmatized))
# set(['boy', 'fed', 'ate', 'cat', 'kicked', 'hat'])

# dictionaries for converting words to integers and vice versa
word2idx = dict((v, i) for i, v in enumerate(words))
idx2word = list(words)

# convert the sentences a numpy array
to_idx = lambda x: [word2idx[word] for word in x]
sentences_idx = [to_idx(sentence) for sentence in sentences_lemmatized]
# Sets the maximum word length of each sentence
max_len = 9
# a list of all the indices I remove that are longer than max_len
indices_removed = []
# If the sentence is too long, good by
for i in range(len(sentences_idx)-1, -1, -1):
    if len(sentences_idx[i]) > max_len:
        indices_removed = indices_removed + [i]
        sentences_idx.pop(i)
    elif len(sentences_idx[i]) < max_len:
        sentences_idx[i] = sentences_idx[i] + [0] * (max_len - len(sentences_idx[i]));

print "Indexes Removed: {}".format(len(indices_removed))
print "Number of Sentences: {}".format(len(sentences_idx))
print "Vocabulary: {}".format(len(word2idx))
sentences_array = sentences_idx#, dtype='int32'

# splits the array into an array of grouped sentences (conversations)
conversation_array = split_into_convo(sentences_array, word2idx[''])

# Prepares the datasets as input and output
dataX = []
dataY = []
for i in range(len(conversation_array)):
    dataX += conversation_array[i][:len(conversation_array[i])-1]
    dataY += conversation_array[i][1:]
dataX = np.array(dataX).copy().astype('int32')
dataY = np.array(dataY).copy().astype('int32')

# scales the output
scaler = MinMaxScaler(feature_range=(-6, 6))
dataY = scaler.fit_transform(dataY)
# reshapes the dataY
new_dataY = dataY.reshape((1,) + dataY.shape)


Indexes Removed: 0
Number of Sentences: 121
Vocabulary: 142


## Create and Train The Model

In [140]:
n_words = len(words)
n_embed_dims = 5

# put together a model to predict
input_sentence = Input(shape=(max_len,), dtype='int32')
input_embedding = Embedding(n_words, n_embed_dims)(input_sentence)
rnn_layer = SimpleRNN(2*max_len)(input_embedding)
output = Dense(max_len)(rnn_layer)

def custom_loss(y_true, y_pred):
    return K.sqrt(K.abs(y_true - y_pred))

model = Model(inputs=[input_sentence], outputs=[output])
model.compile(optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0), 
            loss='mae')
model.load_weights('model-simple.h5')

model.fit(dataX, dataY, epochs=2, verbose=3)
embeddings = model.layers[1].get_weights()

# print out the embedding vector associated with each word
for i in range(2):
    print('{}: {}'.format(idx2word[i], embeddings[0][i]))
    
print "Evaluation of the Model"
print model.evaluate(dataX, dataY)
print "Saving the Weights"
model.save_weights('model-simple.h5')

Epoch 1/2
Epoch 2/2
: [-0.04496781  0.02772553  0.03370747  0.04692284 -0.00934393]
i’m: [ 0.0102813  -0.01555228 -0.00485859 -0.00076571  0.0036515 ]
Evaluation of the Model


## Extra Training For The Model

In [151]:
model.fit(dataX, dataY, epochs=10000, batch_size=78/2, verbose=0)
print "  {}".format(model.evaluate(dataX, dataY))
print "saving the weights"
model.save_weights('model-simple.h5')

saving the weights


## Exporting the Model
This is how you export the model into a json file in order to be imported later. Then you export the model's weights. Later on in other experiments, you could effectively 

In [6]:
# serialize model to JSON
model_json = model.to_json()
with open("model-simple.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model-simple.h5")

## Importing the Model
This is how you import the model from a json file and the weights so that you don't need to train it every time.

In [None]:
# load json and create model
json_file = open('model-simple.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model-simple.h5")
print("Loaded model from disk")

## Testing the Model with Custom Input

In [150]:
# TODO - Add a check for if the word is in word2idx
# put custom words here
sent = ['i', 'love', 'you', 'too']
custom_in = [0] * max_len
for i in range(len(sent)):
    custom_in[i] = word2idx[sent[i]]
pred = model.predict(np.array(custom_in).reshape(1, max_len) )
print "Pred Values: {}".format(pred)
p = scaler.inverse_transform(pred)
print '\n\n'
for i in range(9):
    if(round(p[0][i]) != 0):
        print idx2word[ int(round(p[0][i]))]

Pred Values: [[  8.00635223e+01   7.46468735e+01   5.68069305e+01   5.83343811e+01
    3.00547719e+00   1.45626739e-02   4.47865576e-03   8.82884860e-03
   -1.67539306e-02]]





IndexError: list index out of range