In [None]:
import numpy as np
import common
# Load the raw training pairs, then derive the cleaned (X, Y) strings and their encodings via `common`.
versePairs = common.loadTrainingData()
X, Y = common.cleanAndSplitVerses(versePairs)
Xenc, Yenc = common.encXandY(X, Y)
# Check that regexClean and regexUnclean are perfect inverses on the training data.
# Every verse that does not survive the round trip is listed in the assertion message, so the
# corresponding lines of the training set can be found and cleaned up.
notInvertible = [v for v in versePairs if common.regexUnclean(common.regexClean(v)) != v]
assert not notInvertible, "regexClean/regexUnclean do not round-trip for: %r" % (notInvertible,)
# NOTE(review): Xnp and Ynp are reassigned by the pad_sequences calls in the following cell -
# confirm whether this padXandY call (and maxlen = 100) is still needed.
maxlen = 100
Xnp, Ynp = common.padXandY(Xenc, Yenc, maxlen)

In [None]:
# Make all the training sequences the same length.
from keras.preprocessing.sequence import pad_sequences
# Inputs are padded with pad_sequences' default fill value.
Xnp = pad_sequences(sequences=Xenc)
# Targets are padded with the one-hot vector [1, 0, 0, 0], i.e. the label whose argmax is class 0.
Ynp = pad_sequences(sequences=Yenc, value=[1, 0, 0, 0])

# Define the model from scratch. If you have a saved model available, you can instead use the commented-out load_model lines below.
from keras.models import Sequential, Model
from keras.layers import LSTM, GRU, Dense, TimeDistributed, Bidirectional, Input, Embedding
from keras.layers.merge import Concatenate
from keras.layers.core import Dropout


# Bidirectional GRU tagger: for every timestep of the input it outputs a score for each of the 4 classes.
model = Sequential()
# Take (timesteps, features) from the padded training array instead of hard-coding (191, 31), so the
# model keeps matching the data if the longest training sequence or the encoding width changes.
model.add(Bidirectional(GRU(256, return_sequences=True, dropout=0.25), input_shape=Xnp.shape[1:]))
model.add(Dropout(0.25))
# NOTE(review): 'sigmoid' outputs are not a normalised distribution, while categorical_crossentropy is
# normally paired with 'softmax'. Left unchanged because downstream decoding (common.getToComb) may
# depend on the current output scale - confirm before switching.
model.add(TimeDistributed(Dense(4, activation='sigmoid', use_bias=False)))
# sample_weight_mode='temporal' lets fit() accept one weight per timestep (a 2D sample_weight array).
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', sample_weight_mode='temporal', metrics=['categorical_accuracy'])

# Alternative to building the model above: load a previously saved model from disk.
# NOTE(review): the training cell saves to "saved_1step_model_gru", whereas the path below is
# "saved_1step_model" - confirm which file is intended before uncommenting.
#from keras.models import load_model
#model = load_model("saved_1step_model")

# Print a summary of the model that is currently in use.
model.summary()

In [None]:
# Generate a weighting for the different characters. We want to penalise the model according to how rare a symbol is
# so that rare symbols are more important to place correctly than common ones.
import numpy as np
from collections import Counter
# Class index of every timestep: argmax over the last (one-hot) axis of Ynp.
Yclass = np.argmax(Ynp, axis=2)
# Number of occurrences of each symbol across all target strings.
chars = Counter("".join(Y))
total = np.sum(list(chars.values()))
# Weight of a class = total symbol count / count of that class's symbol, so rarer symbols weigh more.
# NOTE(review): assumes class indices 0..3 correspond to "0", "|", "·", "*" in common's encoding - confirm.
classSymbols = ["0", "|", "·", "*"]
toReplace = {clas: total / chars[symbol] for clas, symbol in enumerate(classSymbols)}
def replace(clas):
    """Return the weight assigned to class index `clas`."""
    return toReplace[clas]
# Index a small lookup table with the whole class array at once. np.vectorize would call the Python
# function once per element and refuses empty input; the resulting values are the same.
classWeights = np.array([toReplace[clas] for clas in range(len(classSymbols))])
sample_weight = classWeights[Yclass]

In [None]:
# A callback to display a particular verse after each epoch.
from keras.callbacks import Callback
class ShowVerse(Callback):
    """Keras callback that prints the model's current output for one training verse after each epoch.

    The verse is taken from the notebook-level `Xnp` (model input) and `X` (cleaned text). It is
    printed twice: once from `predict_classes` decoded with `common.decClasses`, and once from the
    raw `predict` output decoded with `common.getToComb`.

    Args:
        verse: Index into `Xnp` / `X` of the verse to display. Defaults to 0.
    """
    def __init__(self, verse=0):
        # Let the Keras base class initialise its own attributes before adding ours.
        super().__init__()
        self.verse = verse
    def on_epoch_end(self, batch, logs=None):
        """Print the selected verse as currently predicted by `self.model`.

        Args:
            batch: First positional argument Keras passes to this hook (the epoch index); unused.
            logs: Metrics dict supplied by Keras; unused. Defaults to None instead of a shared
                mutable dict.
        """
        # Slice (rather than index) so the model receives a batch of size one.
        verseInput = Xnp[self.verse:self.verse+1]
        verseText = X[self.verse]
        # First print: decoded from the predicted class indices.
        pred = self.model.predict_classes(verseInput)
        toComb = common.decClasses(pred[0])
        print(common.regexUnclean(common.mergeStrings(verseText, toComb)))
        # Second print: decoded from the raw per-class outputs.
        pred = self.model.predict(verseInput, batch_size=256)
        toComb = common.getToComb(pred[0])
        print(common.regexUnclean(common.mergeStrings(verseText, toComb)))

In [None]:
# Train the model for a number of epochs. You can interrupt this process with Kernel -> Interrupt; when
# you run this cell again, training will continue from where you left off (unless you have re-run the model definition code).
# Fit on the padded training arrays with the per-timestep weights; 10% of the data is held out for
# validation and no callbacks are attached.
model.fit(
    x=Xnp,
    y=Ynp,
    batch_size=256,
    epochs=10,
    callbacks=[],
    validation_split=0.1,
    sample_weight=sample_weight,
)
# Persist the trained model to disk.
model.save("saved_1step_model_gru")

In [None]:
# Print every training verse with the model's predicted classes merged back into its text.
pred = model.predict_classes(Xnp, batch_size=256)
for i, verseClasses in enumerate(pred):
    toComb = common.decClasses(verseClasses)
    merged = common.mergeStrings(X[i], toComb)
    print(common.regexUnclean(merged))

In [None]:
# Same listing as the class-based one, but decoded from the raw model outputs via common.getToComb.
pred = model.predict(Xnp, batch_size=256)
for i, verseScores in enumerate(pred):
    toComb = common.getToComb(verseScores)
    merged = common.mergeStrings(X[i], toComb)
    print(common.regexUnclean(merged))

In [None]:
# Load and predict on some test data.
from keras.preprocessing.sequence import pad_sequences
# Read the test verses, one per line.
with open("testCleaned.txt", 'r', encoding="utf-8") as testFile:
    # Strip only the line terminator: slicing off the last character would eat a real character
    # when the final line of the file has no trailing newline.
    test = [line.rstrip("\n") for line in testFile]
testEnc = [common.encString(l) for l in test]
# Pad to the sequence length the model was built with rather than a hard-coded 191.
testNp = pad_sequences(testEnc, maxlen=model.input_shape[1])
testPred = model.predict_classes(testNp, batch_size=256)
# testPred has one row per entry of `test`, so the two can be walked in lockstep.
for verse, verseClasses in zip(test, testPred):
    toComb = common.decClasses(verseClasses)
    print(common.regexUnclean(common.mergeStrings(verse, toComb)))