In [55]:
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Concatenate, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BahdanauAttention import AttentionLayer
import numpy as np
import random
import pickle

In [95]:
class s2s_model:
    """Character-level encoder-decoder (seq2seq) model built from Keras LSTM layers.

    The encoder is a stack of three LSTMs over an embedded input sequence. The
    decoder is a single LSTM initialised with the final encoder states and
    followed by a per-timestep softmax over the decoder vocabulary.
    """

    def __init__(self, max_encoder_len, max_decoder_len, num_encoder_vocab, num_decoder_vocab):
        """Build the encoder, the decoder and the combined training model.

        Args:
            max_encoder_len: fixed length of the (padded) encoder input sequences.
            max_decoder_len: upper bound on the number of tokens emitted by
                `decode_sequence`.
            num_encoder_vocab: input dimension of the encoder embedding.
            num_decoder_vocab: input dimension of the decoder embedding and
                width of the softmax output layer.
        """
        self.latent_dim = 64
        self.embedding_dim = 64
        self.max_encoder_len = max_encoder_len
        self.max_decoder_len = max_decoder_len
        self.num_encoder_vocab = num_encoder_vocab
        self.num_decoder_vocab = num_decoder_vocab

        self.build_encoder()
        self.build_decoder()

        self.training_model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)

    def build_encoder(self):
        """Create the encoder: embedding followed by three stacked LSTMs."""
        self.encoder_inputs = Input(shape=(self.max_encoder_len, ))
        self.encoder_embed = Embedding(self.num_encoder_vocab, self.embedding_dim, trainable=True)(self.encoder_inputs)
        self.encoder_LSTM1 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.3)
        self.encoder_output1, self.state_h1, self.state_c1 = self.encoder_LSTM1(self.encoder_embed)

        # each encoder LSTM consumes the full output sequence of the previous one
        self.encoder_LSTM2 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.3)
        self.encoder_output2, self.state_h2, self.state_c2 = self.encoder_LSTM2(self.encoder_output1)

        # final outputs and states; the states initialise the decoder LSTM
        self.encoder_LSTM3 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.3)
        self.encoder_output, self.state_h, self.state_c = self.encoder_LSTM3(self.encoder_output2)

    def build_decoder(self):
        """Create the training-time decoder: embedding, LSTM and softmax output."""
        self.decoder_inputs = Input(shape=(None,))

        # define layer architecture, then match to inputs (the layer objects are
        # reused by build_inference_model)
        self.decoder_embed_layer = Embedding(self.num_decoder_vocab, self.embedding_dim, trainable=True)
        self.decoder_embed = self.decoder_embed_layer(self.decoder_inputs)

        # decoder LSTM layer, seeded with the final encoder states.
        # NOTE: despite their names, decoder_fwd_state / decoder_back_state hold
        # the second and third values returned by a unidirectional LSTM with
        # return_state=True (build_inference_model calls the same values
        # state_h_i / state_c_i). The attribute names are kept for compatibility.
        self.decoder_LSTM = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
        self.decoder_outputs, self.decoder_fwd_state, self.decoder_back_state = self.decoder_LSTM(self.decoder_embed, initial_state=[self.state_h, self.state_c])

        # dense layer (output layer)
        # keras.layers.TimeDistributed applies the Dense layer at every timestep:
        # every input should be at least 3D, and the dimension of index one of
        # the first input is considered to be the temporal dimension.
        self.decoder_dense = TimeDistributed(Dense(self.num_decoder_vocab, activation='softmax'))
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)

    def compile(self):
        """Compile the training model (rmsprop, sparse categorical cross-entropy, accuracy)."""
        self.training_model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

    def fit(self, x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep, batch_size):
        """Train the model with early stopping and best-model checkpointing.

        Args:
            x_tr: encoder inputs for training.
            y_tr_in: decoder inputs for training.
            y_tr_out: decoder targets for training.
            x_test, y_test_in, y_test_out: the same three arrays for the
                held-out data passed to Keras as `validation_data`.
            ep: maximum number of epochs.
            batch_size: mini-batch size.

        Returns:
            The value returned by `Model.fit` (the Keras training history), so
            callers can inspect or plot the learning curves.
        """
        # Stop once validation loss has not improved for 2 epochs.
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
        # Keep the checkpoint with the best validation accuracy on disk.
        ck = ModelCheckpoint(filepath='data/autoencoder_best_weights.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        return self.training_model.fit(
            [x_tr, y_tr_in],
            y_tr_out,
            epochs=ep,
            callbacks=[es, ck],
            batch_size=batch_size,
            validation_data=([x_test, y_test_in], y_test_out),
        )

    def build_inference_model(self):
        """Create the step-wise encoder and decoder models used for decoding.

        Both models reuse the layers created for training, so they share the
        trained weights.
        """
        self.inference_encoder_model = Model(inputs=self.encoder_inputs, outputs=[self.encoder_output, self.state_h, self.state_c])

        # decoder setup: the recurrent states are fed in explicitly at each step
        self.decoder_state_input_h = Input(shape=(self.latent_dim,))
        self.decoder_state_input_c = Input(shape=(self.latent_dim,))
        # NOTE(review): this input is accepted by the decoder model but is not
        # connected to any of its outputs below - presumably a placeholder for
        # the imported attention layer; confirm.
        self.decoder_hidden_state_input = Input(shape=(self.max_encoder_len, self.latent_dim))

        self.decoder_embed_i = self.decoder_embed_layer(self.decoder_inputs)

        self.decoder_output_i, self.state_h_i, self.state_c_i = self.decoder_LSTM(self.decoder_embed_i, initial_state=[self.decoder_state_input_h, self.decoder_state_input_c])

        self.decoder_output_i = self.decoder_dense(self.decoder_output_i)

        # final decoder inference model
        self.inference_decoder_model = Model(
            [self.decoder_inputs] + [self.decoder_hidden_state_input, self.decoder_state_input_h, self.decoder_state_input_c],
            [self.decoder_output_i] + [self.state_h_i, self.state_c_i],
        )

    def _ensure_inference_models(self):
        """Build the inference models on first use if the caller has not done so."""
        if not (hasattr(self, 'inference_encoder_model') and hasattr(self, 'inference_decoder_model')):
            self.build_inference_model()

    def decode_sequence(self, input_seq, i2o, o2i):
        """Greedily decode one encoded input sequence into a list of output tokens.

        Args:
            input_seq: encoder input for a single example, in the form accepted
                by the inference encoder's `predict`.
            i2o: mapping from decoder index to output token.
            o2i: mapping from output token to decoder index; must contain '<'.

        Returns:
            The decoded tokens as a list. Decoding stops at the first '>'
            (which is not included) or once `max_decoder_len` tokens have been
            produced.
        """
        self._ensure_inference_models()

        e_out, state_h, state_c = self.inference_encoder_model.predict(input_seq, verbose=0)

        # Start decoding from the start-of-sequence marker.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = o2i['<']

        decoded_sentence = []
        while True:
            output_tokens, state_h, state_c = self.inference_decoder_model.predict(
                [target_seq, e_out, state_h, state_c], verbose=0)

            # Sample a token (greedy: most probable index at the last timestep)
            sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
            sampled_token = i2o[sampled_token_index]

            # Exit condition: either find the stop token or hit max length.
            if sampled_token == '>':
                break
            decoded_sentence.append(sampled_token)
            if len(decoded_sentence) >= self.max_decoder_len:
                break

            # Feed the sampled token back in as the next decoder input
            # (the updated states are already bound to state_h / state_c).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
        return decoded_sentence

    def word2seq(self, a2i, input_word):
        """Encode `input_word` as indices and post-pad it to `max_encoder_len`.

        Args:
            a2i: mapping from input character to encoder index.
            input_word: iterable of characters to encode.

        Returns:
            The padded index sequence for the word.

        Raises:
            KeyError: if a character of `input_word` is missing from `a2i`.
        """
        final_seq = [a2i[c] for c in input_word]
        # NOTE(review): inputs longer than max_encoder_len are truncated by
        # pad_sequences rather than rejected - confirm that is acceptable.
        return pad_sequences([final_seq], maxlen=self.max_encoder_len, padding='post')[0]

    def translate(self, input_word, a2i, i2o, o2i):
        """Encode `input_word` and return its greedy decoding as a token list."""
        seq = self.word2seq(a2i, input_word).reshape(1, self.max_encoder_len)
        return self.decode_sequence(seq, i2o, o2i)

In [57]:
def insert_syl(word, indexes):
    """Return `word` with a '-' inserted after every position labelled 2.

    Args:
        word: the string to hyphenate.
        indexes: per-position labels; a value of 2 at position k places a
            hyphen directly after the k-th character of `word`.

    Returns:
        The hyphenated word as a string.
    """
    boundary_positions = np.nonzero(np.array(indexes) == 2)[0]
    chars = list(word)
    # Each hyphen already inserted shifts the later positions right by one.
    for inserted_so_far, position in enumerate(boundary_positions):
        chars.insert(position + inserted_so_far + 1, '-')
    return ''.join(chars)

In [58]:
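# Plan for building the denoising dataset:
# 1. Load the English words.
# 2. Load the per-character syllable-boundary labels for each word.
# 3. Shuffle each word's labels to get corrupted boundary positions.
# X      = word with hyphens inserted at the shuffled (noisy) positions
# true Y = word with hyphens inserted at the true positions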

In [59]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code - only point this at trusted files.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _decode_words(encoded_words):
    """Map each index sequence back to a string, skipping index 0."""
    return [''.join(i2e_ortho[c] for c in word if c != 0) for word in encoded_words]


x_tr_ortho = _load_pickle('data/ox/x_tr_ortho.pkl')
x_val_ortho = _load_pickle('data/ox/x_val_ortho.pkl')
e2i_ortho = _load_pickle('data/ox/e2i_ortho.pkl')
# inverse lookup: index -> character
i2e_ortho = {index: char for char, index in e2i_ortho.items()}
y_tr = _load_pickle('data/ox/y_tr.pkl')
y_val = _load_pickle('data/ox/y_val.pkl')

pure_english_train = _decode_words(x_tr_ortho)
pure_english_val = _decode_words(x_val_ortho)
true_indexes_tr = y_tr
true_indexes_val = y_val

# Targets: each word with hyphens at its true positions, wrapped in '<' ... '>'.
syl_train_y = ['<' + insert_syl(word, true_indexes_tr[i]) + '>' for i, word in enumerate(pure_english_train)]
syl_val_y = ['<' + insert_syl(word, true_indexes_val[i]) + '>' for i, word in enumerate(pure_english_val)]

In [60]:
# generating noisy data
#
# The helper is defined before it is used so that this cell works on a fresh
# kernel (Restart & Run All); previously it was only defined in a later cell.

NOISE_SEED = 42


def shuffle_indexes(indexes, maxlen=16):
    """Return the labels of `indexes` in random order, zero-padded to `maxlen`.

    Zeros are treated as padding: they are dropped, the remaining labels are
    shuffled, and the result is post-padded with zeros.

    Args:
        indexes: iterable of integer labels in which 0 means padding.
        maxlen: length of the returned sequence (defaults to 16, the padding
            length used previously).

    Returns:
        The shuffled, re-padded label sequence produced by `pad_sequences`.
    """
    non_pad = [c for c in indexes if c != 0]
    random.shuffle(non_pad)
    return pad_sequences([non_pad], maxlen=maxlen, padding='post', value=0)[0]


# Seed the shuffler so the corrupted inputs are identical on every run.
random.seed(NOISE_SEED)

# Inputs: each word with hyphens at shuffled positions, wrapped in '<' ... '>'.
syl_train_x = ['<' + insert_syl(word, shuffle_indexes(true_indexes_tr[i])) + '>' for i, word in enumerate(pure_english_train)]
syl_val_x = ['<' + insert_syl(word, shuffle_indexes(true_indexes_val[i])) + '>' for i, word in enumerate(pure_english_val)]

In [65]:
# generating metadata and dictionaries
def _collect_vocab(*corpora):
    """Return (longest line length, characters in first-seen order) over all `corpora`."""
    longest = 0
    chars = []
    for corpus in corpora:
        for line in corpus:
            longest = max(longest, len(line))
            for c in line:
                if c not in chars:
                    chars.append(c)
    return longest, chars


# encoder data
max_encoder_len, encoder_vocab = _collect_vocab(syl_train_x, syl_val_x)

# decoder metadata
max_decoder_len, decoder_vocab = _collect_vocab(syl_train_y, syl_val_y)

# NOTE(review): indices start at 0, so the first character seen gets index 0.
# If sequences are zero-padded downstream, padding is indistinguishable from
# that character - confirm this is intended.
e2i_s2s = dict(zip(encoder_vocab, range(len(encoder_vocab))))
d2i_s2s = dict(zip(decoder_vocab, range(len(decoder_vocab))))

# inverse lookups: index -> character
i2e_s2s = dict(enumerate(encoder_vocab))
i2d_s2s = dict(enumerate(decoder_vocab))

print(max_encoder_len)
print(max_decoder_len)

25
25


In [67]:
# encoding data to ints
def _encode_and_pad(lines, char_to_index, maxlen):
    """Map every character to its index, then post-pad each line with 0 to `maxlen`."""
    encoded = [[char_to_index[c] for c in line] for line in lines]
    return pad_sequences(encoded, maxlen=maxlen, padding='post', value=0)


# NOTE: y_tr / y_val are rebound here from the loaded label arrays to the
# encoded target sequences.
x_tr = _encode_and_pad(syl_train_x, e2i_s2s, max_encoder_len)
y_tr = _encode_and_pad(syl_train_y, d2i_s2s, max_decoder_len)
x_val = _encode_and_pad(syl_val_x, e2i_s2s, max_encoder_len)
y_val = _encode_and_pad(syl_val_y, d2i_s2s, max_decoder_len)

In [68]:
# Persist the encoded, padded arrays, one pickle file per array.
for out_path, array in (
    ('data/x_tr_auto.pkl', x_tr),
    ('data/y_tr_auto.pkl', y_tr),
    ('data/x_val_auto.pkl', x_val),
    ('data/y_val_auto.pkl', y_val),
):
    with open(out_path, 'wb') as file:
        pickle.dump(array, file)

In [69]:
# Vocabulary sizes (encoder first, then decoder), one per line.
for vocab in (encoder_vocab, decoder_vocab):
    print(len(vocab))

37
37


In [96]:
# Values printed by the earlier cells for this dataset:
#   max_encoder_len = 25, max_decoder_len = 25
#   num_encoder_vocab = 37, num_decoder_vocab = 37

autoencoder = s2s_model(
    max_encoder_len=max_encoder_len,
    max_decoder_len=max_decoder_len,
    num_encoder_vocab=len(encoder_vocab),
    num_decoder_vocab=len(decoder_vocab),
)

In [85]:
# 80:20 training testing split
# NOTE(review): x_tr / y_tr are rebound to their first 80%, so running this
# cell a second time splits the already-reduced arrays again. The x_val / y_val
# arrays built earlier are not used here - confirm that is intended.
split_index = int(len(x_tr) * .8)
x_tr, x_test = x_tr[:split_index], x_tr[split_index:]
y_tr, y_test = y_tr[:split_index], y_tr[split_index:]

# Decoder input drops the last position and decoder target drops the first,
# so the target at step t is the input token at step t + 1.
y_tr_in, y_tr_out = np.array(y_tr)[:, :-1], np.array(y_tr)[:, 1:]
y_test_in, y_test_out = np.array(y_test)[:, :-1], np.array(y_test)[:, 1:]

In [97]:
# Compile, then train for at most 50 epochs with mini-batches of 128.
autoencoder.compile()
autoencoder.fit(
    x_tr, y_tr_in, y_tr_out,
    x_test, y_test_in, y_test_out,
    ep=50,
    batch_size=128,
)

Epoch 1/50
Epoch 1: val_acc improved from -inf to 0.66823, saving model to data\autoencoder_best_weights.h5
Epoch 2/50
Epoch 2: val_acc improved from 0.66823 to 0.68709, saving model to data\autoencoder_best_weights.h5
Epoch 3/50
Epoch 3: val_acc improved from 0.68709 to 0.70938, saving model to data\autoencoder_best_weights.h5
Epoch 4/50
Epoch 4: val_acc improved from 0.70938 to 0.71677, saving model to data\autoencoder_best_weights.h5
Epoch 5/50
Epoch 5: val_acc improved from 0.71677 to 0.73284, saving model to data\autoencoder_best_weights.h5
Epoch 6/50
Epoch 6: val_acc improved from 0.73284 to 0.74157, saving model to data\autoencoder_best_weights.h5
Epoch 7/50
Epoch 7: val_acc improved from 0.74157 to 0.74833, saving model to data\autoencoder_best_weights.h5
Epoch 8/50
Epoch 8: val_acc improved from 0.74833 to 0.75298, saving model to data\autoencoder_best_weights.h5
Epoch 9/50
Epoch 9: val_acc improved from 0.75298 to 0.75366, saving model to data\autoencoder_best_weights.h5
Epoc

In [98]:
# Create the step-wise encoder/decoder models that translate() relies on.
autoencoder.build_inference_model()

In [105]:
# Show the first five noisy inputs next to the model's decoded output.
for noisy_word in syl_train_x[:5]:
    print(noisy_word)
    decoded_tokens = autoencoder.translate(noisy_word, e2i_s2s, i2d_s2s, d2i_s2s)
    print(''.join(decoded_tokens))

<u-n-tying>
an-ti-a-tis
<op-us>
o-ran
<incor-ru-pt-ibl-e>
in-ter-ta-tion-al
<co-r-oner>
cor-e-tent
<subtropi-c-s>
su-tare-lous


In [106]:
# First three noisy inputs, then the matching true targets.
for samples in (syl_train_x, syl_train_y):
    print(samples[:3])

['<u-n-tying>', '<op-us>', '<incor-ru-pt-ibl-e>']
['<un-ty-ing>', '<o-pus>', '<in-cor-rupt-i-ble>']
