# Spelling Bee Challenge with RNN

- Uses seq2seq model

[reference](https://github.com/mikesj-public/rnn_spelling_bee/blob/master/spelling_bee_RNN.ipynb)

## Preprocessing

**what do we need?**

- Dataset : Training and test set (x's and y's)
- Phonemes : index to phoneme and phoneme to index dictionaries
- Alphabets : index to alphabets and alphabets to index dictionaries 

In [1]:
START_LINE = 126
END_LINE = 133905
FILENAME = 'data/cmudict-0.7b'
lines = open(FILENAME, 'r', encoding='utf-8', errors='ignore').read().split('\n')[START_LINE:END_LINE]

In [2]:
import random
random.choice(lines)

'BENITES  B EH1 N AY0 T S'

In [3]:
phonemes = []
words = []

for line in lines:
    word, pronounce = line.split('  ')
    phonemes.append(pronounce.split(' '))
    words.append(word)

In [4]:
print(random.choice(words))
print(random.choice(phonemes))

WONDERFULNESS
['F', 'EY1', 'K', 'ER0', 'Z']


In [5]:
phoneme_vocab = set([item for row in phonemes for item in row])
phoneme_vocab = ['_'] + sorted(list(phoneme_vocab))

In [6]:
phoneme_vocab_size = len(phoneme_vocab)

In [7]:
# need idx2phoneme and phoneme2idx
idx2phoneme = dict(enumerate(phoneme_vocab))
phoneme2idx = dict(zip(idx2phoneme.values(), idx2phoneme.keys()))

In [8]:
print(idx2phoneme[34])
print(phoneme2idx['N'])

HH
45


In [9]:
# aphabets
idx2alpha = dict(enumerate('_abcdefghijklmnopqrstuvwxyz'))
alpha2idx = dict(zip(idx2alpha.values(), idx2alpha.keys()))
alpha_vocab_len = len('_abcdefghijklmnopqrstuvwxyz')

In [10]:
# - word to phoneme_index dictionaries
# - remember words and phonemes? we are gonna use them here
# - lets also remove too small (<5 characters) or too large (>15 characters) words
word2phoneme_idx = {}
for word, phoneme_list in zip(words, phonemes):
    if len(word) > 5 and len(word) < 15 and len(phoneme_list) < 16:
        word2phoneme_idx[word.lower()] = [phoneme2idx[phoneme] for phoneme in phoneme_list]

### Typically the final stage of preprocessing
- convert dataset to numpy arrays filled with indices instead of characters, with padding
- split into training, validation, test sets

In [None]:
import numpy as np

updated_words = word2phoneme_idx.keys()
dataset_len = len(updated_words)

# empty numpy arrays to hold the indices
dataX = np.zeros([dataset_len,16])
dataX = np.zeros([dataset_len,15])

for i,word in enumerate(updated_words):
    phoneme_list = word2phoneme_idx[word]
    # add items to dataX and dataY with padding
    for j, n in 
    

In [11]:
import numpy as np

pairs = np.random.permutation(list(word2phoneme_idx.keys()))

input_ = np.zeros((len(pairs), 16))
labels_ = np.zeros((len(pairs), 15))

for i, k in enumerate(pairs):
    v = word2phoneme_idx[k]
    k = k + "_" * (15 - len(k))
    v = v + [0] * (16 - len(v))
    
    for j, n in enumerate(v):
        input_[i][j] = n
    for j, letter in enumerate(k):
        if letter in alpha2idx:
            labels_[i][j] = alpha2idx[letter]
        
input_ = input_.astype(np.int32)
labels_ = labels_.astype(np.int32)

input_test   = input_[:10000]
input_val    = input_[10000:20000]
input_train  = input_[20000:]
labels_test  = labels_[:10000]
labels_val   = labels_[10000:20000]
labels_train = labels_[20000:]

data_test  = zip(input_test, labels_test)
data_val   = zip(input_val, labels_val)
data_train = zip(input_train, labels_train)

In [12]:
print(input_test[124], labels_test[124])

[33 11 54 21 38  0  0  0  0  0  0  0  0  0  0  0] [ 7 15 18  4  9  5  0  0  0  0  0  0  0  0  0]


## Moving on to Tensorflow

In [13]:
import tensorflow as tf

In [14]:
xseq_len = 16
yseq_len = 15
batch_size = 128
xvocab_size = 70
yvocab_size = 28
emb_dim = 128

In [15]:
tf.reset_default_graph()

In [16]:
x_ = [ tf.placeholder(tf.int32, shape=[None,], name='x{}'.format(i)) for i in range(xseq_len)]
y_ = [ tf.placeholder(tf.int32, shape=[None,], name='y{}'.format(i)) for i in range(yseq_len)]
decoder_inputs = [tf.zeros_like(x_[0], dtype=tf.int32, name = "GO")] + y_[:-1]
keep_prob = tf.placeholder(tf.float32)
basic_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(emb_dim),
        output_keep_prob=keep_prob)
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([basic_cell]*3)


with tf.variable_scope('decoder') as scope:
    decode_outputs, decode_states = tf.nn.seq2seq.embedding_rnn_seq2seq(x_,decoder_inputs, stacked_lstm,
                                        xvocab_size, yvocab_size, emb_dim)
    scope.reuse_variables()
    # testing
    decode_outputs_test, decode_states_test = tf.nn.seq2seq.embedding_rnn_seq2seq(
        x_, decoder_inputs, stacked_lstm, xvocab_size, yvocab_size,emb_dim,
        feed_previous=True)



In [17]:
# we weight the losses based on timestep of decoder output
loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in y_] # gives [1, 1, ..., 1,1] - equal weights
loss = tf.nn.seq2seq.sequence_loss(decode_outputs, y_, loss_weights, yvocab_size)
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)

In [30]:
class DataIterator:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.iter = self.make_random_iter()
        
    def next_batch(self):
        try:
            idxs = next(self.iter)
        except StopIteration:
            self.iter = self.make_random_iter()
            idxs = next(iter)
        X, Y = zip(*[self.data[i] for i in idxs])
        X = np.array(X).T
        Y = np.array(Y).T
        return X, Y

    def make_random_iter(self):
        splits = np.arange(self.batch_size, len(list(self.data)), self.batch_size)
        it = np.split(np.random.permutation(range(len(list(self.data)))), splits)[:-1]
        return iter(it)
    
train_iter = DataIterator(data_train, 128)
val_iter = DataIterator(data_val, 128)
test_iter = DataIterator(data_test, 128)

In [18]:
def get_feed(X, Y):
    feed_dict = {encode_input[t]: X[t] for t in range(input_seq_length)}
    feed_dict.update({labels[t]: Y[t] for t in range(output_seq_length)})
    return feed_dict

def train_batch(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 0.5
    _, out = sess.run([train_op, loss], feed_dict)
    return out

In [19]:
def get_eval_batch_data(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 1.
    all_output = sess.run([loss] + decode_outputs_test, feed_dict)
    eval_loss = all_output[0]
    decode_output = np.array(all_output[1:]).transpose([1,0,2])
    return eval_loss, decode_output, X, Y

def eval_batch(data_iter, num_batches):
    losses = []
    predict_loss = []
    for i in range(num_batches):
        eval_loss, output, X, Y = get_eval_batch_data(data_iter)
        losses.append(eval_loss)
        
        for index in range(len(output)):
            real = Y.T[index]
            predict = np.argmax(output, axis = 2)[index]
            predict_loss.append(all(real==predict))
    return np.mean(losses), np.mean(predict_loss)

In [21]:
import sys

In [31]:
for i in range(100000):
    try:
        train_batch(train_iter)
        if i % 1000 == 0:
            val_loss, val_predict = eval_batch(val_iter, 16)
            train_loss, train_predict = eval_batch(train_iter, 16)
            print("val loss   : {0}, val predict   = {1}".format(val_loss, val_predict * 100))
            print("train loss : {0}, train predict = {1}".format(train_loss, train_predict * 100))
            print
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("interrupted by user")
        break

TypeError: 'builtin_function_or_method' object is not an iterator

In [32]:
import pickle

In [36]:
a = np.arange(12).reshape([4,3])

In [50]:
import data, data_utils
import importlib as I

In [66]:
I.reload(data_utils)

<module 'data_utils' from '/home/suriya/_/tf/TF/simple-seq2seq/data_utils.py'>

In [39]:
data_ctl, idx_words, idx_phonemes = data.load_data()

In [67]:
(trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(idx_words, idx_phonemes)

In [46]:
trainX[0]

array([ 8,  5, 12, 20, 26,  5, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [55]:
test_batch_gen = data_utils.batch_gen(testX, testY, 32)

In [56]:
batchX, batchY = test_batch_gen.__next__()

In [59]:
batchX, batchY

(array([[ 3, 15, 12, 20, 18,  1,  9, 14,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  5, 18,  5, 14, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  8,  5, 13, 12,  1, 23, 14,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3, 12,  1, 25, 13, 15, 14, 20,  0,  0,  0,  0,  0,  0,  0,  0],
        [13, 21,  7, 14,  9, 25,  1,  8,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4, 21,  3, 11, 23,  1, 12, 12,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 14,  4, 18,  5,  1,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2, 12, 15, 15, 13,  9, 14,  7,  4,  1, 12,  5,  0,  0,  0,  0],
        [ 1, 19, 16,  9, 18,  1, 14, 20,  0,  0,  0,  0,  0,  0,  0,  0],
        [15, 21, 20, 19, 11,  9, 18, 20, 19,  0,  0,  0,  0,  0,  0,  0],
        [ 2, 18, 15,  4, 14,  1, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [19, 23,  5, 12, 20, 18, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [23,  8,  9, 19, 20, 12,  5, 18, 19,  0,  0,  0,  0,  0,  0,  0],
        [18,  5, 19,  9,  7, 14,  5,  

In [68]:
print(trainX.shape, testX.shape, validX.shape)

(49470, 16) (10600, 16) (10600, 16)


In [61]:
a = list(range(10))

In [72]:
def get_feed(X, Y):
    feed_dict = {x_[t]: X[t] for t in range(xseq_len)}
    feed_dict.update({y_[t]: Y[t] for t in range(yseq_len)})
    return feed_dict

In [70]:
train_batch_gen = data_utils.batch_gen(trainX, trainY, batch_size)
batchX, batchY = train_batch_gen.__next__()

In [73]:
feed_dict = get_feed(batchX, batchY)

In [74]:
feed_dict[keep_prob] = 0.5

In [77]:
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    _, out = sess.run([train_op, loss], feed_dict)

InvalidArgumentError: indices[0] = 68 is not in [0, 28)
	 [[Node: decoder/embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder/embedding_lookup_14 = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@decoder/embedding_rnn_seq2seq/embedding_rnn_decoder/embedding"], validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](decoder/embedding_rnn_seq2seq/embedding_rnn_decoder/embedding/read, _recv_y13_0)]]
Caused by op 'decoder/embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder/embedding_lookup_14', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2705, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2809, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2869, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-515b8b6eb428>", line 13, in <module>
    xvocab_size, yvocab_size, emb_dim)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/seq2seq.py", line 333, in embedding_rnn_seq2seq
    feed_previous=feed_previous)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/seq2seq.py", line 272, in embedding_rnn_decoder
    loop_function=loop_function)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/seq2seq.py", line 139, in rnn_decoder
    for i, inp in enumerate(decoder_inputs):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/seq2seq.py", line 270, in <genexpr>
    embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/embedding_ops.py", line 86, in embedding_lookup
    validate_indices=validate_indices)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 980, in gather
    validate_indices=validate_indices, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2310, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1232, in __init__
    self._traceback = _extract_stack()
