In [1]:
import helper

source_path = 'data/letters_source.txt'
target_path = 'data/letters_target.txt'

source_sentences = helper.load_data(source_path)
target_sentences = helper.load_data(target_path)

In [6]:
source_sentences[:50].split('\n')

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 '']

In [7]:
target_sentences[:50].split('\n')

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 '']

# preprocess

In [23]:

def extract_character_vocab(data):
    special_words = ['<pad>','<unk>','<s>','</s>']
    set_words = set([character for line in data.split('\n') for character in line])
    int_to_vocab = {word_i:word for word_i, word in enumerate(special_words + list(set_words))}
    vocab_to_int = {word:word_i for word_i, word in int_to_vocab.items()}
    return int_to_vocab, vocab_to_int
    
# Build int2letter and letter2int dicts
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_sentences)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_sentences)

source_letter_ids = [[source_letter_to_int.get(letter, source_letter_to_int['<unk>']) for letter in line] for line in source_sentences.split('\n')]
target_letter_ids = [[target_letter_to_int.get(letter, target_letter_to_int['<unk>']) for letter in line] for line in target_sentences.split('\n')]

print("Example source sequence")
print(source_letter_ids[:3])
print("\n")
print("Example target sequence")
print(target_letter_ids[:3])

Example source sequence
[[5, 18, 4, 28, 28], [16, 20, 19], [15, 5, 29, 6, 12]]


Example target sequence
[[4, 5, 28, 28, 18], [16, 20, 19], [5, 12, 15, 6, 29]]


In [28]:
def pad_id_sequences(source_ids, source_letter_to_int, target_ids, target_letter_to_int, sequence_length):
    new_source_ids = [sentence + [source_letter_to_int['<pad>']]*(sequence_length - len(sentence)) for sentence in source_ids]
    new_target_ids = [sentence + [target_letter_to_int['<pad>']]* (sequence_length-len(sentence)) for sentence in target_ids]
    return new_source_ids, new_target_ids
    
sequence_length = max([len(sentence) for sentence in source_letter_ids]+[len(sentence) for sentence in target_letter_ids])
source_ids, target_ids = pad_id_sequences(source_letter_ids, source_letter_to_int, target_letter_ids, target_letter_to_int, sequence_length)

print("Sequence Length")
print(sequence_length)
print("\n")
print("Input sequence example")
print(source_ids[:3])
print("\n")
print("Target sequence example")
print(target_ids[:3])

Sequence Length
7


Input sequence example
[[5, 18, 4, 28, 28, 0, 0], [16, 20, 19, 0, 0, 0, 0], [15, 5, 29, 6, 12, 0, 0]]


Target sequence example
[[4, 5, 28, 28, 18, 0, 0], [16, 20, 19, 0, 0, 0, 0], [5, 12, 15, 6, 29, 0, 0]]


# model

In [96]:
from distutils.version import LooseVersion
import tensorflow as tf
tf.reset_default_graph
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'please use tensorflow version 1.0 or new'
print('tensorflow version: {}'.format(tf.__version__))

tensorflow version: 1.0.1


In [97]:
epochs = 60
batch_size = 128
rnn_size = 50
num_layers = 2
encoding_embedding_size = 13
decoding_embedding_size = 13
learning_rate = 0.001

In [98]:
input_data = tf.placeholder(tf.int32, [batch_size, sequence_length])
targets = tf.placeholder(tf.int32, [batch_size, sequence_length])
learningRate = tf.placeholder(tf.float32)

# encoding

In [100]:
source_vocab_size = len(source_letter_to_int)

# Encoder embedding
enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

# Encoder
enc_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnn_size)] * num_layers)
_, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, dtype=tf.float32)

ValueError: Variable rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "/home/vinay/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 747, in _linear
    "weights", [total_arg_size, output_size], dtype=dtype)
  File "/home/vinay/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 179, in __call__
    concat = _linear([inputs, h], 4 * self._num_units, True, scope=scope)
  File "/home/vinay/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 655, in __call__
    cur_inp, new_state = cell(cur_inp, cur_state)


# process decoding input

In [78]:
import numpy as np

# Process the input we'll feed to the decoder
ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
dec_input = tf.concat([tf.fill([batch_size, 1], target_letter_to_int['<s>']), ending], 1)

demonstration_outputs = np.reshape(range(batch_size * sequence_length), (batch_size, sequence_length))

sess = tf.InteractiveSession()
print("Targets")
print(demonstration_outputs[:2])
print("\n")
print("Processed Decoding Input")
print(sess.run(dec_input, {targets: demonstration_outputs})[:2])

Targets
[[ 0  1  2  3  4  5  6]
 [ 7  8  9 10 11 12 13]]


Processed Decoding Input
[[ 2  0  1  2  3  4  5]
 [ 2  7  8  9 10 11 12]]


# decoding

In [85]:
target_vocab_size = len(target_letter_to_int)

#decoding embedding
dec_embedding = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
print(dec_embedding)
dec_embed_input = tf.nn.embedding_lookup(dec_embedding, dec_input)
print(dec_embed_input)

Tensor("Variable_4/read:0", shape=(30, 13), dtype=float32)
Tensor("embedding_lookup:0", shape=(128, 7, 13), dtype=float32)
