In [1]:
import helper
import unittests

In [2]:
# Load the raw dataset. The helper returns the targets first and the sources second;
# the stats cell below prints target_text[0] as a heading and source_text[0] as an article body.
target_text, source_text = helper.load_data("./data/raw_data.json")

In [3]:
import numpy as np

# Retained from the original cell; nothing in this cell reads it.
view_article_range = (0, 10)

print("Dataset Stats")

# Rough vocabulary size: whitespace-split the concatenated articles and de-duplicate.
print("Roughly The number of unique words: {}".format(len(set(' '.join(source_text).split()))))

# Whitespace-token count of every article.
word_counts = [len(article.split()) for article in source_text]
print("Number of articles: {}".format(len(source_text)))
print("Average Number of words per article: {}".format(np.average(word_counts)))

# One sample pair, each preceded by an empty line.
print("", "Example Article Heading", target_text[0], sep="\n")
print("", "Example Article Body", source_text[0], sep="\n")

Dataset Stats
Roughly The number of unique words: 69732
Number of articles: 2225
Average Number of words per article: 375.11011235955056

Example Article Heading
Ad sales boost Time Warner profit

Example Article Body
Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on th

In [4]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
    Convert source and target texts to lists of word ids.

    Each text is split on whitespace and every word is looked up in the
    matching vocabulary. The id of '<EOS>' is appended to the end of every
    target sequence so each one carries an explicit end-of-sequence marker.

    :param source_text: iterable of source strings
    :param target_text: iterable of target strings
    :param source_vocab_to_int: dict mapping source words to ids
    :param target_vocab_to_int: dict mapping target words to ids; must contain '<EOS>'
                                whenever target_text is non-empty
    :return: tuple (source_ids, target_ids), each a list with one list of ids per text
    :raises KeyError: if a word (or '<EOS>') is missing from its vocabulary
    """
    source_ids = [
        [source_vocab_to_int[word] for word in article.split()]
        for article in source_text
    ]
    # The EOS id goes on the end of each individual target sequence; appending it to
    # the outer list would add stray integers and leave the sequences unterminated.
    target_ids = [
        [target_vocab_to_int[word] for word in article.split()] + [target_vocab_to_int['<EOS>']]
        for article in target_text
    ]

    return source_ids, target_ids
    
# Run the project's unit test against text_to_ids.
unittests.test_text_to_ids(text_to_ids)

Tests Passed


In [5]:
# Hand the raw-data path and text_to_ids to the helper; judging by its name it
# preprocesses the data and saves the result (see helper for the exact behaviour).
helper.preprocess_and_save_data("./data/raw_data.json", text_to_ids)

In [6]:
import numpy as np
import helper

# Load the preprocessed id sequences and both word-to-id vocabularies;
# the third element returned by the helper is not needed here and is discarded.
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()

In [7]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Refuse to continue on a TensorFlow release older than 1.0.
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print("Tensorflow Version: {}".format(tf.__version__))

# Report the default GPU when TensorFlow finds one; otherwise emit a warning.
if tf.test.gpu_device_name():
    print('Default GPU device: {}'.format(tf.test.gpu_device_name()))
else:
    warnings.warn("No GPU found. Please use a GPU to train your neural network")

Tensorflow Version: 1.0.0




In [8]:
def model_inputs():
    """
    Create the placeholders that feed the model.

    :return: tuple (input, targets, learning rate, keep probability) of placeholders
    """
    # int32 placeholders with both dimensions left unspecified.
    input_ph = tf.placeholder(tf.int32, (None, None), name="input")
    targets_ph = tf.placeholder(tf.int32, (None, None), name="targets")
    # float32 placeholders created without a shape argument.
    learn_rate_ph = tf.placeholder(tf.float32, name="learn_rate")
    keep_prob_ph = tf.placeholder(tf.float32, name="keep_prob")
    return input_ph, targets_ph, learn_rate_ph, keep_prob_ph

# Run the project's unit test against model_inputs.
unittests.test_model_inputs(model_inputs)

Tests Passed


In [9]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    """
    Build the decoder input from a batch of targets.

    The last column of target_data is dropped and a column filled with the
    '<GO>' id is placed in front of what remains.

    :param target_data: 2-D tensor of target ids
    :param target_vocab_to_int: dict mapping target words to ids; must contain '<GO>'
    :param batch_size: number of rows to slice and to fill
    :return: tensor with the '<GO>' column concatenated before the sliced targets
    """
    # Rows [0, batch_size), every column except the last one.
    without_last = tf.strided_slice(target_data, (0, 0), (batch_size, -1), (1, 1))
    go_column = tf.fill([batch_size, 1], target_vocab_to_int["<GO>"])
    return tf.concat([go_column, without_last], 1)

# Run the project's unit test against process_decoding_input.
unittests.test_process_decoding_input(process_decoding_input)

Tests Passed


In [10]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob):
    """
    Run the encoder RNN over the inputs and return its final state.

    :param rnn_inputs: inputs handed to tf.nn.dynamic_rnn
    :param rnn_size: size passed to BasicLSTMCell
    :param num_layers: number of entries in the MultiRNNCell stack
    :param keep_prob: output keep probability for the dropout wrapper
    :return: the state returned by tf.nn.dynamic_rnn (its outputs are discarded)
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    # NOTE(review): one cell object is repeated for every layer — confirm this is
    # accepted by the TensorFlow version in use.
    layers = [base_cell] * num_layers
    stacked_cell = tf.contrib.rnn.MultiRNNCell(layers)
    # Dropout wraps the whole stack, so it is applied to the stack's output only.
    encoder_cell = tf.contrib.rnn.DropoutWrapper(stacked_cell, output_keep_prob=keep_prob)
    _, final_state = tf.nn.dynamic_rnn(encoder_cell, rnn_inputs, dtype=tf.float32)
    return final_state

# Run the project's unit test against encoding_layer.
unittests.test_encoding_layer(encoding_layer)

Tests Passed


In [11]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope, output_fn, keep_prob):
    """
    Create the training branch of the decoder.

    :param encoder_state: state used to initialise the training decoder function
    :param dec_cell: decoder RNN cell
    :param dec_embed_input: embedded decoder input
    :param sequence_length: sequence length passed to dynamic_rnn_decoder
    :param decoding_scope: variable scope handed to dynamic_rnn_decoder
    :param output_fn: callable applied to the decoder outputs to produce logits
    :param keep_prob: keep probability for the dropout applied to the logits
    :return: training logits after dropout
    """
    decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)

    # Only the first of the three returned values (the decoder outputs) is used.
    decoder_outputs, _final_state, _final_context = tf.contrib.seq2seq.dynamic_rnn_decoder(
        dec_cell, decoder_fn, dec_embed_input, sequence_length, scope=decoding_scope)

    logits = output_fn(decoder_outputs)
    return tf.nn.dropout(logits, keep_prob)

# Run the project's unit test against decoding_layer_train.
unittests.test_decoding_layer_train(decoding_layer_train)

Tests Passed


In [12]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id,
                         maximum_length, vocab_size, decoding_scope, output_fn, keep_prob):
    """
    Create the inference branch of the decoder.

    :param encoder_state: state passed to the inference decoder function
    :param dec_cell: decoder RNN cell
    :param dec_embeddings: decoder embedding matrix
    :param start_of_sequence_id: id fed as the first decoder input
    :param end_of_sequence_id: id that marks the end of a sequence
    :param maximum_length: maximum length passed to the inference decoder function
    :param vocab_size: target vocabulary size
    :param decoding_scope: variable scope handed to dynamic_rnn_decoder
    :param output_fn: callable handed to the inference decoder function
    :param keep_prob: accepted for signature parity; not used by this function
    :return: first value returned by dynamic_rnn_decoder (the inference logits)
    """
    decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(
        output_fn,
        encoder_state,
        dec_embeddings,
        start_of_sequence_id,
        end_of_sequence_id,
        maximum_length,
        vocab_size,
    )

    logits, _final_state, _final_context = tf.contrib.seq2seq.dynamic_rnn_decoder(
        dec_cell, decoder_fn, scope=decoding_scope)
    return logits

# unittests.test_decoding_layer_infer(decoding_layer_infer)

In [13]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, sequence_length, rnn_size,
                   num_layers, target_vocab_to_int, keep_prob):
    """
    Build the decoder cell and wire up both the training and the inference branch.

    :param dec_embed_input: embedded decoder input (used by the training branch)
    :param dec_embeddings: decoder embedding matrix (used by the inference branch)
    :param encoder_state: encoder state passed to both branches
    :param vocab_size: number of outputs of the fully connected output layer
    :param sequence_length: passed as the sequence length for training and as the
                            maximum length for inference
    :param rnn_size: size passed to BasicLSTMCell
    :param num_layers: number of entries in the MultiRNNCell stack
    :param target_vocab_to_int: dict providing the '<GO>' and '<EOS>' ids
    :param keep_prob: keep probability forwarded to both branches
    :return: tuple (training_logits, inference_logits)
    """
    
    start_of_sequence_id = target_vocab_to_int['<GO>']
    end_of_sequence_id = target_vocab_to_int['<EOS>']
        
    # NOTE(review): the same BasicLSTMCell object is repeated num_layers times —
    # confirm this is accepted by the TensorFlow version in use.
    dec_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnn_size)] * num_layers)
    
    with tf.variable_scope("decoding") as decoding_scope:
        # Output Layer: fully connected projection to vocab_size with no activation.
        # The lambda looks up `decoding_scope` when it is called, not when it is defined,
        # so it uses whichever scope the name is bound to at call time (it is rebound below).
        output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, None, scope=decoding_scope)
            
    # Training branch, built under the "decoding" variable scope.
    with tf.variable_scope("decoding") as decoding_scope:
        training_logits = decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope, output_fn, keep_prob)
        
    # Inference branch: the scope is re-entered with reuse=True so the variables
    # created for training are reused instead of new ones being created.
    with tf.variable_scope("decoding", reuse=True) as decoding_scope:
        inference_logits = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id, sequence_length, vocab_size, decoding_scope, output_fn, keep_prob)

    return training_logits, inference_logits


In [14]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length, source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, rnn_size, num_layers, target_vocab_to_int):
    """
    Assemble the full sequence-to-sequence model.

    :param input_data: source id tensor
    :param target_data: target id tensor
    :param keep_prob: dropout keep probability
    :param batch_size: batch size used when building the decoder input
    :param sequence_length: sequence length forwarded to the decoder
    :param source_vocab_size: vocabulary size for the encoder embedding
    :param target_vocab_size: vocabulary size for the decoder embedding and output layer
    :param enc_embedding_size: encoder embedding dimension
    :param dec_embedding_size: decoder embedding dimension
    :param rnn_size: size of the LSTM cells
    :param num_layers: number of stacked RNN layers
    :param target_vocab_to_int: dict mapping target words to ids
    :return: tuple (training logits, inference logits)
    """
    # Encoder: embed the source ids and keep the encoder's returned state.
    embedded_source = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, enc_embedding_size)
    final_encoder_state = encoding_layer(embedded_source, rnn_size, num_layers, keep_prob)

    # Decoder input: prepared targets looked up in a randomly initialised embedding matrix.
    decoder_ids = process_decoding_input(target_data, target_vocab_to_int, batch_size)
    embedding_matrix = tf.Variable(tf.random_uniform([target_vocab_size, dec_embedding_size]))
    embedded_targets = tf.nn.embedding_lookup(embedding_matrix, decoder_ids)

    training_logits, inference_logits = decoding_layer(
        embedded_targets, embedding_matrix, final_encoder_state, target_vocab_size,
        sequence_length, rnn_size, num_layers, target_vocab_to_int, keep_prob)

    return training_logits, inference_logits


In [15]:
# Hyperparameters for building and training the model.
epochs = 15                    # number of epochs
batch_size = 512               # batch size
rnn_size = 128                 # RNN size
num_layers = 2                 # number of layers
encoding_embedding_size = 64   # embedding size (encoder)
decoding_embedding_size = 64   # embedding size (decoder)
learning_rate = 0.001          # learning rate
keep_probability = 0.7         # dropout keep probability

In [24]:
# Path prefix under which the trained model is saved.
save_path = 'checkpoints/dev'
# Load the preprocessed id sequences and both vocabularies; the third returned item is unused.
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()
# Longest *target* sequence. It is the default of the `sequence_length` placeholder, which
# bounds how many steps the inference decoder may generate, so it has to be measured on the
# targets; measuring it on the much longer source articles inflates that bound.
max_target_sentence_length = max(len(sentence) for sentence in target_int_text)

# Build the whole model inside its own graph so the training session can be bound to it.
train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob = model_inputs()
    # Falls back to max_target_sentence_length whenever no value is fed for it.
    sequence_length = tf.placeholder_with_default(max_target_sentence_length, None, name='sequence_length')
    input_shape = tf.shape(input_data)
    
    # The source ids are reversed along their last axis before they reach the encoder.
    train_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length, len(source_vocab_to_int), len(target_vocab_to_int),
        encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, target_vocab_to_int)

    # Give the inference output the fixed name 'logits'
    # (presumably so it can be fetched by name after the graph is reloaded — confirm).
    tf.identity(inference_logits, 'logits')
    with tf.name_scope("optimization"):
        # Loss function: sequence loss with an all-ones weight matrix of shape
        # [batch, sequence_length], i.e. every time step is weighted equally.
        cost = tf.contrib.seq2seq.sequence_loss(
            train_logits,
            targets,
            tf.ones([input_shape[0], sequence_length]))

        # Optimizer: Adam driven by the learning-rate placeholder.
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping: clip each gradient element to [-1, 1];
        # variables whose gradient is None are skipped.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
import time

def get_accuracy(target, logits):
    """
    Calculate accuracy

    The shorter of the two inputs is zero-padded along the sequence axis (axis 1)
    so both cover the same number of time steps, then the predicted ids (argmax
    over the last axis of the logits) are compared element-wise with the targets.

    :param target: 2-D array-like of target ids, indexed (batch, time)
    :param logits: 3-D array-like of scores, indexed (batch, time, vocab)
    :return: fraction of (batch, time) positions where prediction and target agree
    """
    # Work on the arguments only; the function must not depend on notebook globals.
    target = np.asarray(target)
    logits = np.asarray(logits)

    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        # target is 2-D, so exactly two (before, after) pairs are required.
        target = np.pad(
            target,
            [(0, 0), (0, max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        # logits is 3-D; padded steps are all-zero rows, whose argmax is 0.
        logits = np.pad(
            logits,
            [(0, 0), (0, max_seq - logits.shape[1]), (0, 0)],
            'constant')

    return np.mean(np.equal(target, np.argmax(logits, 2)))

# Everything after the first `batch_size` examples is used for training ...
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]

# ... and the first `batch_size` examples are held out as a single validation batch,
# passed through the helper's pad_sentence_batch (presumably padding them to a common length — confirm).
valid_source = helper.pad_sentence_batch(source_int_text[:batch_size])
valid_target = helper.pad_sentence_batch(target_int_text[:batch_size])

# Train the model, report accuracy after every batch, and save a checkpoint at the end.
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # The loop iterates over the training split only, so the batch total shown in the
    # progress line is derived from train_source rather than from the full dataset
    # (which also contains the held-out validation examples).
    # Assumes helper.batch_data yields only full batches — TODO confirm.
    total_batches = len(train_source) // batch_size

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch) in enumerate(
                helper.batch_data(train_source, train_target, batch_size)):
            start_time = time.time()

            # One optimisation step; sequence_length is fed with the width of this target batch.
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 sequence_length: target_batch.shape[1],
                 keep_prob: keep_probability})

            # Inference passes with dropout disabled (keep_prob 1.0); sequence_length is
            # not fed here, so the placeholder's default value applies.
            batch_train_logits = sess.run(
                inference_logits,
                {input_data: source_batch, keep_prob: 1.0})
            batch_valid_logits = sess.run(
                inference_logits,
                {input_data: valid_source, keep_prob: 1.0})

            train_acc = get_accuracy(target_batch, batch_train_logits)
            valid_acc = get_accuracy(np.array(valid_target), batch_valid_logits)
            end_time = time.time()
            print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.3f}, Validation Accuracy: {:>6.3f}, Loss: {:>6.3f}'
                  .format(epoch_i, batch_i, total_batches, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')