In [1]:
import tensorflow as tf
import numpy as np
import collections

tf.reset_default_graph()
sess = tf.InteractiveSession()

# Import data

In [2]:
def get_data(data_path=None):
    """Load raw data from data directory "data_path".
    Reads text file, converts strings to integer ids
    Args:
    data_path: string path to the directory
    Returns:
    tuple (raw_data, vocabulary)
    """
  
    data = tf.gfile.GFile(data_path, "r").read().replace("\n", "<eos> ").replace(".", "").replace(",", "").replace(";", "").split()    
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))

    data_in_ids = [word_to_id[word] for word in data if word in word_to_id]
    return data, data_in_ids, word_to_id, id_to_word

In [3]:
# converting a one-hot array into a word
def vec_to_word(vec, id_to_word):
    index = np.argmax(vec, axis=0) # get the index of the most probable word
    word = id_to_word[index]
    return word

In [4]:
raw_data_words, raw_data_ids, word_to_id, id_to_word = get_data("bible_100.txt")
n_words = len(word_to_id)
print(n_words)

643


# Training

### Random batch function

In [5]:
# build a random batch from data
def get_batch(data, batch_size, time_steps, input_size):
    batch = np.zeros([batch_size, time_steps+1, input_size])
    for row in range(batch_size):
        t0 = np.random.randint(0, len(data)-time_steps) # starting time
        batch[row, :, :] = np.eye(input_size)[data[t0:t0+time_steps+1]]
    return batch[:, :-1, :], batch[:, 1:, :]

### Parameters

In [6]:
# Input size
batch_size = 10
time_steps = 10
#epochs = 1 # not considering epoch now

# max number of iterations
iterations = 200

# learning rate
learning_rate = 0.1

### Set up RNN

In [7]:
# Input / Output(target)
X = tf.placeholder(tf.float32, [None, time_steps, n_words])
Y = tf.placeholder(tf.float32, [None, time_steps, n_words])

In [8]:
# Define a gru cell
cell = tf.nn.rnn_cell.GRUCell(n_words)

# Get gru cell output
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

### Define loss and optimizer

In [9]:
# Loss & optimizer
loss = tf.reduce_mean(tf.square(outputs - Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Training
train = optimizer.minimize(loss)

# Initializer
init = tf.global_variables_initializer()

# Save model
saver = tf.train.Saver()

### Run training

In [10]:
# Run the model
with tf.Session() as sess:
    sess.run(init)
    
    for iteration in range(iterations):
        
        x_batch, y_batch = get_batch(
            data=raw_data_ids,
            batch_size=batch_size,
            time_steps=time_steps,
            input_size=n_words)
        
        sess.run(train, feed_dict={X: x_batch, Y: y_batch})
        
        if iteration % 10 == 0:
            loss_ = loss.eval(feed_dict={X: x_batch, Y: y_batch})
            print(iteration, loss_)
    
    saver.save(sess, "./model")

0 0.0046547707
10 0.036068976
20 0.2511616
30 0.24094781
40 0.15811719
50 0.075291894
60 0.0680719
70 0.036473114
80 0.04251855
90 0.105542675
100 0.035253294
110 0.03041741
120 0.09842419
130 0.016475076
140 0.01389665
150 0.009611361
160 0.009774543
170 0.09145613
180 0.09465058
190 0.011495933


# Generate new text

Use this sentence to start the prediction: "N years ago the marketing managers were expected researchers."

In [11]:
seed_ids = np.random.randint(0, n_words, 10)
seed_sentence = [id_to_word[id] for id in seed_ids]
print(seed_sentence)
initial_seed = np.eye(n_words)[seed_ids].reshape(1, time_steps, n_words)

['heaven', 'beginning', 'Woman,', 'way,', 'garden', 'waters.', 'breath', 'evil,', 'him.', 'eat:']


In [12]:
pred_iterations = 10
seed = initial_seed.copy()
with tf.Session() as sess:
    
    saver.restore(sess, "./model")
    
    for iteration in range(pred_iterations):
        print(iteration)
        x_pred = seed[:, -time_steps:, :]
        pred_probs = sess.run(outputs, feed_dict={X: x_pred})
        pred_last_index = np.argmax(pred_probs[:, -1:, :], axis=2)
        pred_last_word = id_to_word[pred_last_index[0][0]]
        seed_to_add = np.eye(n_words)[pred_last_index]
        seed = np.append(seed, seed_to_add, axis=1)

INFO:tensorflow:Restoring parameters from ./model
0
1
2
3
4
5
6
7
8
9


In [13]:
final_sentence_ids = np.argmax(seed, axis=2)
final_sentence = [id_to_word[id] for id in final_sentence_ids[0]]
print(final_sentence)

['heaven', 'beginning', 'Woman,', 'way,', 'garden', 'waters.', 'breath', 'evil,', 'him.', 'eat:', 'lights', 'kill', 'night;', 'earth', 'blessed', 'where', 'Earth;', 'kill', 'where', 'kill']
