In [1]:
import tensorflow as tf
import numpy as np
import collections

tf.reset_default_graph()
sess = tf.InteractiveSession()

# Import data

In [2]:
def get_data(data_path=None):
    """Load raw data from data directory "data_path".
    Reads text file, converts strings to integer ids
    Args:
    data_path: string path to the directory
    Returns:
    tuple (raw_data, vocabulary)
    """
  
    data = list(tf.gfile.GFile(data_path, "r").read().replace("\n", "").lower())
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    chars, _ = list(zip(*count_pairs))
    char_to_id = dict(zip(chars, range(len(chars))))
    id_to_char = dict(zip(range(len(chars)), chars))

    data_in_ids = [char_to_id[char] for char in data]
    return data, data_in_ids, char_to_id, id_to_char

In [3]:
# converting a one-hot array into a word
def vec_to_char(vec, id_to_char):
    index = np.argmax(vec, axis=0) # get the index of the most probable word
    char = id_to_char[index]
    return char

In [4]:
raw_data_chars, raw_data_ids, char_to_id, id_to_char = get_data("bible_1000.txt")
n_chars = len(char_to_id)
print(n_chars)

36


# Training

### Random batch function

In [None]:
# build a random batch from data
def get_batch(data, batch_size, time_steps, input_size):
    batch = np.zeros([batch_size, time_steps+1, input_size])
    for row in range(batch_size):
        t0 = np.random.randint(0, len(data)-time_steps) # starting time
        batch[row, :, :] = np.eye(input_size)[data[t0:t0+time_steps+1]]
    return batch[:, :-1, :], batch[:, 1:, :]

### Parameters

In [None]:
# Input size
batch_size = 10
time_steps = 200
#epochs = 1 # not considering epoch now

# max number of iterations
iterations = 500

# learning rate
learning_rate = 0.1

### Set up RNN

In [None]:
# Input / Output(target)
X = tf.placeholder(tf.float32, [None, time_steps, n_chars])
Y = tf.placeholder(tf.float32, [None, time_steps, n_chars])

In [None]:
# Define a gru cell
cell = tf.nn.rnn_cell.GRUCell(n_chars)

# Get gru cell output
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

In [None]:
# Final output layer
dense = tf.layers.dense(inputs=outputs, units=n_chars, activation=None)

# Class probabilities
probs = tf.nn.softmax(dense)

### Define loss and optimizer

In [None]:
# Loss & optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=probs))
#loss = tf.reduce_mean(tf.square(outputs - Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Training
train = optimizer.minimize(loss)

# Initializer
init = tf.global_variables_initializer()

# Save model
saver = tf.train.Saver()

### Run training

In [None]:
# Run the model
with tf.Session() as sess:
    sess.run(init)
    
    for iteration in range(iterations):
        
        x_batch, y_batch = get_batch(
            data=raw_data_ids,
            batch_size=batch_size,
            time_steps=time_steps,
            input_size=n_chars)
        
        sess.run(train, feed_dict={X: x_batch, Y: y_batch})
        
        if iteration % 10 == 0:
            loss_ = loss.eval(feed_dict={X: x_batch, Y: y_batch})
            print(iteration, loss_)
    
    saver.save(sess, "./model/")

0 3.560939
10 3.4405751
20 3.4376385
30 3.437124
40 3.4356363
50 3.441349
60 3.438039
70 3.428631
80 3.4326198
90 3.4306483
100 3.4311433
110 3.4341393
120 3.4286096
130 3.4355512
140 3.4276896
150 3.4371119
160 3.4471247
170 3.4362147
180 3.4350996
190 3.4346445
200 3.4281209
210 3.439109
220 3.443139
230 3.4431329


# Generate new text

In [None]:
seed_chars, _, _, _ = get_data("bible_seed.txt")
seed_chars = seed_chars[:time_steps]
seed_ids = [char_to_id[char] for char in seed_chars]
seed_sentence = "".join(seed_chars)
print(seed_sentence)

In [None]:
pred_iterations = 10
new_seed_ids = seed_ids.copy()
with tf.Session() as sess:
    
    saver.restore(sess, "./model/")
    
    for iteration in range(pred_iterations):
        print(iteration)
        x_pred_ids = new_seed_ids[-time_steps:]
        x_pred = np.eye(n_chars)[x_pred_ids].reshape(1, time_steps, n_chars)
        pred_probs = sess.run(outputs, feed_dict={X: x_pred})
        pred_last_index = np.argmax(pred_probs[:, -1:, :], axis=2)[0][0]
        pred_last_char = id_to_char[pred_last_index]
        new_seed_ids = np.append(new_seed_ids, pred_last_index)

In [None]:
final_sentence = "".join([id_to_char[id] for id in new_seed_ids])
print(final_sentence)