In [1]:
import tensorflow as tf
import numpy as np
import collections

# Clear the default graph before any ops are defined in this notebook.
tf.reset_default_graph()
# NOTE(review): this interactive session is not used explicitly by the cells below;
# the training and generation cells rebind `sess` to their own `tf.Session()`.
sess = tf.InteractiveSession()

# Import data

In [2]:
def get_data(data_path=None):
    """Load a text file and encode it as a sequence of character ids.

    The file is read in full, newline characters are removed and the text is
    lower-cased. Every distinct character is then given an integer id, ordered
    by descending frequency (ties broken by the character itself), so id 0 is
    the most frequent character.

    Args:
        data_path: string path to the text file to read.

    Returns:
        tuple (data, data_in_ids, char_to_id, id_to_char) where
        data: list of the characters of the cleaned text,
        data_in_ids: list of integer ids, one per entry of `data`,
        char_to_id: dict mapping character -> id,
        id_to_char: dict mapping id -> character.
        If the cleaned text is empty, all four containers are empty.
    """
    # The context manager closes the file handle even if read() fails.
    with tf.gfile.GFile(data_path, "r") as text_file:
        text = text_file.read()
    data = list(text.replace("\n", "").lower())

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Building the vocabulary from the pairs directly (instead of unpacking
    # zip(*count_pairs)) keeps this working when the text is empty.
    chars = [char for char, _ in count_pairs]
    char_to_id = {char: idx for idx, char in enumerate(chars)}
    id_to_char = dict(enumerate(chars))

    data_in_ids = [char_to_id[char] for char in data]
    return data, data_in_ids, char_to_id, id_to_char

In [3]:
# convert a one-hot (or score) vector into the character it encodes
def vec_to_char(vec, id_to_char):
    """Return the character whose id is the position of the largest entry of `vec`.

    Args:
        vec: array-like of scores (e.g. a one-hot vector), reduced with argmax along axis 0.
        id_to_char: mapping from integer id to character.
    """
    best_id = np.argmax(vec, axis=0)  # position of the highest score
    return id_to_char[best_id]

In [4]:
# Load the training corpus and derive the vocabulary size. n_chars is used
# below as both the one-hot input width and the number of LSTM units.
raw_data_chars, raw_data_ids, char_to_id, id_to_char = get_data("bible_1000.txt")
n_chars = len(char_to_id)
print(n_chars)

36


# Training

### Random batch function

In [5]:
# build a random batch from data
def get_batch(data, batch_size, time_steps, input_size):
    """Build a random batch of one-hot encoded (input, target) sequences.

    Each row is a window of `time_steps + 1` consecutive ids taken from `data`
    at a uniformly random start position. The inputs are the first
    `time_steps` entries of the window and the targets are the last
    `time_steps` entries, i.e. the inputs shifted one step ahead.

    Args:
        data: sequence of integer ids, each in [0, input_size).
        batch_size: number of rows in the batch.
        time_steps: length of each input/target sequence.
        input_size: width of the one-hot encoding.

    Returns:
        tuple (inputs, targets), each a float array of shape
        (batch_size, time_steps, input_size).

    Raises:
        ValueError: if `data` holds fewer than `time_steps + 1` ids.
    """
    # Number of valid start positions; a window needs time_steps + 1 ids.
    n_starts = len(data) - time_steps
    if n_starts < 1:
        raise ValueError(
            "data must contain at least time_steps + 1 = %d ids, got %d"
            % (time_steps + 1, len(data)))

    one_hot = np.eye(input_size)  # lookup table, built once instead of once per row
    batch = np.zeros([batch_size, time_steps + 1, input_size])
    for row in range(batch_size):
        t0 = np.random.randint(0, n_starts)  # starting time (upper bound exclusive)
        batch[row, :, :] = one_hot[data[t0:t0 + time_steps + 1]]
    return batch[:, :-1, :], batch[:, 1:, :]

### Parameters

In [6]:
# Input size
batch_size = 10   # number of sequences drawn per training batch
time_steps = 200  # characters per sequence; also the fixed time dimension of the X/Y placeholders
#epochs = 1 # not considering epoch now

# max number of iterations (one random batch per iteration)
iterations = 500

# learning rate passed to the Adam optimizer
# NOTE(review): 0.1 is far above Adam's usual default of 0.001 - confirm this is intended.
learning_rate = 0.1

### Set up RNN

In [7]:
# Input / Output(target)
# Both are one-hot sequences of shape (batch, time_steps, n_chars); the batch
# dimension is left open (None) so training and generation can feed different batch sizes.
X = tf.placeholder(tf.float32, [None, time_steps, n_chars])
Y = tf.placeholder(tf.float32, [None, time_steps, n_chars])

In [8]:
# Define an LSTM cell with n_chars units, so each step's output has the same
# width as a one-hot encoded character
cell = tf.nn.rnn_cell.LSTMCell(n_chars)

# Run the LSTM cell over X; `outputs` is what the loss and the generation cell consume
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

In [9]:
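# Final output layer (currently disabled: the loss is computed directly on the LSTM outputs)
#dense = tf.layers.dense(inputs=outputs, units=n_chars, activation=None)

# Class probabilities (currently disabled together with the dense layer above)
#probs = tf.nn.softmax(dense)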

### Define loss and optimizer

In [10]:
# Loss & optimizer
# NOTE(review): the disabled cross-entropy line passes `probs` (the softmax output of the
# disabled output layer) as `logits`; if it is re-enabled it should probably receive the
# un-normalised dense output instead - confirm before switching losses.
#loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=probs))
# Active loss: mean squared error between the LSTM outputs and the one-hot targets.
loss = tf.reduce_mean(tf.square(outputs - Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Training op: one optimizer step on `loss`
train = optimizer.minimize(loss)

# Initializer (created after minimize() so the optimizer's own variables are included)
init = tf.global_variables_initializer()

# Save model
saver = tf.train.Saver()

### Run training

In [11]:
# Run the model: `iterations` optimizer steps, each on a freshly sampled random batch,
# then write a checkpoint under ./model/.
# Note: `sess` here rebinds the name created by tf.InteractiveSession() in the first cell.
with tf.Session() as sess:
    sess.run(init)
    
    for iteration in range(iterations):
        
        x_batch, y_batch = get_batch(
            data=raw_data_ids,
            batch_size=batch_size,
            time_steps=time_steps,
            input_size=n_chars)
        
        sess.run(train, feed_dict={X: x_batch, Y: y_batch})
        
        # Every 10th iteration, report the loss on the batch that was just trained on
        # (evaluated after the update step, so it is not a held-out measure).
        if iteration % 10 == 0:
            loss_ = loss.eval(feed_dict={X: x_batch, Y: y_batch})
            print(iteration, loss_)
    
    # The generation cell restores from this same path.
    saver.save(sess, "./model/")

0 0.031239092
10 0.023259386
20 0.021676524
30 0.020824842
40 0.020563284
50 0.01998036
60 0.019990576
70 0.019611323
80 0.019201703
90 0.019306494
100 0.018365458
110 0.018932305
120 0.01885848
130 0.01912328
140 0.018751925
150 0.018605972
160 0.018472342
170 0.017465238
180 0.018234428
190 0.017806942
200 0.018157022
210 0.018278059
220 0.01760551
230 0.017699609
240 0.017951427
250 0.01727437
260 0.01785046
270 0.017936451
280 0.017850341
290 0.01729503
300 0.017437464
310 0.01714337
320 0.017585019
330 0.01791961
340 0.017491262
350 0.017071426
360 0.017468138
370 0.0174489
380 0.017486533
390 0.017523734
400 0.017611077
410 0.017182712
420 0.016658604
430 0.016616503
440 0.016483497
450 0.016781338
460 0.016957931
470 0.017228471
480 0.017501267
490 0.017206803


# Generate new text

In [12]:
# Read the seed text with the same cleaning as the training corpus. Only the
# character list is kept: the ids must come from the training vocabulary
# (char_to_id), not from a vocabulary rebuilt on the seed file.
seed_chars, _, _, _ = get_data("bible_seed.txt")
seed_chars = seed_chars[:time_steps]  # keep at most time_steps characters, the width of the X placeholder
# Raises KeyError if the seed contains a character absent from the training vocabulary.
seed_ids = [char_to_id[char] for char in seed_chars]
seed_sentence = "".join(seed_chars)
print(seed_sentence)

these men are peaceable with us; therefore let them dwell in the land, and trade therein; for the land, behold, it is large enough for them; let us take their daughters to us for wives, and let us giv


In [13]:
# Number of characters to generate after the seed.
pred_iterations = 100

# Keep the running sequence as a plain Python list: appending is in place,
# whereas np.append copies the whole array on every step and silently turns
# the list into an ndarray.
new_seed_ids = list(seed_ids)

# The X placeholder has a fixed time dimension, so a shorter seed cannot be fed.
if len(new_seed_ids) < time_steps:
    raise ValueError(
        "seed must contain at least time_steps = %d characters, got %d"
        % (time_steps, len(new_seed_ids)))

one_hot = np.eye(n_chars)  # one-hot lookup table, built once instead of once per step

with tf.Session() as sess:

    # Load the weights written by the training cell.
    saver.restore(sess, "./model/")

    for iteration in range(pred_iterations):
        print(iteration)
        # Feed the most recent time_steps ids as a single-row batch.
        x_pred_ids = new_seed_ids[-time_steps:]
        x_pred = one_hot[x_pred_ids].reshape(1, time_steps, n_chars)
        # These are the LSTM outputs (no softmax is applied in this graph),
        # so despite the name they are scores rather than probabilities.
        pred_probs = sess.run(outputs, feed_dict={X: x_pred})
        # Greedy decoding: take the highest-scoring id at the last time step.
        pred_last_index = int(np.argmax(pred_probs[0, -1, :]))
        new_seed_ids.append(pred_last_index)

INFO:tensorflow:Restoring parameters from ./model/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [14]:
# Decode the seed plus the generated ids back into text.
final_sentence = "".join(id_to_char[char_id] for char_id in new_seed_ids)
print(final_sentence)

these men are peaceable with us; therefore let them dwell in the land, and trade therein; for the land, behold, it is large enough for them; let us take their daughters to us for wives, and let us give the said, and the said, and the said, and the said, and the said, and the said, and the said, and 
