In [1]:
import tensorflow as tf
import numpy as np
import collections

tf.reset_default_graph()
sess = tf.InteractiveSession()

# Import data

In [2]:
def get_data(data_path=None):
    """Load raw data from data directory "data_path".
    Reads text file, converts strings to integer ids
    Args:
    data_path: string path to the directory
    Returns:
    tuple (raw_data, vocabulary)
    """
  
    data = tf.gfile.GFile(data_path, "r").read().replace("\n", "<eos>").split()
    
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))

    data_in_ids = [word_to_id[word] for word in data if word in word_to_id]
    return data, data_in_ids, word_to_id, id_to_word

In [3]:
# converting a one-hot array into a word
def vec_to_word(vec, id_to_word):
    index = np.argmax(vec, axis=0) # get the index of the most probable word
    word = id_to_word[index]
    return word

In [4]:
raw_data_words, raw_data_ids, word_to_id, id_to_word = get_data("train.txt")
n_words = len(word_to_id)
print(n_words)

745


# Training

### Random batch function

In [5]:
# build a random batch from data
def get_batch(data, batch_size, time_steps, input_size):
    batch = np.zeros([batch_size, time_steps+1, input_size])
    for row in range(batch_size):
        t0 = np.random.randint(0, len(data)-time_steps) # starting time
        batch[row, :, :] = np.eye(input_size)[data[t0:t0+time_steps+1]]
    return batch[:, :-1, :], batch[:, 1:, :]

### Parameters

In [6]:
# Input size
batch_size = 5
time_steps = 10
#epochs = 1 # not considering epoch now

# max number of iterations
iterations = 200

# learning rate
n_neurons = 5
learning_rate = 0.1

### Set up RNN

In [7]:
# Input / Output(target)
X = tf.placeholder(tf.float32, [None, time_steps, n_words])
Y = tf.placeholder(tf.float32, [None, time_steps, n_words])

In [8]:
# Define a gru cell
cell = tf.nn.rnn_cell.GRUCell(n_words)
#cell = tf.contrib.rnn.OutputProjectionWrapper(
#    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu),
#    output_size=n_words)

# Get gru cell output
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# Final output layer
#dense = tf.layers.dense(inputs=outputs, units=n_words, activation=None)

# Class probabilities
#probs = tf.nn.softmax(dense)

### Define loss and optimizer

In [9]:
# Loss & optimizer
#loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=probs))
loss = tf.reduce_mean(tf.square(outputs - Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Training
train = optimizer.minimize(loss)

# Initializer
init = tf.global_variables_initializer()

# Save model
saver = tf.train.Saver()

### Run training

In [10]:
# Run the model
with tf.Session() as sess:
    sess.run(init)
    
    for iteration in range(iterations):
        
        x_batch, y_batch = get_batch(
            data=raw_data_ids,
            batch_size=batch_size,
            time_steps=time_steps,
            input_size=n_words)
        
        sess.run(train, feed_dict={X: x_batch, Y: y_batch})
        
        if iteration % 10 == 0:
            loss_ = loss.eval(feed_dict={X: x_batch, Y: y_batch})
            print(iteration, loss_)
    
    saver.save(sess, "./model")

0 0.004818327
10 0.2683465
20 0.27807683
30 0.26325405
40 0.24272978
50 0.23682836
60 0.21883845
70 0.21479118
80 0.18828242
90 0.18007444
100 0.18945682
110 0.18911353
120 0.28000924
130 0.16535111
140 0.15931684
150 0.16522744
160 0.15099347
170 0.1486667
180 0.13406016
190 0.10219694


# Generate new text

Use this sentence to start the prediction: "N years ago the marketing managers were expected researchers."

In [11]:
sentence = ['N', 'years', 'ago', 'the', 'marketing', 'managers', 'were', 'expected', 'researchers', '<eos>']
sentence_ids = [word_to_id[word] for word in sentence]
seed = np.eye(n_words)[sentence_ids].reshape(1, time_steps, n_words)

In [17]:
seed.shape

(1, 10, 745)

In [13]:
n_generated_words = 10

In [16]:
with tf.Session() as sess:
    
    saver.restore(sess, "./model")
    
    for iteration in range(n_generated_words):
        x_pred = seed[:, -time_steps:, :]
        y_pred = sess.run(outputs, feed_dict={X: x_pred})
        seed = np.append(seed, y_pred[:, -1, :], axis=1)

INFO:tensorflow:Restoring parameters from ./model


ValueError: all the input arrays must have same number of dimensions