# change generator to split text more efficiently

In [1]:
import numpy as np
from lib.dictionarymd import Dictionary
from lib.textprocessingmd import convert_text
from lib.fileoperationmd import getFilesFromPath,readTxtFromFile

[nltk_data] Downloading package stopwords to /home/zein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class HyperParameters:
    """Static configuration shared by the cells of this notebook."""

    # --- corpus and vocabulary locations ---
    DATAPATH = './data/shakespeare/'
    VOCPATH = './preprocessed'
    VOCFILE = 'articles.voc'

    # --- sequence handling ---
    MAXLENGTH = 1000            # documents are cut to this many tokens before batching
    K = 3                       # beam width used by beam search

    # --- network size ---
    n_hidden_rnn = 200
    embedding_dim = 200

    # --- optimisation ---
    batch_size = 1
    n_epochs = 500
    learning_rate = 0.01
    learning_rate_decay = 1     # divisor applied after every epoch; 1 means no decay
    dropout_keep_probability = 0.6


hp = HyperParameters()

In [3]:
# Read the corpus: files under hp.DATAPATH are loaded and passed through convert_text.
filenames=getFilesFromPath(hp.DATAPATH)
dataset=[]  # one converted text per loaded file
text=""     # concatenation of the converted texts, used below to build the vocabulary
for fn in filenames:
    ntext=convert_text(readTxtFromFile(hp.DATAPATH,fn))
    text+=ntext
    dataset.append(ntext)
    break  # NOTE(review): stops after the first file, so only one document is loaded -- confirm this is intended

# Build the vocabulary from the loaded text, then load it into the dictionary.
# (presumably make_vocab writes hp.VOCFILE under hp.VOCPATH and load_vocab reads it back -- TODO confirm)
dictionary=Dictionary()
dictionary.make_vocab(text,hp.VOCPATH,hp.VOCFILE)
dictionary.load_vocab(hp.VOCPATH,hp.VOCFILE)

In [4]:
# Note: every batch is padded to the length of its own longest document, so different batches may have different widths.
def batches_generator(batch_size, docs,dictionary,
                      shuffle=False, allow_smaller_last_batch=True,
                      max_length=None):
    """Yield padded next-token-prediction batches built from `docs`.

    Every document is converted with `dictionary.text2vec` and cut to
    `max_length` tokens. For a token vector `v` the input row is `v[:-1]`
    and the target row is `v[1:]`.

    Parameters
        batch_size: number of documents per batch; must be >= 1.
        docs: sized collection of documents accepted by `dictionary.text2vec`.
        dictionary: object providing `text2vec(doc)` and `word2idx['<PAD>']`.
        shuffle: if True the documents are visited in random order,
            otherwise in order of increasing token count.
        allow_smaller_last_batch: if True a final, smaller batch is yielded
            when `len(docs)` is not a multiple of `batch_size`.
        max_length: maximum number of tokens kept per document; defaults to
            `hp.MAXLENGTH` when None.

    Yields
        (x, y, lengths): `x` and `y` are int32 arrays of shape
        [current_batch_size, longest token vector in the batch] padded with
        the `<PAD>` index; `lengths` holds the number of real entries per row.

    Raises
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
    if max_length is None:
        max_length = hp.MAXLENGTH

    pad_index = dictionary.word2idx['<PAD>']
    n_samples = len(docs)
    vecs = [dictionary.text2vec(doc)[:max_length] for doc in docs]

    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        # Visiting documents by increasing length keeps similar lengths together.
        order = np.argsort([len(vec) for vec in vecs])

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min(batch_start + batch_size, n_samples)
        current_batch_size = batch_end - batch_start

        batch_vecs = [vecs[idx] for idx in order[batch_start:batch_end]]
        x_list = [vec[:-1] for vec in batch_vecs]
        y_list = [vec[1:] for vec in batch_vecs]

        # The width is taken from the full vectors, which are one token longer
        # than the shifted rows, so the longest row keeps one trailing <PAD>.
        # This is kept as-is so the array shapes seen by callers do not change.
        max_len_token = max([len(vec) for vec in batch_vecs] + [0])

        # Arrays start out filled with the padding index.
        x = np.full([current_batch_size, max_len_token], pad_index, dtype=np.int32)
        y = np.full([current_batch_size, max_len_token], pad_index, dtype=np.int32)
        lengths = np.zeros(current_batch_size, dtype=np.int32)

        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            x[n, :utt_len] = x_list[n]
            y[n, :utt_len] = y_list[n]
            lengths[n] = utt_len
        yield x, y, lengths

## Build a recurrent neural network

In [5]:
# modifications
# 1. add training/inference variable
# 2. define placeholder for state [NONE,2*stateLength]
# 3. if inference use state placeholder
# 4. modify beam search
# 5. try to use built-in beam search

In [6]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [7]:
class LSTMModel:
    """Empty shell; its methods are attached to the class after this definition."""

In [8]:
def declare_placeholders(self):
    """Create the input placeholders of the model and store them on `self`.

    Attributes set: `input_batch`, `ground_truth_tags`, `initial_state_h`,
    `initial_state_c`, `lengths`, `dropout_ph` and `learning_rate_ph`.
    """
    token_shape = [None, None]
    state_shape = [None, hp.n_hidden_rnn]

    # Token ids fed to the network and the ids it is trained to predict.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=token_shape, name='input_batch')
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=list(token_shape), name='ground_truth_tags')

    # Starting LSTM state, one row of hp.n_hidden_rnn values per sequence.
    self.initial_state_h = tf.placeholder(dtype=tf.float32, shape=state_shape, name='initial_state_h')
    self.initial_state_c = tf.placeholder(dtype=tf.float32, shape=list(state_shape), name='initial_state_c')

    # One length per sequence in the batch.
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')

    # Dropout keep probability; evaluates to 1.0 whenever nothing is fed.
    keep_everything = tf.cast(1.0, dtype=tf.float32)
    self.dropout_ph = tf.placeholder_with_default(keep_everything, shape=[])

    # Scalar learning rate supplied at every training step.
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_ph')

In [9]:
# Attached as a classmethod: inside declare_placeholders `self` is the LSTMModel
# class itself, so the placeholders end up as class attributes.
setattr(LSTMModel, '__declare_placeholders', classmethod(declare_placeholders))

In [10]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Build the embedding -> LSTM -> dense part of the graph.

    The recurrent part is a single forward LSTM run with tf.nn.dynamic_rnn;
    no backward cell is created.

    Parameters
        vocabulary_size: number of rows of the embedding matrix and number of
            output units of the dense layer.
        embedding_dim: width of each embedding vector.
        n_hidden_rnn: number of units of the LSTM cell.
        n_tags: not used by this function.

    Sets `self.states` (final LSTM state) and `self.logits`.
    """
    # Embedding table, initialised from a standard normal scaled by 1/sqrt(embedding_dim).
    embedding_init = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_table = tf.Variable(initial_value=embedding_init, dtype=tf.float32, name='embeddings_matrix')

    # One LSTM cell; the same keep probability is applied to inputs, outputs and state.
    keep_prob = self.dropout_ph
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn)
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(
        cell=lstm_cell,
        input_keep_prob=keep_prob,
        output_keep_prob=keep_prob,
        state_keep_prob=keep_prob,
        dtype=tf.float32)

    # Embedding vector for every token of self.input_batch.
    embeddings = tf.nn.embedding_lookup(embedding_table, self.input_batch)

    # The recurrence starts from the fed state; the tuple is built as (c, h).
    start_state = tf.nn.rnn_cell.LSTMStateTuple(self.initial_state_c, self.initial_state_h)
    rnn_output, self.states = tf.nn.dynamic_rnn(
        cell=forward_cell,
        inputs=embeddings,
        initial_state=start_state,
        sequence_length=self.lengths,
        dtype=tf.float32)

    # Linear projection of every time step onto vocabulary_size scores.
    self.logits = tf.layers.dense(rnn_output, vocabulary_size, activation=None)

In [11]:
# Attached as a classmethod: build_layers receives the LSTMModel class as `self`.
setattr(LSTMModel, '__build_layers', classmethod(build_layers))

In [12]:
def compute_predictions(self):
    """Turn the logits into probabilities and pick the most probable index.

    Sets `self.softmax_output` and `self.predictions`.
    """
    probabilities = tf.nn.softmax(logits=self.logits)
    self.softmax_output = probabilities

    # The arg-max must run over the last axis, i.e. over the scores of one position.
    self.predictions = tf.argmax(probabilities, axis=-1)

In [13]:
# Attached as a classmethod: compute_predictions receives the LSTMModel class as `self`.
setattr(LSTMModel, '__compute_predictions', classmethod(compute_predictions))

In [14]:
def compute_loss(self, vocabulary_size, PAD_index):
    """Compute the masked cross-entropy loss of the logits and store it in `self.loss`.

    Parameters
        vocabulary_size: kept for interface compatibility; not needed because
            sequence_loss takes integer targets rather than one-hot vectors.
        PAD_index: index of the padding token; positions whose *input* token
            equals it get weight 0 in the loss.
    """
    # Weight 1.0 for real positions, 0.0 for padded ones.
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), dtype=tf.float32)

    loss_tensor = tf.contrib.seq2seq.sequence_loss(
        logits=self.logits,
        targets=self.ground_truth_tags,
        weights=mask)

    # NOTE(review): with sequence_loss's default averaging this is presumably
    # already a scalar, making reduce_mean a no-op -- kept to preserve the graph.
    self.loss = tf.reduce_mean(loss_tensor)

In [15]:
# Attached as a classmethod: compute_loss receives the LSTMModel class as `self`.
setattr(LSTMModel, '__compute_loss', classmethod(compute_loss))

In [16]:
def perform_optimization(self):
    """Create the Adam optimizer, clip the gradients and define `self.train_op`.

    Sets `self.optimizer`, `self.grads_and_vars` (after clipping) and
    `self.train_op`.
    """
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)

    # Each gradient tensor is clipped on its own to an L2 norm of at most 1.0.
    # compute_gradients may pair a variable with None when the loss does not
    # depend on it; such entries are passed through untouched because
    # tf.clip_by_norm cannot take None.
    clip_norm = tf.cast(1.0, dtype=tf.float32)
    self.grads_and_vars = [
        (tf.clip_by_norm(grad, clip_norm) if grad is not None else None, var)
        for grad, var in self.grads_and_vars
    ]

    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [17]:
# Attached as a classmethod: perform_optimization receives the LSTMModel class as `self`.
setattr(LSTMModel, '__perform_optimization', classmethod(perform_optimization))

In [18]:
def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    """Assemble the complete graph by running the five build steps in order.

    Parameters
        vocabulary_size: forwarded to the layer builder and to the loss.
        n_tags: forwarded to the layer builder.
        embedding_dim: forwarded to the layer builder.
        n_hidden_rnn: forwarded to the layer builder.
        PAD_index: forwarded to the loss, which masks padded positions.
    """
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    # The loss step's first parameter is the vocabulary size, so it gets
    # vocabulary_size rather than n_tags.
    self.__compute_loss(vocabulary_size, PAD_index)
    self.__perform_optimization()

In [19]:
# The constructor is attached as a classmethod too: init_model runs with the
# LSTMModel class as `self`, so the graph tensors are stored on the class.
setattr(LSTMModel, '__init__', classmethod(init_model))

In [20]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    """Run one optimisation step on a batch, starting the LSTM from an all-zero state.

    `lengths` must expose `.shape[0]` (the number of sequences in the batch);
    it determines the number of rows of the zero initial states.
    """
    state_shape = (lengths.shape[0], hp.n_hidden_rnn)

    feed_dict = {}
    feed_dict[self.input_batch] = x_batch
    feed_dict[self.ground_truth_tags] = y_batch
    feed_dict[self.initial_state_h] = np.zeros(state_shape)
    feed_dict[self.initial_state_c] = np.zeros(state_shape)
    feed_dict[self.learning_rate_ph] = learning_rate
    feed_dict[self.dropout_ph] = dropout_keep_probability
    feed_dict[self.lengths] = lengths

    session.run(self.train_op, feed_dict=feed_dict)

In [21]:
# Attached as a classmethod: train_on_batch reads the graph tensors from the LSTMModel class.
setattr(LSTMModel, 'train_on_batch', classmethod(train_on_batch))

In [22]:
def predict_for_batch(self, session, x_batch,init_c,init_h):
    """Run the network on `x_batch` from the given LSTM state.

    Parameters
        session: object whose `run(fetches, feed_dict=...)` evaluates the graph.
        x_batch: rectangular batch of token ids, one row per sequence.
        init_c, init_h: initial cell and hidden state, one row per sequence.

    Returns
        (softmax, states): the values of `self.softmax_output` and
        `self.states` for this batch.
    """
    # Every row is processed in full, so each sequence length is simply the
    # number of time steps in the batch (no arbitrary "large enough" sentinel).
    tokens = np.asarray(x_batch)
    n_steps = tokens.shape[1] if tokens.ndim > 1 else 0
    lengths = np.full(len(x_batch), n_steps, dtype=np.int32)

    feed_dict = {self.input_batch: x_batch,
                 self.initial_state_h: init_h,
                 self.initial_state_c: init_c,
                 self.lengths: lengths}
    softmax, states = session.run([self.softmax_output, self.states], feed_dict=feed_dict)
    return softmax, states

In [23]:
# Attached as a classmethod: predict_for_batch reads the graph tensors from the LSTMModel class.
setattr(LSTMModel, 'predict_for_batch', classmethod(predict_for_batch))

# Train

In [24]:
# Clear the default graph first, so that re-running this cell does not add a
# second copy of the model to the existing graph.
tf.reset_default_graph()

# Inputs and outputs both range over the vocabulary, so the same size is
# passed for vocabulary_size and n_tags.
model = LSTMModel(
    vocabulary_size=len(dictionary.word2idx),
    n_tags=len(dictionary.word2idx),
    embedding_dim=hp.embedding_dim,
    n_hidden_rnn=hp.n_hidden_rnn,
    PAD_index=dictionary.word2idx['<PAD>'],
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [25]:
# Open a session and run the initializer of all global variables.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Path prefix given to saver.save when checkpoints are written during training.
model_checkpoint = './model.chkpt'
saver = tf.train.Saver()

In [26]:
class Beam:
    def __init__(self,K):
        self.state_c=[]
        self.state_h=[]
        self.probabilities=[]
        self.lastOutput=[]
        self.history=[]
        self.K=K
    def addBeam(self,lastOutput,prob,state_c,state_h,history):
        self.lastOutput.append(lastOutput)
        self.probabilities.append(prob)
        self.state_c.append(state_c)
        self.state_h.append(state_h)
        current_hist=history.copy()
        current_hist.extend(lastOutput)
        self.history.append(current_hist)
    def getTopK(self):
        topK=np.argsort(self.probabilities)[-self.K:]
        lenTop=len(topK)
        state_c,state_h=np.zeros((lenTop,200)),np.zeros((lenTop,200))
        prob=[]
        lastOutput=[]
        history=[]
        for i,k in enumerate(topK):
            lastOutput.append(self.lastOutput[k])
            history.append(self.history[k])
            prob.append(self.probabilities[k])
            state_c[i,:]=self.state_c[k]
            state_h[i,:]=self.state_h[k]
        return lastOutput,prob,state_c,state_h,history


In [27]:
# test Beam
# Sanity check: add four hypotheses to a beam of width 3 and verify that
# getTopK returns the three most probable ones in ascending order of probability.
beam=Beam(3)
vec=dictionary.text2vec('Hi')  # the asserts below read index 2, so this is expected to yield two tokens
prob=1  # NOTE(review): not used anywhere in this cell
state_c,state_h=np.zeros((1,200)),np.zeros((1,200))
# Each hypothesis gets a distinct last token (0..3) and distinct state values (+0..+3)
# so that the selected entries can be told apart.
beam.addBeam(vec+[0],0.8,state_c,state_h,[])
beam.addBeam(vec+[1],0.9,state_c+1,state_h+1,[])
beam.addBeam(vec+[2],0.1,state_c+2,state_h+2,[])
beam.addBeam(vec+[3],0.96,state_c+3,state_h+3,[])
lastOutputs,probs,state_c,state_h,history=beam.getTopK()

# The hypothesis with probability 0.1 (token 2) must have been dropped.
assert (lastOutputs[0][2]==0 and lastOutputs[1][2]==1 and lastOutputs[2][2]==3)
assert probs==[0.8, 0.9, 0.96]
assert state_c[0,0]==0 and state_c[1,0]==1 and state_c[2,0]==3 
assert state_h[0,0]==0 and state_h[1,0]==1 and state_h[2,0]==3 
lastOutputs

[[2, 1, 0], [2, 1, 1], [2, 1, 3]]

In [28]:
# modification
# add session,model to parameters
def beam_search(num_generated,seed):
    """Generate text after `seed` with beam search of width `hp.K`.

    Uses the notebook-level `model`, `sess`, `dictionary` and `hp`.

    Hypotheses are ranked by the sum of the log-probabilities of their
    generated tokens. This gives the same ranking as the product of the
    probabilities, but does not underflow to 0 on long generations.

    Parameters
        num_generated: number of tokens to be generated.
        seed: initial sentence of the text.
    return: token histories of the (at most hp.K) best hypotheses, ordered
            from worst to best, so the last entry is the top candidate.
    """
    state_size=hp.n_hidden_rnn
    smallest=np.finfo(float).tiny  # floor that keeps log() finite for zero probabilities

    beam=Beam(hp.K)
    seed_vec=dictionary.text2vec(seed)
    # Single starting hypothesis: the seed tokens, an all-zero state, log-prob 0.
    beam.addBeam(seed_vec,0.0,np.zeros((1,state_size)),np.zeros((1,state_size)),[])

    for _ in range(num_generated):
        lastOutputs,scores,state_c,state_h,history=beam.getTopK()

        # softmax: one distribution per hypothesis and time step;
        # states: the (c, h) pair reached after consuming lastOutputs.
        softmax,states=model.predict_for_batch(sess,lastOutputs,state_c,state_h)
        state_c,state_h=states

        next_token_probs=softmax[:,-1]  # distribution following the last fed token
        topK=np.argsort(next_token_probs)[:,-hp.K:]

        next_beam=Beam(hp.K)
        for row in range(topK.shape[0]):
            for col in range(topK.shape[1]):
                cand=topK[row,col]
                # Token index 1 is never generated.
                # NOTE(review): presumably the unknown-word index -- confirm against the vocabulary.
                if cand==1:
                    continue
                token_prob=max(float(next_token_probs[row,cand]),smallest)
                score=scores[row]+float(np.log(token_prob))
                next_beam.addBeam([cand],score,state_c[row,:],state_h[row,:],history[row])

        # If every candidate was skipped there is nothing to extend; keep the
        # hypotheses found so far instead of querying the model with an empty batch.
        if not next_beam.probabilities:
            break
        beam=next_beam

    lastOutputs,scores,state_c,state_h,history=beam.getTopK()
    return history

In [29]:
%time
def getBestCandidate(num_tokens,seed):
    """Generate `num_tokens` tokens after `seed` and return the last hypothesis of beam_search as text."""
    candidates = beam_search(num_tokens, seed)
    best_tokens = candidates[-1]
    return dictionary.vec2text(best_tokens)
#getBestCandidate(500,'just')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.34 µs


In [30]:
from IPython.display import clear_output

learning_rate=hp.learning_rate
# batches_generator also yields a smaller last batch, so the number of batches
# per epoch is the ceiling of len(dataset) / batch_size.
n_batches_per_epoch=-(-len(dataset)//hp.batch_size)

print('Start training... \n')
results=""  # accumulated sample generations, re-printed with every progress line
for epoch in range(hp.n_epochs):
    # Every 10th epoch, sample a short continuation to watch the model improve.
    if epoch%10==0:
        newc=getBestCandidate(10,"How")
        results+="\nEpoch {}: {}".format(epoch,newc)
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(hp.n_epochs) + '-' * 20+results)

    counter=0
    # Train the model on every batch of the dataset.
    for x_batch, y_batch, lengths in batches_generator(hp.batch_size, dataset,dictionary):
        # Replace the previous progress line instead of appending to the output.
        clear_output(wait=True)
        counter=counter+1
        print("Epoch {}/{}.batch {}/{} {}".format(epoch+1,hp.n_epochs,counter,n_batches_per_epoch,results))
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, hp.dropout_keep_probability)
    saver.save(sess, model_checkpoint, global_step=epoch,write_meta_graph=False)
    # Decaying the learning rate
    learning_rate = learning_rate / hp.learning_rate_decay

print('...training finished.')

ُEboch 500/500.batch 1/1 
Epoch 0: <START> How graced JOHN silence enemies unmoan caitiff unmoan unmoan for kings
Epoch 10: <START> How #tab #tab #tab #tab #tab #tab #tab #tab #tab #tab
Epoch 20: <START> How #tab #tab #tab #tab #tab #tab #tab #tab #tab #tab
Epoch 30: <START> How #tab ( : ) #tab ( : ) #tab (
Epoch 40: <START> How #tab ( : ) #tab ( of : ) #tab
Epoch 50: <START> How #tab ( : ) #tab ( : ) #tab (
Epoch 60: <START> How #tab ( Ghost of . ( : ) #tab (
Epoch 70: <START> How #tab ( Ghost of Edward : ) #tab ( Ghost
Epoch 80: <START> How #tab ( Ghost of York . ( Ghost of York
Epoch 90: <START> How #tab ( Second Citizen : ) #tab ( Ghost of
Epoch 100: <START> How #tab afterwards King Edward IV . ( QUEEN MARGARET :
Epoch 110: <START> How #tab ( Ghost of York . ( QUEEN MARGARET :
Epoch 120: <START> How #tab ( Ghost of Wales , ( RICHMOND : )
Epoch 130: <START> How #tab ( Ghost of York . ( QUEEN MARGARET :
Epoch 140: <START> How #tab ( Ghost of York . ( Boy : )
Epoch 150: <START> How #t

In [34]:
%time
# Generate text of 1000 words
print(getBestCandidate(1000,'How'))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.3 µs
<START> How KING RICHARD III #tab Duke of Gloucester , ( GLOUCESTER : ) #tab | Brothers to #tab #tab | ( GREY #tab ( GREY : ) #tab | ( Gentleman : ) Called also EARL of SURREY #tab His son . ( SURREY : ) Called also EARL of SURREY #tab His son . ( SURREY : ) #tab | Gentlemen attending on the Lady Anne . ( Lord Mayor of Wales , ( PRINCE EDWARD : ) #tab | Sons to Elizabeth . ( RIVERS #tab ( GREY : ) Called also EARL of DERBY . ( DERBY : ) Called also EARL of DERBY . ( SURREY : ) Called also EARL of DERBY . ( DERBY : ) #tab | Gentlemen attending on the Lady Anne . ( Lord Mayor of Wales , ( PRINCE EDWARD : ) #tab | Sons to Elizabeth . ( RIVERS #tab ( GREY : ) Called also EARL of DERBY . ( DERBY : ) Called also EARL of DERBY . ( SURREY : ) Called also EARL of DERBY . ( DERBY : ) #tab | Gentlemen attending on the Lady Anne . ( Lord Mayor of Wales , ( PRINCE EDWARD : ) #tab | Sons to Elizabeth . ( RIVERS #tab ( GREY : ) Called al

In [32]:
#todo
# version 1
# check inference
# check beam-search

# version2
# Evaluation

# char-level

# Russian
# Arabic 
# French