In this version (1.3):

- Use the BBC dataset
- Try to fix saving/loading the model

In the upcoming versions:
- Evaluation using perplexity
- Beam search with randomization, so that the same sentence is not repeated
- Apply to other languages
    - Russian
    - Arabic
    - French
- Apply this network to char-level language modeling.
    
To restore a previous checkpoint, I had to define the whole model in this notebook rather than importing it from an external .py module.

In [1]:
import numpy as np
from lib.dictionarymd import Dictionary
from lib.textprocessingmd import convert_text
from lib.fileoperationmd import getFilesFromPath,readTxtFromFile

[nltk_data] Downloading package stopwords to /home/zein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the shared hyper-parameter object. The rest of the notebook reads data
# paths (DATAPATH, VOCPATH, VOCFILE, MODEL_CHKPNT_PATH), model sizes
# (embedding_dim, n_hidden_rnn) and training settings (n_epochs, batch_size,
# learning_rate, dropout_keep_probability) from it.
from hyper import HyperParameters
hp=HyperParameters()

In [3]:
# Read and normalise every file found under hp.DATAPATH; `dataset` keeps one
# converted text per file, in the order returned by getFilesFromPath.
filenames = getFilesFromPath(hp.DATAPATH)
dataset = [convert_text(readTxtFromFile(hp.DATAPATH, fn)) for fn in filenames]

# Concatenate once at the end instead of growing the string with `+=` inside
# the loop, which re-copies the accumulated corpus on every file.
text = "".join(dataset)

# Build the vocabulary from the whole corpus, write it to
# hp.VOCPATH/hp.VOCFILE, then load it back into the same Dictionary instance.
dictionary = Dictionary()
dictionary.make_vocab(text, hp.VOCPATH, hp.VOCFILE)
dictionary.load_vocab(hp.VOCPATH, hp.VOCFILE)

199.txt not opened


# Train

In [4]:
import tensorflow as tf
import numpy as np
#from RNN_LSTM_LM import LSTMModel
from helpers import batches_generator
from helpers import Beam
from helpers import beam_search,getBestCandidate

# Start from an empty default graph so that re-running the notebook does not
# stack a second copy of the model's ops and variables on top of an old one.
tf.reset_default_graph()

  from ._conv import register_converters as _register_converters


In [5]:
# modifications
# 1. add training/inference variable
# 2. define placeholder for state [NONE,2*stateLength]
# 3. if inference use state placeholder
# 4. modify beam search
# 5. try to use built-in beam search

import tensorflow as tf
import numpy as np
# Re-import and re-instantiate the hyper-parameters inside the model cell.
# This repeats the `hp` setup from the earlier cell; the notebook intro says
# the model is defined inline (not imported) so that checkpoints can be restored.
from hyper import HyperParameters
hp=HyperParameters()

class LSTMModel():
    """Word-level LSTM language model built in TensorFlow 1.x graph mode.

    Constructing an instance adds the following to the current default graph:
    input placeholders, an embedding lookup, one unidirectional LSTM layer
    wrapped in dropout, a dense projection onto the vocabulary, a PAD-masked
    sequence loss and an Adam training op with per-gradient norm clipping.

    The LSTM state is fed through placeholders (`initial_state_c`,
    `initial_state_h`) so that inference code can carry the state returned by
    one `predict_for_batch` call into the next one.
    """

    # Sequence length fed at inference time; chosen to exceed any real input
    # so that no time step of the batch is treated as padding.
    _UNBOUNDED_LENGTH = 1000000

    # Every gradient tensor is clipped to this L2 norm before being applied.
    _GRAD_CLIP_NORM = 1.0

    def __declare_placeholders(self, n_hidden_rnn):
        """Creates the placeholders of the model.

        Args:
            n_hidden_rnn: number of LSTM units; fixes the width of the two
                state placeholders.
        """
        # Token ids of the inputs and of the expected next tokens,
        # both shaped [batch_size, sequence_len].
        self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
        self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags')

        # Initial LSTM state, each shaped [batch_size, n_hidden_rnn]. Sized
        # from the constructor argument rather than a global setting so the
        # placeholders always agree with the cell built in __build_layers.
        self.initial_state_h = tf.placeholder(dtype=tf.float32, shape=[None, n_hidden_rnn], name='initial_state_h')
        self.initial_state_c = tf.placeholder(dtype=tf.float32, shape=[None, n_hidden_rnn], name='initial_state_c')

        # Length of every sequence in the batch.
        self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')

        # Dropout keep probability; equals 1.0 (no dropout) when not fed.
        self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, dtype=tf.float32), shape=[])

        # Learning rate, fed on every training step.
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_ph')

    def __build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
        """Builds embedding -> LSTM -> dense layers and stores the logits.

        Args:
            vocabulary_size: number of rows of the embedding matrix and number
                of output classes of the dense layer.
            embedding_dim: width of the embedding vectors.
            n_hidden_rnn: number of LSTM units.
            n_tags: not used here; the output layer is sized by
                `vocabulary_size`. Kept so the constructor interface is unchanged.
        """
        # Embedding matrix initialised with N(0, 1) / sqrt(embedding_dim).
        initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
        embedding_matrix_variable = tf.Variable(initial_value=initial_embedding_matrix,
                                                dtype=tf.float32, name='embeddings_matrix')

        # Single forward LSTM cell; the same keep probability is applied to
        # its inputs, outputs and state.
        forward_cell = tf.nn.rnn_cell.DropoutWrapper(
            cell=tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn),
            input_keep_prob=self.dropout_ph,
            output_keep_prob=self.dropout_ph,
            state_keep_prob=self.dropout_ph,
            dtype=tf.float32)

        # Shape: [batch_size, sequence_len, embedding_dim].
        embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)

        # Unidirectional dynamic RNN started from the fed state.
        # rnn_output shape: [batch_size, sequence_len, n_hidden_rnn];
        # self.states is the state after the last processed step.
        rnn_output, self.states = tf.nn.dynamic_rnn(
            cell=forward_cell,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(self.initial_state_c, self.initial_state_h),
            sequence_length=self.lengths,
            dtype=tf.float32,
            inputs=embeddings)

        # Dense layer on top. Shape: [batch_size, sequence_len, vocabulary_size].
        self.logits = tf.layers.dense(rnn_output, vocabulary_size, activation=None)

    def __compute_predictions(self):
        """Turns logits into probabilities and the most probable token ids."""
        self.softmax_output = tf.nn.softmax(logits=self.logits)

        # argmax over the last (vocabulary) axis.
        self.predictions = tf.argmax(self.softmax_output, axis=-1)

    def __compute_loss(self, PAD_index):
        """Builds the PAD-masked cross-entropy loss.

        Args:
            PAD_index: token id of the padding symbol; positions whose *input*
                token equals it get weight 0.
        """
        # NOTE(review): the mask is derived from the inputs, not from the
        # targets - confirm against batches_generator that this is intended.
        mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), dtype=tf.float32)

        # sequence_loss takes integer targets directly, so no one-hot tensor is
        # built. With its default averaging flags it already returns a scalar
        # (weighted mean over batch and time), so no further reduction is needed.
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=self.logits,
            targets=self.ground_truth_tags,
            weights=mask)

    def __perform_optimization(self):
        """Creates the Adam optimizer, clips the gradients and builds train_op."""
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

        # compute_gradients yields (gradient, variable) pairs; the gradient is
        # None for a variable the loss does not depend on, and clip_by_norm
        # cannot handle None, so such pairs are skipped.
        self.grads_and_vars = [
            (tf.clip_by_norm(grad, self._GRAD_CLIP_NORM), var)
            for grad, var in self.optimizer.compute_gradients(self.loss)
            if grad is not None
        ]

        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

    def __init__(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
        """Builds the whole model graph.

        Args:
            vocabulary_size: size of the vocabulary (embedding rows and output classes).
            n_tags: accepted for interface compatibility; the output layer is
                sized by `vocabulary_size`, so pass the same value.
            embedding_dim: width of the embedding vectors.
            n_hidden_rnn: number of LSTM units.
            PAD_index: token id of the padding symbol, masked out of the loss.
        """
        # Remembered so the feed methods can size the zero initial state.
        self.n_hidden_rnn = n_hidden_rnn

        self.__declare_placeholders(n_hidden_rnn)
        self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
        self.__compute_predictions()
        self.__compute_loss(PAD_index)
        self.__perform_optimization()

    def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
        """Runs one optimisation step on a batch, starting from a zero LSTM state.

        Args:
            session: session that holds the model variables.
            x_batch: input token ids, [batch_size, sequence_len].
            y_batch: target token ids, same shape as `x_batch`.
            lengths: one sequence length per batch row (array or list).
            learning_rate: learning rate for this step.
            dropout_keep_probability: keep probability used by the dropout wrapper.
        """
        # len() works for both NumPy arrays and plain lists.
        batch_size = len(lengths)
        zero_state = np.zeros((batch_size, self.n_hidden_rnn))

        feed_dict = {self.input_batch: x_batch,
                     self.ground_truth_tags: y_batch,
                     self.initial_state_h: zero_state,
                     self.initial_state_c: zero_state,
                     self.learning_rate_ph: learning_rate,
                     self.dropout_ph: dropout_keep_probability,
                     self.lengths: lengths}

        session.run(self.train_op, feed_dict=feed_dict)

    def predict_for_batch(self, session, x_batch, init_c, init_h):
        """Runs the network forward from a given LSTM state, without dropout.

        Args:
            session: session that holds the model variables.
            x_batch: input token ids, [batch_size, sequence_len].
            init_c: initial cell state, [batch_size, n_hidden_rnn].
            init_h: initial hidden state, [batch_size, n_hidden_rnn].

        Returns:
            (softmax, states): the softmax over the vocabulary for every
            position of every row, and the LSTM state after the last step
            (to be fed back in on the next call).
        """
        lengths = np.array([self._UNBOUNDED_LENGTH] * len(x_batch))
        feed_dict = {self.input_batch: x_batch,
                     self.initial_state_h: init_h,
                     self.initial_state_c: init_c,
                     self.lengths: lengths}

        softmax, states = session.run([self.softmax_output, self.states], feed_dict=feed_dict)
        return softmax, states

In [6]:
# Build the language model: the output layer has one class per vocabulary
# entry, so the same size is passed for both vocabulary_size and n_tags.
vocab_size = len(dictionary.word2idx)
pad_index = dictionary.word2idx['<PAD>']
model = LSTMModel(
    vocabulary_size=vocab_size,
    n_tags=vocab_size,
    embedding_dim=hp.embedding_dim,
    n_hidden_rnn=hp.n_hidden_rnn,
    PAD_index=pad_index,
)

# Fresh session with all variables initialised, plus a saver for checkpoints.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [77]:
from IPython.display import clear_output

# Factor the learning rate is divided by after every epoch.
LEARNING_RATE_DECAY = 1.61

learning_rate=hp.learning_rate
print('Start training... \n')
results=""

# First epoch index of this run (training is being continued, not started at 0).
# NOTE(review): the learning rate above restarts from hp.learning_rate even
# though `start` epochs are skipped - confirm that this restart is intended.
start=16

# Number of batches shown in the progress line; constant for the whole run.
n_batches=len(dataset)//hp.batch_size

for epoch in range(start,hp.n_epochs):
    # Sample a continuation of the prompt "I" from the current model, i.e.
    # before this epoch's updates; it is shown in the progress line below.
    newc=getBestCandidate(sess,model,150,"I",dictionary)
    results="\nEpoch {}: {}".format(epoch,newc)

    # Train the model for one pass over the batches.
    for counter,(x_batch, y_batch, lengths) in enumerate(batches_generator(hp.batch_size, dataset,dictionary),1):
        # Replace the previous progress line instead of appending to the output.
        clear_output(wait=True)
        print("Epoch {}/{}.batch {}/{} {}".format(epoch+1,hp.n_epochs,counter,n_batches,results))
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, hp.dropout_keep_probability)

    # Checkpoint after every epoch; the epoch index becomes the file suffix.
    saver.save(sess, hp.MODEL_CHKPNT_PATH,global_step=epoch)

    # Decay the learning rate for the next epoch.
    learning_rate = learning_rate / LEARNING_RATE_DECAY

print('...training finished.')

ُEboch 50/50.batch 511/511 
Epoch 49: <START> I ' ve got to be able to be a lot of chances , " he said <UNK> " It ' s a lot of people have a lot of chances , " he said <UNK> " It ' s a lot of chances , " he told BBC Radio Five Live <UNK> " It ' s not going to be a good game <UNK> " I ' m sure it ' s a lot of chances , " he added <UNK> " I ' m not going to play <UNK> " I ' m not going to be a lot of people <UNK> " I ' m not going to be a lot of chances , " he told BBC Sport <UNK> " I ' m not going to be able to be a lot of chances <UNK> " I ' m not going to be a lot of people
...training finished.


In [78]:
%time
# Generate text of 1000 words
full="""The pair are facing lengthy bans for the missed tests, including one on the eve of last year's Athens Olympics. 
They were set to learn their fate by the end of February, but late evidence from them has pushed the date back. "A decision is now expected by around mid-March," said one of their lawyers, Michalis Dimitrakopoulos."""
print(getBestCandidate(sess,model,500,'The pair are facing lengthy bans for the missed tests, including',dictionary))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.2 µs
<START> The pair are facing lengthy bans for the missed <UNK> including the game <UNK> " I ' m not going to be able to be able to be able to be able to play <UNK> " I ' m not going to be able to be able to get the game <UNK> " I ' ve got a lot of people who has been a good game <UNK> " I ' m not going to be a lot of chances <UNK> " It ' s a lot of people who have a good game , " said <UNK> " I ' ve got to get a lot of chances , " he said <UNK> " I ' m sure it ' s a lot of chances , " he said <UNK> " It ' s not going to be a lot of people <UNK> " I ' m not going to be able to be a lot of people <UNK> " I ' ve got to play on the pitch <UNK> " I ' m not going to be able to be able to play <UNK> " I ' m not going to be a lot of chances <UNK> " I ' m not going to be a good game <UNK> " I ' m sure it ' s a lot of chances , " he said <UNK> " I ' ve got to be a lot of people who has been a lot of people who has been a lot of peopl

In [None]:
#Restore check-point
import os

# Rebuild exactly the same graph in a clean default graph so that the variable
# names match the ones stored in the checkpoint.
tf.reset_default_graph()
model = LSTMModel(vocabulary_size=len(dictionary.word2idx), n_tags=len(dictionary.word2idx), embedding_dim=hp.embedding_dim,
                  n_hidden_rnn=hp.n_hidden_rnn, PAD_index=dictionary.word2idx['<PAD>'])
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

# Training saves with global_step=epoch and tf.train.Saver keeps only the most
# recent checkpoints by default, so a hard-coded '-1' suffix usually no longer
# exists on disk. Restore the newest checkpoint recorded in the checkpoint
# directory; fall back to the original '-1' path if none is recorded.
checkpoint_dir = os.path.dirname(hp.MODEL_CHKPNT_PATH) or '.'
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) or hp.MODEL_CHKPNT_PATH+'-1'
saver.restore(sess, checkpoint_path)

In [32]:
#todo

# version2
# Evaluation

# char-level

# larger network

# Russian
# Arabic 
# French