In [1]:
import numpy as np

In [2]:
from lib.dictionarymd import Dictionary
from lib.textprocessingmd import convert_text
from lib.fileoperationmd import getFilesFromPath,readTxtFromFile

[nltk_data] Downloading package stopwords to /home/zein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
class HyperParameters:
    """Static configuration values shared by the notebook cells."""

    # Directory the raw text documents are read from.
    DATAPATH = './data/shakespeare/'

    # Directory and file name passed to the Dictionary vocabulary helpers.
    VOCPATH = './preprocessed'
    VOCFILE = 'articles.voc'

    # Upper bound on the number of token ids kept per document.
    MAXLENGTH = 10000


# Single configuration instance referenced by the cells below.
hp = HyperParameters()

In [4]:
# Read the corpus: `dataset` holds one preprocessed text per document and
# `text` is their concatenation, used to build the vocabulary.
filenames = getFilesFromPath(hp.DATAPATH)
dataset = []
text = ""
for file_name in filenames:
    document = convert_text(readTxtFromFile(hp.DATAPATH, file_name))
    dataset.append(document)
    text += document
    # NOTE: the loop stops after the first file, so only one document is loaded.
    break

# Write the vocabulary file from the corpus text, then load it back.
dictionary = Dictionary()
dictionary.make_vocab(text, hp.VOCPATH, hp.VOCFILE)
dictionary.load_vocab(hp.VOCPATH, hp.VOCFILE)

In [5]:
def batches_generator(batch_size, docs, dictionary,
                      shuffle=False, allow_smaller_last_batch=True,
                      max_length=None):
    """Yield padded (input, target, lengths) batches for next-token prediction.

    Every document is converted with ``dictionary.text2vec`` and truncated to
    ``max_length`` ids.  For a document vector ``v`` the input row is
    ``v[:-1]`` and the target row is ``v[1:]``.  Rows are padded per batch,
    so different batches may have different widths.

    Args:
        batch_size: positive number of documents per batch.
        docs: iterable of documents accepted by ``dictionary.text2vec``.
        dictionary: object exposing ``text2vec(doc)`` and a ``word2idx``
            mapping that contains the ``'<PAD>'`` key.
        shuffle: if True, documents are visited in a random order; otherwise
            they are visited in order of increasing vector length.
        allow_smaller_last_batch: if True, leftover documents are yielded as
            a final, smaller batch; if False they are dropped.
        max_length: maximum number of ids kept per document.  When None the
            notebook-level ``hp.MAXLENGTH`` is used.

    Yields:
        Tuples ``(x, y, lengths)``.  ``x`` and ``y`` are int32 arrays of shape
        ``[current_batch_size, max_len]`` filled with the ``'<PAD>'`` index,
        where ``max_len`` is the length of the longest truncated document in
        the batch.  ``lengths`` is an int32 array with the unpadded length of
        each input row.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size))
    if max_length is None:
        max_length = hp.MAXLENGTH
    pad_index = dictionary.word2idx['<PAD>']

    vecs = [dictionary.text2vec(doc)[:max_length] for doc in docs]
    n_samples = len(vecs)

    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        # Grouping documents of similar length keeps padding per batch small.
        order = np.argsort([len(vec) for vec in vecs])

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min(batch_start + batch_size, n_samples)
        batch_vecs = [vecs[idx] for idx in order[batch_start:batch_end]]
        current_batch_size = len(batch_vecs)

        # The width is the full document length, i.e. one more than the
        # longest input row (inputs drop the last id).  The final column is
        # therefore always padding; it is kept so the yielded shapes stay the
        # same as before.
        max_len_token = max(len(vec) for vec in batch_vecs)

        x = np.full([current_batch_size, max_len_token], pad_index, dtype=np.int32)
        y = np.full([current_batch_size, max_len_token], pad_index, dtype=np.int32)
        lengths = np.zeros(current_batch_size, dtype=np.int32)

        for n, vec in enumerate(batch_vecs):
            inputs, targets = vec[:-1], vec[1:]
            utt_len = len(inputs)
            x[n, :utt_len] = inputs
            y[n, :utt_len] = targets
            lengths[n] = utt_len
        yield x, y, lengths

In [6]:
# Sanity check: print the array shapes and the unpadded lengths of each batch.
for batch_x, batch_y, batch_lengths in batches_generator(1, dataset[:2], dictionary):
    print(batch_x.shape, batch_y.shape, batch_lengths.shape, batch_lengths)

(1, 10000) (1, 10000) (1,) [9999]


## Build a recurrent neural network

In [7]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [8]:
class LSTMModel:
    """Empty shell class; the notebook attaches its methods in later cells."""

In [9]:
def declare_placeholders(self):
    """Create the feedable graph inputs and store them as attributes of ``self``.

    Attributes set:
        input_batch: int32 placeholder of rank 2 (both dimensions dynamic).
        ground_truth_tags: int32 placeholder of rank 2 (both dimensions dynamic).
        lengths: int32 placeholder of rank 1.
        dropout_ph: scalar float32 keep probability; evaluates to 1.0 when
            no value is fed.
        learning_rate_ph: scalar float32 placeholder for the learning rate.
    """
    int32, float32 = tf.int32, tf.float32

    # Model input ids and the ids the model is trained to predict.
    self.input_batch = tf.placeholder(name='input_batch', dtype=int32, shape=[None, None])
    self.ground_truth_tags = tf.placeholder(name='ground_truth_tags', dtype=int32, shape=[None, None])

    # One sequence length per batch row.
    self.lengths = tf.placeholder(name='lengths', dtype=int32, shape=[None])

    # Defaulting to 1.0 means dropout is inactive unless a value is fed.
    default_keep_prob = tf.cast(1.0, dtype=float32)
    self.dropout_ph = tf.placeholder_with_default(default_keep_prob, shape=[])

    self.learning_rate_ph = tf.placeholder(name='learning_rate_ph', dtype=float32, shape=[])

In [10]:
# Attach the function to the class as a classmethod, so the `self` it receives
# is the LSTMModel class itself.  The assignment is made outside a class body,
# so no name mangling applies and the attribute is literally
# `__declare_placeholders`.
LSTMModel.__declare_placeholders = classmethod(declare_placeholders)

In [11]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Build the embedding -> LSTM -> dense graph and store the logits.

    The recurrent layer is a single forward LSTM cell wrapped in dropout and
    unrolled with ``tf.nn.dynamic_rnn``; it is not bidirectional.

    Args:
        vocabulary_size: number of rows of the embedding matrix and number of
            units of the output dense layer.
        embedding_dim: width of each embedding vector.
        n_hidden_rnn: number of units of the LSTM cell.
        n_tags: accepted for interface compatibility; not read by this
            function.

    Sets ``self.logits``, the dense-layer output over ``vocabulary_size``
    classes for every position of ``self.input_batch``.
    """
    # Embedding table: standard-normal values scaled by 1/sqrt(embedding_dim).
    scale = np.sqrt(embedding_dim)
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / scale
    embedding_matrix_variable = tf.Variable(
        initial_value=initial_embedding_matrix,
        dtype=tf.float32,
        name='embeddings_matrix',
    )

    # One LSTM cell; the same keep probability drives input, output and
    # state dropout.
    keep_prob = self.dropout_ph
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn)
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(
        cell=lstm_cell,
        input_keep_prob=keep_prob,
        output_keep_prob=keep_prob,
        state_keep_prob=keep_prob,
        dtype=tf.float32,
    )

    # Replace every id of the input batch by its embedding vector.
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)

    # Unroll the cell over the sequence; only the per-step outputs are kept,
    # the final state is discarded.
    rnn_output, _ = tf.nn.dynamic_rnn(
        cell=forward_cell,
        inputs=embeddings,
        sequence_length=self.lengths,
        dtype=tf.float32,
    )

    # Linear projection of every step output onto the vocabulary.
    self.logits = tf.layers.dense(rnn_output, vocabulary_size, activation=None)

In [12]:
# Attach as a classmethod: the `self` seen by build_layers is the LSTMModel
# class.  Assigned outside a class body, so the attribute name is not mangled.
LSTMModel.__build_layers = classmethod(build_layers)

In [13]:
def compute_predictions(self):
    """Turn the logits into probabilities and most-probable class ids.

    Sets ``self.softmax_output`` (softmax of ``self.logits``) and
    ``self.predictions`` (argmax of those probabilities over the last axis).
    """
    probabilities = tf.nn.softmax(logits=self.logits)
    self.softmax_output = probabilities

    # axis=-1 selects the class dimension, giving one id per position.
    self.predictions = tf.argmax(probabilities, axis=-1)

In [14]:
# Attach as a classmethod: the `self` seen by compute_predictions is the
# LSTMModel class.  Assigned outside a class body, so the name is not mangled.
LSTMModel.__compute_predictions = classmethod(compute_predictions)

In [15]:
def compute_loss(self, vocabulary_size, PAD_index):
    """Build the padding-masked sequence cross-entropy loss.

    Args:
        vocabulary_size: kept for interface compatibility; no longer read,
            because ``sequence_loss`` takes integer targets directly and no
            one-hot encoding is needed.
        PAD_index: id of the padding token; positions of ``self.input_batch``
            equal to it get weight 0 in the loss.

    Sets ``self.loss``.
    """
    # 1.0 where the input holds a real token, 0.0 where it holds padding.
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), dtype=tf.float32)

    # sequence_loss applies the weights itself, so no manual masking of a
    # per-position loss tensor is required.
    loss_tensor = tf.contrib.seq2seq.sequence_loss(
        logits=self.logits,
        targets=self.ground_truth_tags,
        weights=mask)

    self.loss = tf.reduce_mean(loss_tensor)

In [16]:
# Attach as a classmethod: the `self` seen by compute_loss is the LSTMModel
# class.  Assigned outside a class body, so the name is not mangled.
LSTMModel.__compute_loss = classmethod(compute_loss)

In [17]:
def perform_optimization(self):
    """Create the Adam optimizer and a train op with per-gradient norm clipping.

    Sets ``self.optimizer``, ``self.grads_and_vars`` (the clipped
    gradient/variable pairs) and ``self.train_op``.
    """
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
    raw_grads_and_vars = self.optimizer.compute_gradients(self.loss)

    # Each gradient tensor is rescaled so that its own L2 norm is at most 1.0.
    clip_norm = tf.cast(1.0, dtype=tf.float32)

    # compute_gradients reports None for a variable the loss does not depend
    # on; clip_by_norm cannot handle None, so such pairs are passed through
    # unchanged.
    self.grads_and_vars = [
        (grad if grad is None else tf.clip_by_norm(grad, clip_norm), var)
        for grad, var in raw_grads_and_vars
    ]

    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [18]:
# Attach as a classmethod: the `self` seen by perform_optimization is the
# LSTMModel class.  Assigned outside a class body, so the name is not mangled.
LSTMModel.__perform_optimization = classmethod(perform_optimization)

In [19]:
def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    """Assemble the whole graph by running the five build steps in order.

    Args:
        vocabulary_size: forwarded to the layer-building step.
        n_tags: forwarded to the layer-building step and, as the first
            argument, to the loss-building step.
        embedding_dim: forwarded to the layer-building step.
        n_hidden_rnn: forwarded to the layer-building step.
        PAD_index: forwarded to the loss-building step.
    """
    # This function is defined at module level, so the double-underscore
    # attribute names below are looked up literally (no name mangling).
    # The order matters: each step reads attributes set by the previous ones.
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    # Note: n_tags (not vocabulary_size) is what gets passed as the loss
    # step's first argument.
    self.__compute_loss(n_tags, PAD_index)
    self.__perform_optimization()

In [20]:
# Install init_model as the constructor.  Because it is wrapped in
# classmethod, its first argument is the LSTMModel class, so constructing a
# model stores the graph tensors on the class rather than on the new instance.
LSTMModel.__init__ = classmethod(init_model)

In [21]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    """Run a single optimisation step on one batch.

    Args:
        session: object whose ``run(fetches, feed_dict=...)`` executes the graph.
        x_batch: value fed to ``self.input_batch``.
        y_batch: value fed to ``self.ground_truth_tags``.
        lengths: value fed to ``self.lengths``.
        learning_rate: value fed to ``self.learning_rate_ph``.
        dropout_keep_probability: value fed to ``self.dropout_ph``.

    Returns:
        None; the result of running the train op is discarded.
    """
    placeholders = (
        self.input_batch,
        self.ground_truth_tags,
        self.learning_rate_ph,
        self.dropout_ph,
        self.lengths,
    )
    values = (x_batch, y_batch, learning_rate, dropout_keep_probability, lengths)
    feed_dict = dict(zip(placeholders, values))

    session.run(self.train_op, feed_dict=feed_dict)

In [22]:
# Attach as a classmethod: train_on_batch reads the placeholders and train op
# from the LSTMModel class, whether it is called on the class or an instance.
LSTMModel.train_on_batch = classmethod(train_on_batch)

In [23]:
def predict_for_batch(self, session, x_batch, lengths, k=3):
    """Return the k most probable ids, and their probabilities, per position.

    Args:
        session: object whose ``run(fetches, feed_dict=...)`` executes the graph.
        x_batch: value fed to ``self.input_batch``.
        lengths: value fed to ``self.lengths``.
        k: number of top candidates to keep per position (default 3).

    Returns:
        A tuple ``(topk, topkp, softmax)``:
        ``softmax`` is the evaluated ``self.softmax_output`` (indexed here as
        a 3-D array); ``topk`` holds, for every position, the ids of the ``k``
        largest probabilities in ascending order of probability (the last
        entry is the most probable); ``topkp`` has the same shape as ``topk``
        and holds the matching probabilities.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    feed_dict = {self.input_batch: x_batch,
                 self.lengths: lengths}
    # A single run is enough: the top-k ids are derived from the probabilities.
    softmax = session.run(self.softmax_output, feed_dict=feed_dict)

    topk = softmax.argsort()[:, :, -k:]

    # Pick softmax[b, t, topk[b, t, j]] for every b, t, j.  Indexing with
    # softmax[:, :, topk] would instead combine every (b, t) with every index
    # in topk and produce a 5-D array.
    batch_idx = np.arange(softmax.shape[0])[:, None, None]
    step_idx = np.arange(softmax.shape[1])[None, :, None]
    topkp = softmax[batch_idx, step_idx, topk]
    return topk, topkp, softmax

In [24]:
# Attach as a classmethod: predict_for_batch reads the placeholders and output
# tensors from the LSTMModel class, whether called on the class or an instance.
LSTMModel.predict_for_batch = classmethod(predict_for_batch)

# Train

In [25]:
# Clear the default graph before building the model.
tf.reset_default_graph()

# The model predicts ids from the same vocabulary it reads, so the vocabulary
# size is passed both as `vocabulary_size` and as `n_tags`.
vocab_size = len(dictionary.word2idx)
model = LSTMModel(
    vocabulary_size=vocab_size,
    n_tags=vocab_size,
    embedding_dim=200,
    n_hidden_rnn=200,
    PAD_index=dictionary.word2idx['<PAD>'],
)

# Training hyperparameters.
batch_size = 1
n_epochs = 6
learning_rate = 0.005
learning_rate_decay = 1.41
dropout_keep_probability = 0.6

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [26]:
# Overrides the learning rate of 0.005 assigned in the model-setup cell.
learning_rate=0.01

In [27]:
# Open a session, initialise all global variables, and create the Saver that
# the training loop uses to write checkpoints.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

In [None]:
from IPython.display import clear_output

# These override the values assigned in the model-setup cell.
learning_rate = 0.01
n_epochs = 500

# Batches per epoch.  Ceiling division, because batches_generator also yields
# a final, smaller batch by default.
n_batches = -(-len(dataset) // batch_size)

print('Start training... \n')
for epoch in range(n_epochs):
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)

    # One optimisation step per batch; the progress line replaces the
    # previous cell output each time.
    for counter, (x_batch, y_batch, lengths) in enumerate(
            batches_generator(batch_size, dataset, dictionary), start=1):
        clear_output(wait=True)
        print("Epoch {}/{}.batch {}/{}".format(epoch+1, n_epochs, counter, n_batches))
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)

    # Checkpoint after every epoch; the graph definition is not rewritten.
    saver.save(sess, './my-model', global_step=epoch, write_meta_graph=False)

    # Learning-rate decay is currently disabled:
    #learning_rate = learning_rate / learning_rate_decay

print('...training finished.')

ُEboch 12/500.batch 1/1


## Beam Search
### input: 
- k: the number of candidate sequences kept after each step (the beam width)


- C: the current list of candidate sequences (k of them)
- P: the candidate probabilities; P[i] is the probability of candidate C[i]

Extend candidates

- for each candidate:
    - find the k most probable one-token extensions of the current candidate
    - add them to C and their probabilities to P
    - keep only the top k candidates

In [None]:
import random
def beamSearch(k,seed,dictionary):
    """Grow the seed text token by token with a beam-style search.

    Relies on the notebook globals ``model``, ``sess``, ``clear_output``,
    ``np`` and ``random``.

    Args:
        k: NOTE(review): never read in the body; the beam keeps 3 top
            candidates plus 3 random ones regardless of this value.
        seed: text split on whitespace; every token is looked up in
            ``dictionary.word2idx``.
        dictionary: object exposing a ``word2idx`` mapping that contains the
            ``'<UNK>'`` key.

    Returns:
        ``(oldC, oldP, C[2], P[2])``: the candidate lists and score lists
        recorded at the start of every step, followed by the candidate at
        index 2 of the final beam and its score.  When three candidates
        survive the ranking, index 2 is the highest-scored one, because the
        ranking is ascending.

    NOTE(review): if the seed already has 50 or more tokens the loop does not
    run, ``C`` holds a single candidate and ``C[2]`` raises IndexError.
    """
    # The beam starts with one candidate: the seed converted to ids.
    C=[[dictionary.word2idx[token] for token in seed.split()]]
    start=len(C[0])
    oldC=[]
    oldP=[]
    P=[1]
    # Extend the candidates one token per iteration, up to 50 tokens in total.
    for length in range(start,50):
        clear_output(wait=True)
        print(length)
        clength=np.array([length])
        # Record the beam as it was before this step.
        oldC.append(C)
        oldP.append(P)
        newC=[]
        newP=[]
        for ix,x in enumerate(C):
            # Single-row batch holding the candidate's ids.
            x_batch=np.zeros((1,len(x)),dtype=np.int32)
            x_batch[0,:]=x
            retk,retp,softmax=model.predict_for_batch(sess,x_batch,clength)
            # Keep only the last position: the proposed next ids and their
            # probabilities.
            retk=retk[0,length-1]
            softmax=softmax[0,length-1,retk]
            
            for iidx,idx in enumerate(retk):
                # Extensions ending in the unknown-word token are discarded.
                if idx!=dictionary.word2idx['<UNK>'] :#and idx!=dictionary.word2idx['#endl']:
                    newC.append(x+[idx])
                    # Score = parent score times the probability of the new id.
                    newP.append(P[ix]*softmax[iidx])
        #perplexities=np.array(newP,-l)
        # Indices of the three highest scores, in ascending order of score.
        argsort=np.argsort(newP)[-3:]
        P,C=[],[]
        for ias in argsort:
            P.append(newP[ias])
            C.append(newC[ias])
        # Add three randomly drawn candidates to the beam as well.
        rands=[]
        for i in range(3):
            rand=random.randint(0,len(newP)-1)        
            # After more than 4 steps, redraw until the pick differs from the
            # top three and from earlier random picks.
            if length-start>4:
                while rand in argsort or rand in rands:
                    rand=random.randint(0,len(newP)-1)
            rands.append(rand)
            P.append(newP[rand])
            C.append(newC[rand])

    return oldC,oldP,C[2],P[2]
                

In [None]:
# Generate text from the seed 'we'.  NOTE(review): the first argument (k=1)
# has no effect, because beamSearch never reads it.
oldC,oldP,C,P=beamSearch(1,'we',dictionary)
# Convert the returned id sequence back to text.
print(dictionary.vec2text(C))

In [107]:
# Ad-hoc check of predict_for_batch on a single token with id 0.
# NOTE(review): `clength` is assigned but not used; the call passes the plain
# list [250] as the lengths, although the fed batch has only one time step.
clength=np.array([250])
retk,retp,softmax=model.predict_for_batch(sess,[[0]],[250])

In [110]:
# Display the top-k ids returned by the prediction call above.
retk

array([[[7, 4, 1]]])