<a href="https://colab.research.google.com/github/tranvohuy/Markovify_sentence_Truyen_Kieu/blob/master/Vanilla_char_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this note we investigate a LSTM way to produce new characters. There are two codes, 
- https://gist.github.com/vinhkhuc/7ec5bf797308279dc587: pure Tensorflow code
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/: keras layer code

Both are based on this https://github.com/oxford-cs-ml-2015/practical6/blob/master/train.lua 

Other reference:
- https://github.com/AvoncourtPartners/poems
- https://cloud.google.com/blog/products/gcp/cloud-poetry-training-and-hyperparameter-tuning-custom-text-models-on-cloud-ml-engine 

# We first import an original text from an author.

In [0]:
import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen

def crawling_oneweb(url):
  html = urlopen(url).read()
  soup = BeautifulSoup(html)

  main_body = soup.findAll("p", {"class": "Normal"})
  del main_body[-1]
  text = "".join([p.text for p in main_body])
  return text  

def crawlingwebs(urls):
  '''
  Input:
  urls: a list of urls
  
  Output:
  a combination text extracted from url in urls
  '''
  text = ''
  for url in urls:
    text = text + crawling_oneweb(url)
  return text
  
urls = ["https://vnexpress.net/goc-nhin/nguoi-giau-va-thien-tai-3808335.html",
       'https://vnexpress.net/goc-nhin/thoat-khoi-co-don-3886302.html',
       'https://vnexpress.net/goc-nhin/lam-luat-3900478.html',
       'https://vnexpress.net/goc-nhin/thuat-san-dat-vang-3879380.html',
       'https://vnexpress.net/goc-nhin/nhung-mua-xuan-trong-doi-3877957.html',
       'https://vnexpress.net/goc-nhin/than-phan-ruong-dong-3862785.html',
       'https://vnexpress.net/goc-nhin/long-dan-3824537.html',
       'https://vnexpress.net/goc-nhin/ro-rang-voi-dat-3764906.html',
       'https://vnexpress.net/goc-nhin/thuong-nho-dong-bang-3778190.html']

text = crawlingwebs(urls)

# pure tensorflow

- The network below is a simple RNN ([Elman network](https://en.wikipedia.org/wiki/Recurrent_neural_network)), not LSTM.

- It is totally possible written in keras language.

In [0]:
"""
Vanilla Char-RNN using TensorFlow by Vinh Khuc (@knvinh).
Adapted from Karpathy's min-char-rnn.py
https://gist.github.com/karpathy/d4dee566867f8291f086
Requires tensorflow>=1.0
BSD License
"""
import random
import numpy as np
import tensorflow as tf

seed_value = 42 # fix some random generator. 42 is a famous generic number from Hitch hiker
tf.set_random_seed(seed_value)
random.seed(seed_value)

#np.eye(vocab_size) is the unit matrix of dimension vocab_size x vocab_size

def one_hot(v):
    return np.eye(vocab_size)[v]
# if v = [i1,i2,i3] then the one_hot(v) returns the rows i1, i2, i3 of np.eye(vocab_size)
# That means it return the one_hot encoding of v with respect to vocab_size.
  
data = text
chars = sorted(list(set(text))) # the set of different characters in the text
data_size, vocab_size = len(data), len(chars)
#vocab_size means char_size
#data_size: total characters in the originial text

print('Data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

# Hyper-parameters
hidden_size   = 100  # hidden layer's size
seq_length    = 25   
# number of characters for each sample. It is timesteps in keras LSTM


learning_rate = 1e-1


inputs     = tf.placeholder(shape=[None, vocab_size], dtype=tf.float32, name="inputs")
#the inputs here means each training sample.
# it has the shape (None, vocab_size)
targets    = tf.placeholder(shape=[None, vocab_size], dtype=tf.float32, name="targets")


init_state = tf.placeholder(shape=[1, hidden_size], dtype=tf.float32, name="state")

initializer = tf.random_normal_initializer(stddev=0.1)

#hs_t, hprev, and the like: the return sequences

#https://en.wikipedia.org/wiki/Recurrent_neural_network
#Below is the Elman network. A little bit different presentation from the formula in wikipedia
#output_softmax here is the same as y_t in wikipedia
with tf.variable_scope("RNN") as scope:
    hs_t = init_state
    ys = []
    for t, xs_t in enumerate(tf.split(inputs, seq_length, axis=0)):
      # https://www.tensorflow.org/api_docs/python/tf/split
        if t > 0: scope.reuse_variables()  # Reuse variables
        Wxh = tf.get_variable("Wxh", [vocab_size, hidden_size], initializer=initializer)
        Whh = tf.get_variable("Whh", [hidden_size, hidden_size], initializer=initializer)
        Why = tf.get_variable("Why", [hidden_size, vocab_size], initializer=initializer)
        bh  = tf.get_variable("bh", [hidden_size], initializer=initializer)
        by  = tf.get_variable("by", [vocab_size], initializer=initializer)

        hs_t = tf.tanh(tf.matmul(xs_t, Wxh) + tf.matmul(hs_t, Whh) + bh)
        ys_t = tf.matmul(hs_t, Why) + by
        ys.append(ys_t)

hprev = hs_t
output_softmax = tf.nn.softmax(ys[-1])  # Get softmax for sampling

outputs = tf.concat(ys, axis=0)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=outputs))

# Minimizer
minimizer = tf.train.AdamOptimizer()
grads_and_vars = minimizer.compute_gradients(loss)

# Gradient clipping
grad_clipping = tf.constant(5.0, name="grad_clipping")
clipped_grads_and_vars = []
for grad, var in grads_and_vars:
    clipped_grad = tf.clip_by_value(grad, -grad_clipping, grad_clipping)
    clipped_grads_and_vars.append((clipped_grad, var))

# Gradient updates
updates = minimizer.apply_gradients(clipped_grads_and_vars)

# Session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Initial values
n, p = 0, 0
hprev_val = np.zeros([1, hidden_size])

while True:
    # Initialize
    if p + seq_length + 1 >= len(data) or n == 0: # i.e. we have run through one epoch (whole text)
        hprev_val = np.zeros([1, hidden_size])
        p = 0  # reset

    # Prepare inputs
    input_vals  = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    #input_vals shape is seq_length 
    # it is data[p: p +seq_length] from the original text
    target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
    # output_vals shape is seq_length x vocab_size
    # output_vals is  input_vals shifted by 1 character
    
    input_vals  = one_hot(input_vals)
    #input_vals shape is seq_length x vocab_size
    target_vals = one_hot(target_vals)
    #target_vals shape is seq_length x vocab_size
    
    # compute h and loss and updates
    # hpre_val is the output of each
    hprev_val, loss_val, _ = sess.run([hprev, loss, updates],
                                      feed_dict={inputs: input_vals,
                                                 targets: target_vals,
                                                 init_state: hprev_val})
    if n % 500 == 0:
        # Progress
        print('iter: %d, p: %d, loss: %f' % (n, p, loss_val))

        # Do sampling
        sample_length = 200 #we want to sample 200 characters
        start_ix      = random.randint(0, len(data) - seq_length)
        #choose a random integer from 0 to len(data)-seq_length-1
        print('star_ix', start_ix)
        sample_seq_ix = [char_to_ix[ch] for ch in data[start_ix:start_ix + seq_length]]
        #sample_seq_ix will be the input for prediction
        
        #        
        ixes          = []
        sample_prev_state_val = np.copy(hprev_val)

        for t in range(sample_length):
            sample_input_vals = one_hot(sample_seq_ix)
            sample_output_softmax_val, sample_prev_state_val = \
                sess.run([output_softmax, hprev],
                         feed_dict={inputs: sample_input_vals, init_state: sample_prev_state_val})
            #np.darray.ravel() return a flat array
            ix = np.random.choice(range(vocab_size), p=sample_output_softmax_val.ravel())
            
            ixes.append(ix)
            sample_seq_ix = sample_seq_ix[1:] + [ix]

        txt = ''.join(ix_to_char[ix] for ix in ixes)
        print('----\n %s \n----\n' % (txt,))

    p += seq_length
    n += 1

In [0]:
# the above code is equivalent to the following keras structure


In [0]:
input_vals  = [char_to_ix[ch] for ch in data[p:p + seq_length]]
target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
print(input_vals)
print(target_vals)

[1, 49, 94, 59, 53, 1, 53, 55, 47, 59, 1, 65, 63, 66, 76, 59, 9, 1, 26, 60, 59, 1, 122, 58, 1]
[49, 94, 59, 53, 1, 53, 55, 47, 59, 1, 65, 63, 66, 76, 59, 9, 1, 26, 60, 59, 1, 122, 58, 1, 92]


# Keras

- https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

- https://keras.io/layers/recurrent/#lstm

In [0]:
from __future__ import print_function
import keras
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

'''
text = 'Tôi còn nhớ một lời thoại trong vở kịch "Hồn Trương Ba, da hàng thịt" của cố tác giả Lưu Quang Vũ. K.....'
'''
print('The number of character in the originial text:', len(text)) #==50088

chars = sorted(list(set(text))) 
print('The number of different characters in the original text:', len(chars)) #==147
char_indices = dict((c, i) for i, c in enumerate(chars))
'''
char_indices = {'\n':0, ' ':1, '!': 2, ...
              'A': 24, 'B': 25, ...}
'''
indices_char = dict((i, c) for i, c in enumerate(chars))
'''
indices_char = {0: '\n', 1: ' ', 2: '!', ...
            24: 'A', 25: 'B', ...}
}
'''

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40 # this is the number of consecutive characters put in the input (each tranining )
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of training/sequences:', len(sentences)) #==16683
'''
sentences = ['Tôi còn nhớ một lời thoại trong vở kịch ', 
          ' còn nhớ một lời thoại trong vở kịch "Hồ', 
          'n nhớ một lời thoại trong vở kịch "Hồn T',
          ...]
next_chars = ['"', 'n', 'r', 'n', 'B',...]
'''

print('One hot encoding...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

# x is a zero tensor of dim 16683 x 40 x 147
# y is a zero tensor of dim 16683 x 147
# 16687 can be thought of the number of samples/training samples

# x[i,t,:] stands for  a 147-d one-hot vector of the word t-th in sentences[i]
# y[i,:] is a 147-d one-hot vector for next_chars[i]

#x, y are boolean types instead of float. Since it will run faster with 
#large character sets

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM
print('Build model...')
model = Sequential()


model.add(LSTM(128, input_shape=(maxlen, len(chars))))
#so input_dim = len(chars), input_length = maxlen
#the output of this layer is a vector of dim 128. This is the last vector
#output of the LSTM. All other vectors, (maxlen-1) of them, are discarded.

# 'last vector output' because return_sequences = False by default in LSTM
# 

model.add(Dense(len(chars), activation='softmax'))
# add another Dense layer with softmax activation
# output is a vector of length len(chars)
# sum of its element is one.

optimizer = RMSprop(lr=0.01)
#optimizer: a strategy to do gradient descend/ finding minimum of loss
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
#we can write 
#model.compile(loss='categorical_crossentropy', optimizer = 'RMSprop')
#but then it won't give the option to choose learning rate of RMSprop. 
#the default lr of RMSprop is lr=0.001

#However we can write
#model.compile(loss='categorical_crossentropy', optimizer = RMSprop(lr=0.01))
#we use categorical_crossentropy since this is a categorical/classification problem
# guessing the next character

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    #So preds = softmax(log(original preds)/temperature)
    # that means
    # preds = (original preds)^(1/temperature) / (sum)
    # really funny
    
    
    # if temperature is small, then it's very cold, everything is frozen
    # hence if old_preds[i] is big, then preds[i] is very close to 1
    # we can predict what will appear
    # if temperature is extremely small, 
    # then the return is almost argmax(old_preds)
    # which is what we dream of.
    
    # if temperature is big, then it's hot, everything is chaos
    # we can not predict what will appear
    # all preds[i]'s tend to be the same,
    # that is, preds converges to discrete uniform measure
    # hence the probability to choose each character is 1/total diff_character
    # which is 1/147
    
    probas = np.random.multinomial(1, preds, 1)
    # first argument 1-> n, last argument->k
    #An experiment is throwing a dice n times (i.e once in this case). 
    #This dice has len(preds) values. 
    #each value has probability preds[value]
    #Do the experement k times
    #The output of each experiment is a vector of len(preds)
    # the output of multinomial is a vector of m x len(preds)
    # the sum of all element of this vector is n
    
    return np.argmax(probas)
    #return index with highest values

def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    # it is a method/property of keras.callback
    # An example. We can set up to do something after every 5 epochs,
    # instead of every epoch
    
    
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    #start_index = random.randint(0, len(text) - maxlen - 1)
    start_index=0 # choose a fixed seed sentence, or we can randomize start_index
    for diversity in [0.01, 0.2, 1.0, 10.0]:
        print('----- diversity:', diversity)

        generated = ''
        #choose a sentence in sentences
        # we can randomize start_index
        # sentence = text[start_index: start_index + maxlen]
        # or you can choose your own sentence with maxlen characters
        #sentence = 'Trăm năm trong cõi người ta, chữ tình ch'
        sentence = 'I do not understand that you are saying '
        #sentence = 'Trăm tiền trong cõi nbl ti ta, tình kông'
        
        
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        #print is just a wrapper of sys.stdout.write
        sys.stdout.write(generated)
        
                
        #produce 400  new characters
        for i in range(400):
            #one hot encoding of sentence
            x_pred = np.zeros((1, maxlen, len(chars)))
            
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
            
            #then put x_pred into model.predict
            # without [0] the shape of preds is (1,147) not (147,)
            preds = model.predict(x_pred, verbose=0)[0]
            
            # preds is a probability density from [0,146]
            # presumably sum(preds) = 1
            next_index = sample(preds, diversity)
            #a new index is chosen. Transform it to a corresonding character
            
            next_char = indices_char[next_index]

            generated += next_char # this code is currently redundant
            
            
            #update sentence to predict the next character
            # we remove the first character and append the new character
            sentence = sentence[1:] + next_char
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print() #new line

print_callback = LambdaCallback(on_epoch_end = on_epoch_end)
# https://keras.io/callbacks/#lambdacallback




model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

Using TensorFlow backend.


The number of character in the originial text: 50088
The number of different characters in the original text: 147
nb sequences: 16683
One hot encoding...
Build model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/60

----- Generating text after Epoch: 0
----- diversity: 0.01
----- Generating with seed: "I do not understand that you are saying "
I do not understand that you are saying thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiện thiệ
----- diversity: 0.2
----- Generating with seed: "I do not understand that you are saying "
I do not understand that you are say



ng cho chuyển thiên nhiên điều tà các là cấp gian được cho như giản đi nhiều chu giả cho chua từ thế nào động đất đai cho thuê đất, cho thuy thịp là cấp thốc thi trong nhiều không phải chi thế giới, nhà và cùng là chuyện khái người pháp luật vàng chúng, hộ nước mạnh vi cân chút mất các định suông. Người làng thiên 
----- diversity: 0.2
----- Generating with seed: "I do not understand that you are saying "
I do not understand that you are saying trong hiện Nam trong nhiều người làng theo tiết của dân chúng ta đại và hiện tại và thực trong là chuyển đổi kháph thu hồi đổi thành của dân chuyển thiên nhiên của cả độ thuê để chuyển sang nhưng chún nghiệp hành vi cho chuyển thiên nhiên đã hài là quy đị, tế hại của thành cho thuyển đổi thàm cho trong hiện Nam trong hiện trở thị hương và chuy trị pháp luật đất đai can của dân chuyển cả cho chua t
----- diversity: 1.0
----- Generating with seed: "I do not understand that you are saying "
I do not understand that you are saying cân cơ chế bằng sá

<keras.callbacks.History at 0x7fca31b35470>

 ## Other remarks:
 - The network learn the structure of characters in the original text. If we give it a sentence with new character, it will fail :D
 - 

## Questions:
- What happens if we choose the seed_sentence that has no sense
  - as a sentence, e.g., 'I you me him hello' 
  - as words, for example 'I dnoot uredsbadt waht you siad'
  - in different language, 'je'n comprends pas', etc?
- choose to guess the next two characters at the same time, instead of one character
- word-based guessing instead of character-based guessing like above
- 

In [0]:
print('Build model...')
model = Sequential()
model.add(LSTM(2, input_shape=(maxlen, 3), return_sequences = True, name = "LSTM"))
model.add(Dense(len(chars), activation='softmax', name = 'Dense'))


Build model...


In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM (LSTM)                  (None, 40, 2)             48        
_________________________________________________________________
dense_11 (Dense)             (None, 40, 147)           441       
Total params: 489
Trainable params: 489
Non-trainable params: 0
_________________________________________________________________


In [0]:
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
model = Sequential()
model.add(LSTM(5, input_shape = (10, 20), return_sequences = True))
model.add(TimeDistributed(Dense(2)))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 10, 5)             520       
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 2)             12        
Total params: 532
Trainable params: 532
Non-trainable params: 0
_________________________________________________________________


In [0]:
model = Sequential()
model.add(LSTM(5, input_shape = (10, 20), return_sequences = True))
model.add((Dense(1)))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 10, 5)             520       
_________________________________________________________________
dense_4 (Dense)              (None, 10, 1)             6         
Total params: 526
Trainable params: 526
Non-trainable params: 0
_________________________________________________________________


In [0]:
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
hidden_neurons = 10
time_steps = 75
input_dima = 3
train_set = np.random.randn(100,time_steps, input_dim )
model = Sequential()  
model.add(LSTM(units = hidden_neurons, input_shape= (time_steps,10)))  
model.add(keras.layers.RepeatVector(10))
model.add(LSTM(units = hidden_neurons, return_sequences = True))  
model.add(TimeDistributed(Dense(units = 1)))
model.add(Activation('linear'))   
model.compile(loss = 'mean_squared_error', optimizer = 'rmsprop', metrics = ['accuracy'])

NameError: ignored

In [0]:

import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
data_dim = 2
X_train_pass = np.random.randn(3281, data_dim)
y_train_pass = np.random.randn(3181,1)

timesteps = 1

train_pass = np.reshape(X_train_pass, (X_train_pass.shape[0], 1, X_train_pass.shape[1]))
#test_pass = np.reshape(X_test_pass, (X_test_pass.shape[0], 1, X_test_pass.shape[1]))
y_train_pass = np.reshape(y_train_pass, (y_train_pass.shape[0], 1, y_train_pass.shape[1]))

model_pass = Sequential()
model_pass.add(LSTM(units=64,  return_sequences=True, 
                input_shape=(timesteps, data_dim)))
model_pass.add(Dense(2, activation='sigmoid'))
model_pass.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model_pass.summary()
model_pass.fit(train_pass, y_train_pass,batch_size=1, epochs = 1, verbose = 1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 1, 64)             17152     
_________________________________________________________________
dense_9 (Dense)              (None, 1, 2)              130       
Total params: 17,282
Trainable params: 17,282
Non-trainable params: 0
_________________________________________________________________


ValueError: ignored

In [0]:
from keras.layers import Flatten
def model():
    model = Sequential()
    model.add(Dense(128, input_shape = (LSTM_WINDOW_SIZE,1)))
    model.add(LSTM(units=5,
                   return_sequences=True))
    model.add(Flatten())
  
    model.add(Dense(1, activation = 'linear'))

    model.add(Dense(1))
    return model
LSTM_WINDOW_SIZE = 5
model3 = model()
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_34 (Dense)             (None, 5, 128)            256       
_________________________________________________________________
lstm_23 (LSTM)               (None, 5, 5)              2680      
_________________________________________________________________
flatten_3 (Flatten)          (None, 25)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 26        
_________________________________________________________________
dense_36 (Dense)             (None, 1)                 2         
Total params: 2,964
Trainable params: 2,964
Non-trainable params: 0
_________________________________________________________________


In [0]:
def model2():
    X = keras.layers.Input(shape=(LSTM_WINDOW_SIZE,1))
     X = Dense(128)(X)
    X = LSTM(units = 5, return_sequences = True)(X)
    


IndentationError: ignored

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 5, 128)            256       
_________________________________________________________________
lstm_21 (LSTM)               (None, 5, 5)              2680      
_________________________________________________________________
dense_29 (Dense)             (None, 5, 1)              6         
_________________________________________________________________
flatten_1 (Flatten)          (None, 5)                 0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 6         
Total params: 2,948
Trainable params: 2,948
Non-trainable params: 0
_________________________________________________________________
