# Recurrent Neural Network for Modeling Sentences

In this task, we will use RNNs to model sentences. The task is to predict the next character in a sentence. 

In [1]:
# As usual, a bit of setup
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%autosave 180


Autosaving every 180 seconds


## Load the data


In [36]:

import csv
import string
import numpy as np

def load_data(data_file):
    """Load the data into a list of strings"""
    
    with open(data_file) as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        rows = list(reader)

    if data_file == 'train.csv':
        sentences, labels = zip(*rows[1:])
        sentences = list(sentences)
    elif data_file == 'test.csv':
        sentences = [row[0] for row in rows[1:]]
    else:
        print("Can only load 'train.csv' or 'test.csv'")
    
    # replace non ascii chars to spaces
    count = 0
    for i, sen in enumerate(sentences):
        count = count + sum([0 if ord(i) < 128 else 1 for i in sen])
        
        # '\n' indicates the end of the sentence
        sentences[i] = ''.join([i if ord(i) < 128 else ' ' for i in sen]) + '\n'
        
    print('The total of ', count, 'non-ascii chars are removed \n')

    return sentences

def char_to_index(sentence, str_voc):
    """Convert a string to an array by using the index in the vocabulary"""
    
    sen_int = np.array([str_voc.index(c) for c in sentence])
    return sen_int

def convert_sen_to_data(sentences, str_voc):
    """ Convert a list of strings to a list of numpy arrays"""
    data = [None] * len(sentences)
    for i, sen in enumerate(sentences):
        data[i] = char_to_index(sen, str_voc)
        
        # sanity check
        #if i < 5:
        #    recover = "".join([str_voc[k] for k in data[i]])
        #    print(recover)
    return data


train_sentences = load_data('train.csv')

# NOTE: you need to use the same vocabulary to handle your test sentences
vocabulary = list(set("".join(train_sentences))) 
str_voc = "".join(vocabulary)

train_data = convert_sen_to_data(train_sentences, str_voc)


num_sen = len(train_data)
sen_lengths = [sen.shape[0] for sen in train_data]
max_len = max(sen_lengths)
min_len = min(sen_lengths)
num_chars = sum(sen_lengths)

print('Data statistics:')
print('Number of sentences: ', num_sen)
print('Maximum and minimum sentence lengths:', max_len, min_len)
print('Total number of characters:', num_chars)
print('Vocabulary size: ', len(vocabulary))

uniq, uniq_counts = np.unique(np.concatenate(train_data), return_counts=True)
freq = np.zeros_like(uniq_counts)
freq[uniq] = uniq_counts

print('Chars in vocabulary and their frequencies:')
print(list(zip(vocabulary, freq.tolist())))
    

The total of  4328 non-ascii chars are removed 

Data statistics:
Number of sentences:  160000
Maximum and minimum sentence lengths: 100 32
Total number of characters: 10954565
Vocabulary size:  95
Chars in vocabulary and their frequencies:
[('l', 371704), ('-', 20064), ('D', 6787), ('q', 6356), ('#', 496), ('F', 3232), ('!', 12100), ('y', 209349), ('7', 2496), ('_', 107), ('T', 15062), (':', 22223), ('m', 225041), ('H', 11482), ('Y', 2381), ('>', 9), (',', 33680), ('V', 720), ('$', 1212), ('W', 37161), (';', 607), ('&', 1366), ('S', 7281), ('B', 4063), ('.', 108694), ('k', 111404), ('+', 123), ('b', 148176), ('<', 12), ('n', 552588), ('3', 3517), ('P', 3722), ('=', 103), ('p', 184115), ('N', 3017), ('a', 726754), ('c', 253811), ('?', 48816), ('R', 2942), ('t', 698276), ('[', 1), ('h', 397259), (' ', 1762678), ('o', 684697), ('f', 163468), ('^', 322), ('{', 9), ('0', 11139), ('@', 34), ('~', 133), ("'", 88729), ('4', 2882), ('Q', 1036), ('j', 23898), ('|', 66), ('g', 191416), (')', 889

### Implement an RNN and a GRU with tensorflow

**Q7 (10 points)** In this problem, you are supposed to train a recurrent neural network to model sentences. Particuarly, your model will receive 10 starting characters and should predict the rest of sentence. The model will be evaluated by per-character cross-entropy loss. You will get 
* 5 points if your per-character cross-entropy loss is less than 3.13 (the loss by predicting with character frequencies). 
* 8 points if your per-character cross-entropy loss is less than 2
* 10 points if your per-character cross-entropy loss is less than 1.5

\*The performance from a [paper](https://arxiv.org/pdf/1808.04444.pdf) indicates that an LSTM can achieve performance of 1.43 * ln(2) = 0.991. 
\*The `zip` program for compressing files roughly can achieve a performances of 3.522 bits per character. It corresponds to a performance of  3.522 * ln(2) = 2.441

In [76]:
## Create RNN and train the model
## NOTE: you may want to put this part of code in a separate .py file

class StubLayer(tf.keras.layers.Layer):
  """A stub model, which just returns zero. """
  def __init__(self, max_len, voc_size, freq):
    super(StubLayer, self).__init__()
    self.max_len = max_len
    self.voc_size = voc_size
    self.log_freq = np.log(freq.astype(np.float32))
    
    self.dense1 = tf.keras.layers.Dense(self.voc_size)

  def call(self, inputs):  
    """Predict logits for the next character 
    args:
        inputs: a integer tensor with shape (1,) indicating the previous character.
                Also include hidden states specified by the model
        
    returns:
        outputs: a tensor with shape (voc_size, )
    
    """    
    
    naive_pred = tf.constant(self.log_freq)
    
    outputs = self.dense1(inputs) * 0 + naive_pred
    
    return outputs


class NaiveCell(tf.keras.layers.Layer):
    def __init__(self, freq, units=freq.shape[0]):
        super(NaiveCell, self).__init__()
        
        self.log_freq = np.log(freq.astype(np.float32))
    
        self.state_size = units 
        self.output_size = units
        
    def call(self, inputs, states):
        prev_output = states[0]
        output = prev_output * 0 + self.log_freq
        return output, [output]

cell = NaiveCell(freq=freq)

naive_rnn = tf.keras.layers.RNN(cell)


# define a stub model and save it
# NOTE: in prediction, the input to the model is a matrix of shape [batch_size, max_len+1]. 
# The first column of the matrix is always a space `str_voc.index(' ')`. 
# The output of the model should has shape [batch_size, max_len, voc_size]

#model = tf.keras.Sequential()
#model.add(tf.keras.layers.InputLayer(input_shape=(None, 1,)))
#model.add(StubLayer(max_len=max_len, voc_size=len(str_voc), freq=freq))
#model.add(naive_rnn)
#model.compile(optimizer="Adam", loss="mse", metrics=["mae"])


naive_rnn = tf.keras.layers.SimpleRNN(freq.shape[0], activation='linear',
                                 kernel_initializer=tf.keras.initializers.Zeros(),
                                 recurrent_initializer=tf.keras.initializers.Zeros(), 
                                 bias_initializer=tf.keras.initializers.Constant(np.log(freq)))

model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(None, 1,)))
model.add(naive_rnn)
model.compile(optimizer="Adam", loss="mse", metrics=["mae"])

model.save('rnn_lm.mod') 


INFO:tensorflow:Assets written to: rnn_lm.mod/assets


### Test the trained model

In [104]:

# load the test data. NOTE: need to use the same vocabulary as the training data
test_sentences = load_data('test.csv')
test_data = convert_sen_to_data(test_sentences, str_voc)


print('Number of test instances:', len(test_data))

# TODO: replace this stub model with your powerful model
model = tf.keras.models.load_model('rnn_lm.mod')

test_mat = tf.keras.preprocessing.sequence.pad_sequences(test_data, maxlen=max_len, 
                                                         padding='post', truncating='post',
                                                         value=-1)
padding_flag = (test_mat < 0).astype(np.int32)

print('Evaluating the model ...')

loss_sum = 0

model.reset_states()
for t in range(max_len):

    # the input is -1 at beginning and then the character in the previous step afterwards
    if t == 0:
        char_input = - np.ones([test_mat.shape[0], 1, 1])
    else:
        char_input = np.reshape(test_mat[:, t - 1], [test_mat.shape[0], 1, 1])
        
    # predict is a matrix with shape [test_set_size, voc_size]
    predicts = model(char_input)
    
    # groundtruth is the character at the current step
    cur_chars = test_mat[:, t]
    cur_chars[cur_chars < 0] = 0 # change -1 to 0 since -1 is illegal for one-hot encoding
    labels = tf.one_hot(cur_chars, depth=len(str_voc))
    
    
    step_loss = tf.nn.softmax_cross_entropy_with_logits(labels, predicts)
    step_loss = step_loss * (1 - padding_flag[:, t]) # only count loss that are not padding characters
    
    loss_sum = loss_sum + np.sum(step_loss)

char_count = np.sum(1 - padding_flag)
per_char_loss = loss_sum / char_count

print('The total number of chars in the test set is ', char_count)

print('The per-char-loss is %.3f' % per_char_loss)


The total of  1131 non-ascii chars are removed 

Number of test instances: 40000
Evaluating the model ...
The total number of chars in the test set is  2739550
The per-char-loss is 3.130


### Use the model to generate sentences

Now we can use the trained model to generate text with a starting string. The naive model just predict frequent characters in the text, so there is no meaningful generation yet. See what you get from your models.

In [105]:
def generate_text(model, start_string, str_voc):
    """ Generate random text from a starting string. The code is modified from this 
    [example](https://www.tensorflow.org/tutorials/text/text_generation)"""

    # Number of characters to generate
    num_generate = 100 - len(start_string)

    # Converting our start string to numbers (vectorizing)
    input_eval = np.array([str_voc.index(s) for s in start_string])
    input_eval = np.reshape(input_eval, [1, -1, 1])

    # Empty string to store our results
    text_generated = []

    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = predictions[0]

        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions[None, :], num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.reshape([predicted_id], [1, 1, 1])

        text_generated.append(str_voc[predicted_id])

    return (start_string + ''.join(text_generated))


start_string = 'I have'
gen_sen = generate_text(model, start_string, str_voc)
gen_sen = gen_sen.split('\n')[0]

print('Starting from "' + start_string + '", the generated sentence is:')
print('"' + gen_sen + '"')

Starting from "I have", the generated sentence is:
"I have'w pta'evaoteedetyt eraolc ' oh etusorilnzduafhn.sugy rcsw lnsesismo  wgq th utrtowp :ad actth"
