# Imports

In [1]:
import os

import numpy as np
import tensorflow as tf

In [3]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed, and forward slashes keep the relative path usable on both
# Windows and POSIX systems.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; confirm the corpus file's encoding before pinning one.
with open("./data/Songs of Solomon Corpus.txt", 'r') as corpus_file:
    songs_of_solomon = corpus_file.read()

# Exploration

In [4]:
# Report the corpus size in characters.
print('Length of text: {} characters'.format(len(songs_of_solomon)))

Length of text: 13686 characters


In [5]:
# Preview the first 250 characters of the corpus.
print(songs_of_solomon[:250])

Chapter 1

1 The Song of songs, which is Solomon's.
Beloved
2 Let him kiss me with the kisses of his mouth
for your love is better than wine.
3 Your oils have a pleasing fragrance.
Your name is oil poured forth,
therefore the virgins love you.
4 Take


In [6]:
# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = sorted(set(songs_of_solomon))
print('{} unique characters'.format(len(vocab)))

68 unique characters


# Preparation

In [7]:
# Two lookup tables over the vocabulary:
#   char2idx: character -> integer index (dict)
#   idx2char: integer index -> character (NumPy array, so it can be indexed
#             with whole arrays of indices at once)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the entire corpus as an array of vocabulary indices.
text_as_int = np.array([char2idx[symbol] for symbol in songs_of_solomon])

In [8]:
# Show how the first 50 characters from the text are mapped to integers.
# repr() keeps newlines visible as \n in the printed string.
print('{} ---- characters mapped to int ----> {}'.format(repr(songs_of_solomon[:50]), text_as_int[:50]))

"Chapter 1\n\n1 The Song of songs, which is Solomon's" ---- characters mapped to int ----> [19 47 40 55 59 44 57  1  7  0  0  7  1 35 47 44  1 34 54 53 46  1 54 45
  1 58 54 53 46 58  4  1 62 47 48 42 47  1 48 58  1 34 54 51 54 52 54 53
  3 58]


In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the corpus.
# Each chunk carries one extra character because the target sequence is the
# input shifted by one position.
examples_per_epoch = len(songs_of_solomon) // (seq_length + 1)

# Create training examples/targets
# from_tensor_slices yields the encoded corpus one character index at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first five elements back to characters.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

C
h
a
p
t


In [10]:
# Group the character stream into chunks of seq_length + 1 characters.
# drop_remainder=True discards a final chunk shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks to confirm they are contiguous pieces of text.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

"Chapter 1\n\n1 The Song of songs, which is Solomon's.\nBeloved\n2 Let him kiss me with the kisses of his "
'mouth\nfor your love is better than wine.\n3 Your oils have a pleasing fragrance.\nYour name is oil pour'
'ed forth,\ntherefore the virgins love you.\n4 Take me away with you.\nLet us hurry.\nThe king has brought'
' me into his chambers.\nFriends\nWe will be glad and rejoice in you.\nWe will praise your love more than'
' wine!\nBeloved\nThey are right to love you.\n5 I am dark, but lovely,\nyou daughters of Jerusalem,\nlike '


In [11]:
# Duplicate and shift sequence to form input and target text

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence of length n.

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the
        chunk without its last element and ``target_text`` is the chunk
        without its first element, so ``target_text[i]`` is the element that
        follows ``input_text[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [12]:
# Decode the first (input, target) pair to verify the one-character shift.
for input_example, target_example in dataset.take(1):
    print('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data: ', repr(''.join(idx2char[target_example.numpy()])))

Input data:  "Chapter 1\n\n1 The Song of songs, which is Solomon's.\nBeloved\n2 Let him kiss me with the kisses of his"
Target data:  "hapter 1\n\n1 The Song of songs, which is Solomon's.\nBeloved\n2 Let him kiss me with the kisses of his "


## Create Training batches

In [13]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 8

# Size of the buffer Dataset.shuffle draws from.
BUFFER_SIZE = 10000

# Build the batched training set directly from `sequences` rather than from
# the previous value of `dataset`. This keeps the cell idempotent: re-running
# it no longer batches an already-batched dataset.
# drop_remainder=True drops a final batch with fewer than BATCH_SIZE pairs.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((8, 100), (8, 100)), types: (tf.int32, tf.int32)>

In [14]:
# Number of distinct characters; build_model uses it for both the embedding
# input size and the width of the final Dense layer.
vocab_size = len(vocab)

# Dimensionality of each character embedding vector.
embedding_dim = 256

# Number of units in the GRU layer.
rnn_units = 1024

In [15]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; sets the embedding input
            size and the width of the output layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch dimension the model is built for. It is fixed
            because the GRU is stateful; the sequence length is left open.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final Dense layer
        has no activation, so its outputs are unnormalized scores (logits)
        over the vocabulary for every timestep.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [16]:
# Build the training model. `vocab_size` is the constant defined with the
# other hyperparameters; using it here (instead of recomputing len(vocab))
# keeps this build consistent with the batch_size=1 rebuild used for
# generation later in the notebook.
model = build_model(vocab_size=vocab_size,
                   embedding_dim=embedding_dim,
                   rnn_units=rnn_units,
                   batch_size=BATCH_SIZE)

In [17]:
# Run a single batch through the not-yet-trained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_length)")

(8, 100, 68) # (batch_size, sequence_length, vocab_length)


In [18]:
# Print the layers of the training model with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (8, None, 256)            17408     
_________________________________________________________________
gru (GRU)                    (8, None, 1024)           3938304   
_________________________________________________________________
dense (Dense)                (8, None, 68)             69700     
Total params: 4,025,412
Trainable params: 4,025,412
Non-trainable params: 0
_________________________________________________________________


In [19]:
# For the first example in the batch, sample one character index per timestep
# from the categorical distribution defined by the model's logits.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [20]:
# Display the sampled character indices.
sampled_indices

array([28, 54, 57, 44, 17, 16, 64,  3, 49, 12, 42, 66, 22, 37, 40,  3, 28,
       25, 12,  9, 27, 32,  8, 51, 26,  4,  9, 47, 41, 39, 58, 24, 44,  2,
       38, 60, 35, 54, 51, 16, 32,  3,  7, 55, 25, 44, 43, 16, 10, 61, 41,
       55, 43, 34, 25, 24, 37, 61,  6, 34, 51, 45, 29, 41, 57, 28, 57, 33,
       65,  7, 38, 43, 13, 52, 32, 18, 10, 14, 27, 59, 35, 52, 39,  1, 42,
        7,  2, 23, 31,  2, 29, 31, 58, 34,  8, 29,  2, 26, 21, 21],
      dtype=int64)

In [21]:
# Decode the first input sequence of the batch and the characters sampled for it.
# The input tensor is converted with .numpy() before it is used as a NumPy
# index, matching how tensors are decoded in the other cells of this notebook
# instead of relying on implicit tensor-to-array conversion.
print("Input: \n", repr("".join(idx2char[input_example_batch[0].numpy()])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Input: 
 'rh with my spice I have eaten my honeycomb with my honey I have drunk my wine with my milk.\nFriends\n'

Next Char Predictions: 
 "LoreA?y'j6c“FWa'LI63KP2lJ,3hbZsHe!YuTol?P'1pIed?4vbpdSIHWv0SlfMbrLrRz1Yd7mPB48KtTmZ c1!GO!MOsS2M!JEE"


In [22]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: Integer class indices (the target character ids).
        logits: Unnormalized model outputs; ``from_logits=True`` tells Keras
            not to expect probabilities.

    Returns:
        The tensor of loss values returned by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

# Evaluate the loss on the example batch as a baseline before any training.
example_batch_loss = loss(target_example_batch, example_batch_predictions)

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
# loss() returns an array of values, so average it into a single number for display.
print("scalar loss: ", example_batch_loss.numpy().mean())

Prediction shape:  (8, 100, 68)  # (batch_size, sequence_length, vocab_size)
scalar loss:  4.219777


In [23]:
# Configure training: Adam optimizer with the from-logits loss function defined in this notebook.
model.compile(optimizer='adam', loss=loss)

In [24]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'

# Name of the checkpoint files; the "{epoch}" placeholder gives each epoch
# its own file prefix.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model); the architecture is recreated
# with build_model when the weights are loaded back.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                        save_weights_only=True)

In [25]:
# Number of full passes over the training dataset.
EPOCHS = 50
# Train the model; the checkpoint callback is invoked during fitting to save weights.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [31]:
# Show the path prefix of the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_50'

In [32]:
# Rebuild the same architecture with batch_size=1 for text generation.
# build_model fixes the batch dimension (batch_input_shape with a stateful
# GRU), so a separate instance is needed to feed one sequence at a time.
# NOTE: this rebinds `model`; from here on it refers to the inference model,
# not the one that was trained with BATCH_SIZE.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# Restore the weights saved by the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# Build with batch size 1 and an unspecified sequence length.
model.build(tf.TensorShape([1, None]))

In [33]:
# Print the layers of the rebuilt batch_size=1 model to confirm the new batch dimension.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            17408     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_2 (Dense)              (1, None, 68)             69700     
Total params: 4,025,412
Trainable params: 4,025,412
Non-trainable params: 0
_________________________________________________________________


# Generating Text

In [34]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time with a trained model.

    The start string is fed through the model first to prime its recurrent
    state. After that, each sampled character becomes the next input, so the
    model continues from its own output.

    Args:
        model: Model whose call returns per-timestep logits over the
            vocabulary and which exposes ``reset_states()``. The input is
            given a batch dimension of 1, so the model must accept batch
            size 1.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` mapping.
        num_generate: Number of characters to generate. Defaults to 1000,
            the value that used to be hard-coded.
        temperature: Positive divisor applied to the logits before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text. Defaults to 1.0.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            greater than zero.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    # Validate up front: an empty seed gives the model nothing to predict
    # from, and a non-positive temperature makes the scaled logits meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be greater than zero")

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    # Add the batch dimension: shape (1, len(start_string))
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters are collected here and joined at the end
    text_generated = []

    # Here batch size == 1
    # Clear any recurrent state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample the next character from the resulting
        # categorical distribution. Only the sample for the last timestep is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted character as the next input to the model;
        # the stateful model carries the hidden state between calls.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [35]:
# Generate text seeded with "You are " and print it.
print(generate_text(model, start_string=u"You are "))

You are like jewels,
the work of the hands of a skillful workman.
2 Your body is like a rou tr beartimun.
5 You have ravished my heart, my sister, my bride.
I have gathered my myrngrthan made himself a carriage
of the wood of Lebenoverol.
He hear on you ouch in through the latch opening.
My heart pounded for him, but I did, and I am his.
He browses among the lilies,
4 You are beautiful, my love.
There in the day whon my gore, my beloved, lest I sould kiss you yes, and no one would despise me.
2 I would lead you, bringing you into my mother's house,
into the chamber of her who conceived me.
5 I adjure you, daughters of Jerusalem,
becude the lipto my rome.
I will take hold of her buthed,
that you not stir up, nor ano mountains,
skipping on the hills.
9 My beloved has gone dovely as Jerusalem,
awesome as an army with banners.
5 Turn away your eyes from me,
for they have over my heart was awake.
I will take hold of its fruit.”
Lety son or mashe mornimg of the to conceivedy me.
They gave yo

In [39]:
# Generate another sample from the same seed. Characters are drawn at random,
# so the result differs from run to run.
generated_text = generate_text(model, start_string=u"You are ")

In [41]:
# Print the stored sample.
print(generated_text)

You are beautiful.
Your eyes are doves.
Beloved
16 Behold, you are beautiful, my love, as Tirzah,
ther of the mountain of myruhalem,
that you not stir up, nor awaken love,
until it so desires.
8 The voice of my beloved's.
mo hear love,
my tente.
Ther's arm me.
11 wis head is like the pmell of Lebanon.
12 A locked up garden is my sister, my bride.
Beloved
2 Let him kiss me with the kisses of his heart.
Le is the favorite one ead,
for my conceived your mountain of myrrh,
to the hill of frime,
with me from Lebanon, my beloved, and thighom mount Gidear.
2 Your teeth are like a newly shorn flock,
which have come up from the washing,
whereon a thousand,
which If for him whom My soul loves.
I held him, and would not let him grome me.
Your hair is like a le the shee arf ofly scarcely passed.
The vines are in foroushavely.
They are right to love you.
5 I am dark, but my heart was awake.
I will take hold of its fruit.”
Let your breasts like its fruit.
8 I said, “I will climb up into the palm to 

# Comments

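* The most important thing I picked up was the importance of batch size. I was working with a small corpus (13,686 characters, i.e. only about 135 training sequences of 100 characters), and using the usual batch_size=128 kept generating incoherent text: with `drop_remainder=True` that leaves just one batch, and therefore one weight update, per epoch. The batch size of 8 used in this notebook gives 16 updates per epoch.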
* Also, the small size of the corpus makes the generated text a little predictable: I can see two- to three-word phrases repeated verbatim from the training corpus.