# 1. Project two: character-level language modeling in TensorFlow
## 1. Preprocessing the dataset
### 1. Read the dataset as plain text
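- We also remove the text before the title marker (`THE MYSTERIOUS ISLAND`) and everything from the `End of the Project Gutenberg` marker onwards, since those portions are not part of the novel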

In [8]:
import numpy as np

## Reading and processing text
# Keep the untouched file contents under their own name so that re-running
# this cell always trims from the original text.
with open('data/1268-0.txt', 'r', encoding="utf8") as fp:
    raw_text = fp.read()

# str.index raises ValueError when a marker is missing, whereas str.find
# would return -1 and silently produce a wrong slice.
start_indx = raw_text.index('THE MYSTERIOUS ISLAND')
# The end marker is only searched for after the start marker.
end_indx = raw_text.index('End of the Project Gutenberg', start_indx)
print(start_indx, end_indx)

# Trimmed text and the set of distinct characters it contains.
text = raw_text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

567 1112917
Total Length: 1112350
Unique Characters: 80


### 2. Build the dictionary to map characters to integers
- Reverse mapping is done via indexing a NumPy array

In [9]:
# Sorted vocabulary: a character's position in this list is its integer id.
chars_sorted = sorted(char_set)
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# NumPy view of the vocabulary so integer ids can be decoded by array indexing.
char_array = np.array(chars_sorted)

# Encode the whole text as a 1-D int32 array of character ids.
text_encoded = np.fromiter(
    (char2int[ch] for ch in text),
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

# Round-trip check: encode the first 15 characters, decode the next 6 ids.
head_ids = text_encoded[:15]
next_ids = text_encoded[15:21]
print(text[:15], '     == Encoding ==> ', head_ids)
print(next_ids, ' == Reverse  ==> ', ''.join(char_array[next_ids]))

Text encoded shape:  (1112350,)
THE MYSTERIOUS       == Encoding ==>  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]  == Reverse  ==>  ISLAND


### 3. Create a TensorFlow dataset from this array

In [10]:
import tensorflow as tf

# Each dataset element is a single encoded character.
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Peek at the first five elements and decode them back to characters.
for element in ds_text_encoded.take(5):
    char_id = element.numpy()
    print('{} -> {}'.format(char_id, char_array[char_id]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


### 4. Separate the input and target sequences accordingly

In [12]:
# A chunk holds one character more than the sequence length so that the
# input and the one-step-shifted target can both be cut from it.
seq_length = 40
chunk_size = seq_length + 1

# Group consecutive characters into chunks; an incomplete trailing chunk is dropped.
ds_chunks = ds_text_encoded.batch(
    batch_size=chunk_size,
    drop_remainder=True)

## define the function for splitting x & y
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
ds_sequences = ds_chunks.map(split_input_target)

## inspection:
# Decode the first two pairs to confirm the target is the input shifted by one.
for input_ids, target_ids in ds_sequences.take(2):
    input_text = ''.join(char_array[input_ids.numpy()])
    target_text = ''.join(char_array[target_ids.numpy()])
    print(' Input (x):', repr(input_text))
    print('Target (y):', repr(target_text))
    print()

Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

 Input (x): ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
Target (y): 'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



### 5. Divide the dataset into mini-batches

In [13]:
# Batch size: number of (input, target) sequence pairs per mini-batch
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer
BUFFER_SIZE = 10000

# Fix the TensorFlow seed so the shuffle order is repeatable
tf.random.set_seed(1)
# drop_remainder is left at its default here, so the batch dimension of the
# resulting dataset is not fixed (it is reported as None).
ds = ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Display the dataset's element shapes and dtypes
ds

<BatchDataset shapes: ((None, 40), (None, 40)), types: (tf.int32, tf.int32)>

## 2. Building a character-level RNN model
### 1. Write a function that defines an RNN model using the Keras Sequential class, specify the training parameters, and obtain an RNN model

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units):
    """Assemble the character-level RNN as a Keras Sequential model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the width of the output layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled tf.keras.Sequential made of an Embedding layer, an
        LSTM that returns its output at every time step, and a Dense
        layer with no activation (one raw score per character).
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim))
    # return_sequences=True: a prediction is needed at every position.
    model.add(layers.LSTM(rnn_units, return_sequences=True))
    model.add(layers.Dense(vocab_size))
    return model

## Set the training parameters
# Seed TensorFlow before the model (and its initial weights) is created.
tf.random.set_seed(1)

charset_size = len(char_array)  # one output score per known character
embedding_dim = 256
rnn_units = 512

model = build_model(charset_size, embedding_dim, rnn_units)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         20480     
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


### 2. Train the model

In [15]:
# The final Dense layer has no activation, so the loss is told that it
# receives raw logits rather than probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)

model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x220409d1d08>

## 3. Evaluation phase: generating new text passages
### 1. Drawing random samples from a categorical distribution
- If we simply select the element with the highest logit value, the model will always produce the same text

In [16]:
tf.random.set_seed(1)

# Three equal logits: every class is equally likely.
logits = [[1.0, 1.0, 1.0]]
probabilities = tf.math.softmax(logits).numpy()[0]
print('Probabilities:', probabilities)

# Draw ten class indices from the distribution defined by the logits.
samples = tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
array([[0, 0, 1, 2, 0, 0, 0, 0, 1, 0]], dtype=int64)


In [17]:
tf.random.set_seed(1)

# The third logit is larger, so class 2 should dominate the samples.
logits = [[1.0, 1.0, 3.0]]
probabilities = tf.math.softmax(logits).numpy()[0]
print('Probabilities:', probabilities)

# Draw ten class indices from the distribution defined by the logits.
samples = tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
array([[2, 0, 2, 2, 2, 0, 1, 2, 2, 0]], dtype=int64)


### 2. Define the sampling function and generate some new text
- The sampling function takes a starting string as input and encodes it. At each step it feeds the current input to the model, samples a new character from the logits predicted for the last position, and appends that character both to the generated text and to the input. The input is then truncated to its last `max_input_length` characters before the next character is predicted

In [18]:
def sample(model, starting_str,
           len_generated_text=500,
           max_input_length=40,
           scale_factor=1.0):
    """Generate text one character at a time by sampling from the model.

    Args:
        model: Callable Keras model that maps a (1, seq_len) tensor of
            character ids to logits of shape (1, seq_len, vocab_size).
        starting_str: Non-empty seed text. Every character must be a key
            of the module-level `char2int` mapping (otherwise KeyError).
        len_generated_text: Number of characters to generate after the seed.
        max_input_length: After each step the model input is truncated to
            its last `max_input_length` character ids.
        scale_factor: Multiplier applied to the logits before sampling.

    Returns:
        `starting_str` followed by `len_generated_text` sampled characters.

    Raises:
        ValueError: If `starting_str` is empty.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    encoded_input = [char2int[s] for s in starting_str]
    encoded_input = tf.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.reset_states()
    for _ in range(len_generated_text):
        logits = model(encoded_input)
        # Drop the batch dimension: (1, seq_len, vocab) -> (seq_len, vocab).
        logits = tf.squeeze(logits, 0)

        scaled_logits = logits * scale_factor
        # One draw per input position; result has shape (seq_len, 1).
        sampled = tf.random.categorical(
            scaled_logits, num_samples=1)

        # Only the draw for the last position is the next character.
        # Indexing [-1, 0] also works when seq_len == 1, where squeezing
        # the whole tensor first would leave a scalar that cannot be indexed.
        new_char_indx = int(sampled[-1, 0].numpy())

        generated_str += str(char_array[new_char_indx])

        # Append the new id and keep only the most recent window as input.
        new_char_indx = tf.expand_dims([new_char_indx], 0)
        encoded_input = tf.concat(
            [encoded_input, new_char_indx],
            axis=1)
        encoded_input = encoded_input[:, -max_input_length:]

    return generated_str

# Fixed seed so the sampled passage is repeatable; default scale factor (1.0).
tf.random.set_seed(1)
print(sample(model, 'The island'))

The island is or circies or of stone. These well of the radiant
proved with turn to the highest bears questional.

There was no occasion for three day and stood with rocks before the pontrance was pursued from an immense elamners,
who had fallen into the island and might be gone against a sole harfless finished quait and all that and dry wanting together
up on the rocks, whose six filled on this
beands of Red Creek Glycerine, and the sunning knelled to the beach, mulphy open. The lava made not all
three h


## 4. Predictability vs. randomness
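- Scaling the logits computed by the RNN model before passing them to tf.random.categorical() allows us to control the predictability of the generated samples (that is, generating text following the learned patterns from the training text versus adding more randomness). A scale factor above 1 sharpens the distribution and makes the text more predictable, while a factor below 1 flattens it toward uniform and makes the text more random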

### 1. Demonstration of scaling

In [19]:
logits = np.array([[1.0, 1.0, 3.0]])

# Softmax of the same logits after multiplying by progressively smaller factors.
probs_unscaled = tf.math.softmax(logits).numpy()[0]
probs_scaled_half = tf.math.softmax(0.5*logits).numpy()[0]
probs_scaled_tenth = tf.math.softmax(0.1*logits).numpy()[0]

print('Probabilities before scaling:        ', probs_unscaled)
print('Probabilities after scaling with 0.5:', probs_scaled_half)
print('Probabilities after scaling with 0.1:', probs_scaled_tenth)

Probabilities before scaling:         [0.10650698 0.10650698 0.78698604]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611688]
Probabilities after scaling with 0.1: [0.31042377 0.31042377 0.37915245]


### 2. Generating texts with different scaling factors

In [20]:
# Logits multiplied by 2.0 before sampling.
tf.random.set_seed(1)
print(sample(model, 'The island', scale_factor=2.0))

The island was so as to discover the poultry-yard, and the heat was continued.

The colonists had had the convicts had not less than extreme surprise, and the truth had been discovered the colonists.

“The will not be resistance the internal signal to the sea, and the convicts were already painful to the corral and the lad served as if the convicts would
produce a stern to the shore.

The first bark of the Mercy, and the flood of the other end of the water.

The reporter and his companions had already sti


In [21]:
# Logits multiplied by 0.5 before sampling.
tf.random.set_seed(1)
print(sample(model, 'The island', scale_factor=0.5))

The island
happilid a drems parts,
withlessly a? Taarif-sadcepe or valution. “Pellowd Cyrul or a lantern, adreps Spilett. Tollow-deeq riferachmve.

Nea?--low, whyn’,
especialas ockurarigish Harding,” observed
dayend, tiorde-flammed. Lef Grant’ somhorsity is heard. The cossition immediftwappie-clemescops, domphiams braThted dash
agains, who
acquierhup” re up, Harding.
 thus wishint did not quitw.

“Albordir.

On, doffully
hoper, during vessel fleard!”
returned Gbantt qunsreg. Nothingly, I
will--77

Yet?” a
