In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time

In [2]:
poems = pd.read_csv('/Users/tiagoornelas/Documents/Ironhack/Projects/Final project/scraped_rnn.csv')

poems = poems.dropna()

In [3]:
# finding unique values per poem 

def unique_values (row):

    return sorted(set(row))

poems['unique'] = poems['Poem'].apply(unique_values)

In [4]:
# creating set of unique characters 

vocab_list = list(poems['unique'])

unique_list = []

for i in vocab_list:
    for x in i:
        unique_list.append(x)

unique_vocab = sorted(set(unique_list))

In [5]:
len(unique_vocab)

298

In [6]:
## vectorizing text

def char_encode (row):

    return tf.strings.unicode_split(row, input_encoding='UTF-8')

char_encode_df = poems['Poem'].apply(char_encode)


2022-08-28 13:06:10.061698: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# converting tokens to character ID's

ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(unique_vocab), mask_token=None)

In [8]:
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [9]:
def vectors_to_char_id (row):

    return ids_from_chars(tf.strings.unicode_split(row, 'UTF-8'))

all_ids = poems['Poem'].apply(vectors_to_char_id)


all_ids_df_list = list(all_ids)
all_ids_list = []

for i in all_ids_df_list:
    for x in i:
        all_ids_list.append(x)


In [12]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids_list)

In [None]:
for ids in ids_dataset.take(60):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

In [13]:
seq_length = 100

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'D' b'o' b'g' b' ' b'b' b'o' b'n' b'e' b',' b' ' b's' b't' b'a' b'p'
 b'l' b'e' b'r' b',' b' ' b'c' b'r' b'i' b'b' b'b' b'a' b'g' b'e' b' '
 b'b' b'o' b'a' b'r' b'd' b',' b' ' b'g' b'a' b'r' b'l' b'i' b'c' b' '
 b'p' b'r' b'e' b's' b's' b' ' b' ' b' ' b' ' b' ' b' ' b'b' b'e' b'c'
 b'a' b'u' b's' b'e' b' ' b't' b'h' b'i' b's' b' ' b'w' b'i' b'n' b'd'
 b'o' b'w' b' ' b'i' b's' b' ' b'l' b'o' b'o' b's' b'e' b'\xe2\x80\x94'
 b'l' b'a' b'c' b'k' b's' b' ' b's' b'u' b'c' b't' b'i' b'o' b'n' b','
 b' ' b'l' b'a' b'c' b'k'], shape=(101,), dtype=string)


In [14]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'Dog bone, stapler, cribbage board, garlic press      because this window is loose\xe2\x80\x94lacks suction, lack'
b's grip. Bungee cord, bootstrap, dog leash, leather belt      because this window had sash cords. They'
b" frayed. They broke. Feather duster, thatch of straw, empty bottle of Elmer's glue      because this "
b'window is loud\xe2\x80\x94its hinges clack open, clack shut. Stuffed bear, baby blanket, single crib newel      '
b"because this window is split. It's dividing in two. Velvet moss, sagebrush, willow branch, robin's wi"


In [15]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [16]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'Dog bone, stapler, cribbage board, garlic press      because this window is loose\xe2\x80\x94lacks suction, lac'
Target: b'og bone, stapler, cribbage board, garlic press      because this window is loose\xe2\x80\x94lacks suction, lack'


### Creating training batches

In [1]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

NameError: name 'dataset' is not defined

## Creating model

hyperparameters

In [30]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 800

In [31]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [32]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [33]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 299) # (batch_size, sequence_length, vocab_size)


In [34]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'he Pacific Ocean watching the sea stretch past a gauze of power lines into a green horizon this summ'

Next Char Predictions:
 b'=\xc3\xbf\xc3\xa0\xe7\xbe\x8eQ\xc3\xb4\xce\xbe\xe1\xba\xa3\xc3\xb6\xe7\x8c\xaa\xc3\x86S\xc3\x89\xc2\xb4+\xce\xb7\xc3\xaclc\xce\xac\xce\xad\xe2\x94\x80:\xc3\xa1"\xc5\xbb\xce\xb3F\xe2\x80\xa2&]\xc5\xbd6X\xe1\xbb\x91\xce\xa3\xcf\x8c}\xe5\xb9\xbf\xc3\xb1*\xc4\x93S\xcf\x89\xe2\x80\x82\xe6\x97\xa6\xc3\x94\xe2\x89\x88\xc5\xafi\xe2\x94\x80T\xe2\x80\xa2\xe1\xbb\xa3\xc3\x9a\xc3\x88\xc3\x9f\xce\xad\xe2\x80\xa2\xce\xaf\xcb\x9a\xe5\xb9\xbfN\xe1\xbb\x91\xe1\xbb\x89\xe2\x80\x9d\xce\xbe\xe2\x80\x94p.W\xe7\x9b\xae~\xcf\x80\xc3\x8b\xc3\xad\xe1\xba\xa5$Az\xc3\x9ac\xe6\x9c\xaa\xcc\xa7\xcc\x84k\xc3\x88\xc3\x93\xc3\xb9\xc4\x9f\xc3\xa1m+\xc5\x93\xc4\xaby\xc4\x9f\xc4\x9b\xc2\xa3P'


## Training Model

In [35]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [36]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 299)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(5.7031765, shape=(), dtype=float32)


In [37]:
tf.exp(example_batch_mean_loss).numpy()

299.81827

compiling model

In [38]:
model.compile(optimizer='adam', loss=loss)


In [39]:
## saving checkpoints

checkpoint_dir = './training_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [3]:
EPOCHS = 10

In [41]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [43]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [135]:
one_step_model

<__main__.OneStep at 0x1790ec940>

In [44]:
start = time.time()
states = None
next_char = tf.constant(['Love'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Love, less You kill well if you lie —                               so dream'd wrong,With grave, gazing, forbeam, and men municipl'd Sictries— Things fall apart, serameauble void.      Those branches, however care to see The God and wine, not one night of food; and the great men's long city, and the cold,          And the makes me wander nest, Meet your noses, and dearer and hair, We might art. I’m always on harvesting                Dark mother, in that purpose and two Survivors to a king's moment. Serven    is the deif night versoful after shear.                                                             on                a cold shore, a tone  standing at grappery          beaches frozen                   into the hull,    odds one of those human eye:       later that after death of philosophie used again.     Brown and the only lost interfusions    they died and go away.   Sweet nim crazed by now your difference,  both me cheer       in my hand. I think if it were fulfilled —I see 

In [45]:
tf.saved_model.save(one_step_model, 'one_step')





INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets
