<a href="https://colab.research.google.com/github/xarvel/DataScience/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
import tensorflow as tf

import numpy as np
import os
import time
from tqdm import tqdm

# Print the version of the imported TensorFlow package.
print(tf.__version__)

2.14.0


In [108]:
# Fetch the training corpus through Keras' get_file helper and keep the
# returned local path; the file is stored under the name 'bible.txt'.
path_to_file = tf.keras.utils.get_file('bible.txt', 'https://drive.google.com/uc?export=download&id=1-4rl2K0zr5zj3Lj8tzVOEMF152gtApVP')

In [109]:
#@title utils

def split_input_target(sequence):
    """Split a sequence into a next-element prediction pair.

    Args:
        sequence: Any sliceable sequence (string, list, tensor, ...).

    Returns:
        A tuple ``(input_text, target_text)``: the input is the sequence
        without its last element, the target is the sequence without its
        first element (i.e. the input shifted by one position).
    """
    return sequence[:-1], sequence[1:]

def text_stats(text, sample_size=250, start=150):
  """Print a short excerpt and basic statistics of `text`.

  Args:
    text: The corpus as a string.
    sample_size: Number of characters to show in the excerpt.
    start: Offset of the first excerpt character. The default skips the
      first 150 characters rather than showing the very beginning.

  Returns:
    The sorted list of unique characters found in `text`.
  """
  print(f'Sample {sample_size} characters:')
  print('-' * 80)
  # Excerpt of `sample_size` characters beginning at offset `start`.
  print(text[start:start + sample_size])
  print('-' * 80)
  # The length of the text is its number of characters.
  print(f'Length of text: {len(text)} characters')
  # The unique characters in the text, in sorted order.
  vocab = sorted(set(text))
  print(f'{len(vocab)} unique characters')

  return vocab

In [110]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The
# context manager closes the file handle as soon as the read completes.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
vocab = text_stats(text)

# Lookup layer mapping each character to an integer id (no mask token).
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None
)
# Inverse lookup (id -> character). It reuses the forward layer's full
# vocabulary so both directions agree on every id.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None
)

Sample 250 characters:
--------------------------------------------------------------------------------
о и землю.

2 Земля же была безвидна и пуста, и тьма над бездною, и Дух Божий носился над водою.

3 И сказал Бог: да будет свет. И стал свет.

4 И увидел Бог свет, что он хорош, и отделил Бог свет от тьмы.

5 И назвал Бог свет днем, а тьму ночью. И б
--------------------------------------------------------------------------------
Length of text: 4196242 characters
92 unique characters


In [111]:
#@title CONFIG

# --- Data ------------------------------------------------------------------
# Characters per training input; each example chunk holds one extra
# character so that input and target can be offset by one.
SEQUENCE_LENGTH = 100
# Examples per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer. tf.data shuffles within a bounded buffer
# instead of loading the whole (possibly infinite) stream into memory.
BUFFER_SIZE = 10000

# --- Model -----------------------------------------------------------------
# Width of the character embedding vectors (identifier spelling is kept
# because other cells refer to this exact name).
EMBEDDING_DIMENTION = 256
# Number of units in the recurrent layer.
RNN_UNITS = 1024
# Vocabulary size as reported by the StringLookup layer.
VOCAB_SIZE = len(ids_from_chars.get_vocabulary())

# --- Training --------------------------------------------------------------
EPOCHS = 10
SEED = 1

# --- Checkpointing ---------------------------------------------------------
# Directory that receives the checkpoints.
CHECKPOINT_DIR = './training_checkpoints'
# Checkpoint file name template; `{epoch}` is filled in at save time.
CHECKPOINT_PREFIX = os.path.join(CHECKPOINT_DIR, 'ckpt_{epoch}')
# Weights are saved whenever the epoch number is a multiple of this value.
CHECKPOINT_INTERVAL = 5

# --- Optimisation ----------------------------------------------------------
optimizer = tf.keras.optimizers.Adam()
# The model emits raw logits, hence from_logits=True.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [112]:
# Set TensorFlow's global random seed from the SEED constant.
tf.random.set_seed(SEED)

In [113]:
def text_from_ids(ids):
  """Convert token ids back to text.

  Looks up the character for every id with `chars_from_ids` and joins the
  characters along the last axis into a single string tensor.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [114]:
# Split the corpus into individual UTF-8 characters and map each to its id.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8')
)
# Stream of single ids ...
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# ... grouped into fixed-size chunks of SEQUENCE_LENGTH + 1 ids; a trailing
# chunk that is too short is dropped.
sequences = ids_dataset.batch(
    batch_size=SEQUENCE_LENGTH + 1,
    drop_remainder=True,
)


In [115]:
#@title DATASET


# Turn every chunk of ids into an (input, target) pair offset by one step.
dataset = sequences.map(split_input_target)
# Shuffle the pairs, then group them into fixed-size batches; an incomplete
# final batch is dropped.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# Count the batches while the dataset is still finite: len() is not
# available once repeat() has made it infinite.
STEPS_PER_EPOCH = len(dataset)
# Repeat indefinitely and prefetch once, at the very end of the pipeline,
# so input preparation overlaps with training.
dataset = dataset.repeat().prefetch(tf.data.AUTOTUNE)

print(STEPS_PER_EPOCH)

649


In [116]:
class RNNModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct token ids; used as the embedding input
      size and as the number of output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The Keras Model constructor takes no positional arguments here;
    # passing `self` explicitly would hand it a stray positional argument.
    super().__init__()

    self.embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim
    )
    # return_sequences gives an output for every time step; return_state
    # additionally exposes the final state so generation can continue
    # from where the previous call stopped.
    self.gru = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        return_state=True
    )
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Batch of token id sequences.
      states: Optional initial GRU state. When None, the GRU's own
        initial state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to the layers.

    Returns:
      The per-step logits over the vocabulary, or a tuple
      `(logits, states)` when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x


class OneStep(tf.keras.Model):
  """Samples one next character per call from a wrapped language model.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`,
      returning `(logits, states)`.
    chars_from_ids: Lookup layer mapping token ids to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the "[UNK]" id and zero
    # everywhere else, so "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[float('-inf')] * len(unk_ids),
        dense_shape=[vocab_len],
    )
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: String tensor holding the text generated so far.
      states: Model state from the previous call, or None to start fresh.

    Returns:
      A tuple `(predicted_chars, states)` with the sampled characters and
      the updated model state.
    """
    # Strings -> characters -> dense tensor of token ids.
    chars = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(chars).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(
        inputs=token_ids,
        states=states,
        return_state=True,
    )

    # Keep only the last time step, scale by the temperature, then add
    # the mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature
    masked_logits = last_logits + self.prediction_mask

    # Draw one token id per batch entry from the resulting distribution.
    sampled_ids = tf.squeeze(
        tf.random.categorical(masked_logits, num_samples=1),
        axis=-1,
    )

    # Token ids -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [117]:
# Language model sized from the CONFIG constants.
model = RNNModel(VOCAB_SIZE, EMBEDDING_DIMENTION, RNN_UNITS)

# Sampling wrapper sharing the same model instance and lookup layers.
one_step_model = OneStep(
    model=model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars,
)

In [118]:
# The model consumes batches of token id sequences, so build it with the
# (batch, sequence length) shape used for training; the vocabulary size is
# not an input dimension.
model.build((BATCH_SIZE, SEQUENCE_LENGTH))
model.summary()

Model: "rnn_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     multiple                  23808     
                                                                 
 gru_5 (GRU)                 multiple                  3938304   
                                                                 
 dense_5 (Dense)             multiple                  95325     
                                                                 
Total params: 4057437 (15.48 MB)
Trainable params: 4057437 (15.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [119]:
def predict_text(start_text, size):
  """Generate `size` characters that continue `start_text`.

  Calls `one_step_model.generate_one_step` repeatedly, feeding each sampled
  character and the returned state into the next call.

  Returns:
    The prompt followed by the generated characters, as a Python string.
  """
  current = tf.constant([start_text])
  pieces = [current]
  rnn_state = None

  for _ in range(size):
    current, rnn_state = one_step_model.generate_one_step(current, states=rnn_state)
    pieces.append(current)

  # Concatenate prompt and samples element-wise; the batch has one entry.
  joined = tf.strings.join(pieces)
  return joined[0].numpy().decode('utf-8')

def sample_text():
  """Print a 40-character sample for a fixed prompt and the time it took."""
  started = time.time()
  rule = '-' * 80
  print('Sample:')
  print(rule)
  print(predict_text('И сказал Господь:', 40))
  print(rule)
  elapsed = time.time() - started
  print('\nRun time:', elapsed)

@tf.function
def train_step(inputs):
  """Run one optimisation step on a single batch.

  Args:
    inputs: A pair `(inputs, labels)`.

  Returns:
    A dict with the batch loss under the key 'loss'.
  """
  features, labels = inputs

  # Record the forward pass so the loss can be differentiated.
  with tf.GradientTape() as tape:
    logits = model(features, training=True)
    loss = loss_func(labels, logits)

  variables = model.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return {'loss': loss}

In [120]:
def train(dataset):
  """Train the global `model` for EPOCHS epochs of STEPS_PER_EPOCH steps.

  After every epoch a text sample, the mean epoch loss and the epoch
  duration are printed. Weights are saved whenever the epoch number is a
  multiple of CHECKPOINT_INTERVAL.

  Args:
    dataset: Iterable of `(input, target)` batches. A single iterator is
      used across all epochs, so it must yield at least
      EPOCHS * STEPS_PER_EPOCH batches (e.g. a repeated dataset).
  """
  mean = tf.metrics.Mean()
  train_iterator = iter(dataset)

  for epoch in range(1, EPOCHS + 1):
    start = time.time()

    print(f'Epoch: {epoch}/{EPOCHS}')
    pbar = tqdm(range(STEPS_PER_EPOCH))

    # Start each epoch's running mean from scratch. reset_state() is the
    # current name of the deprecated reset_states().
    mean.reset_state()
    for step in pbar:
      inp, target = next(train_iterator)
      logs = train_step([inp, target])
      mean.update_state(logs['loss'])
      pbar.set_postfix({
        'loss': round(float(logs['loss']), 4)
      })
      pbar.set_description(f'Current step {step}')

    # Save (checkpoint) the weights every CHECKPOINT_INTERVAL epochs.
    if epoch % CHECKPOINT_INTERVAL == 0:
      model.save_weights(CHECKPOINT_PREFIX.format(epoch=epoch))

    sample_text()

    print(f'Loss: {round(float(mean.result().numpy()), 4)}')
    print(f'Time for epoch {epoch} is {time.time() - start} sec')

In [121]:
# Run the training loop on the prepared dataset.
train(dataset)

Epoch: 1/10


Current step 648: 100%|██████████| 649/649 [00:39<00:00, 16.26it/s, loss=1.59]


--------------------------------------------------------------------------------
И сказал Господь: опесь для устровал о весь ее.

4 Ствана
--------------------------------------------------------------------------------

Run time: 1.8913862705230713
Loss: 2.1148
Time for epoch 1 is 41.81899666786194 sec
Epoch: 2/10


Current step 648: 100%|██████████| 649/649 [00:36<00:00, 17.73it/s, loss=1.44]


--------------------------------------------------------------------------------
И сказал Господь: зверя тебе, то свидетельствовали на Гос
--------------------------------------------------------------------------------

Run time: 0.1816551685333252
Loss: 1.4641
Time for epoch 2 is 36.795313119888306 sec
Epoch: 3/10


Current step 648: 100%|██████████| 649/649 [00:39<00:00, 16.30it/s, loss=1.3]


--------------------------------------------------------------------------------
И сказал Господь: вот, выше всей землею и крестала, доста
--------------------------------------------------------------------------------

Run time: 0.19825172424316406
Loss: 1.3226
Time for epoch 3 is 40.012763023376465 sec
Epoch: 4/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.70it/s, loss=1.25]


--------------------------------------------------------------------------------
И сказал Господь: если будешь всеми умывальникум пешед че
--------------------------------------------------------------------------------

Run time: 0.20275044441223145
Loss: 1.2507
Time for epoch 4 is 39.08648228645325 sec
Epoch: 5/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.86it/s, loss=1.22]


--------------------------------------------------------------------------------
И сказал Господь: знаете, что Тебе два слепых.

13 После 
--------------------------------------------------------------------------------

Run time: 0.24708104133605957
Loss: 1.2007
Time for epoch 5 is 38.96610689163208 sec
Epoch: 6/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.75it/s, loss=1.18]


--------------------------------------------------------------------------------
И сказал Господь: вижу ли нужды нас на устах, не пойдем в
--------------------------------------------------------------------------------

Run time: 0.10506129264831543
Loss: 1.1606
Time for epoch 6 is 38.86591339111328 sec
Epoch: 7/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.80it/s, loss=1.14]


--------------------------------------------------------------------------------
И сказал Господь: не терпи со Мною ярость и на отвергнить
--------------------------------------------------------------------------------

Run time: 0.11052989959716797
Loss: 1.126
Time for epoch 7 is 38.74115061759949 sec
Epoch: 8/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.68it/s, loss=1.11]


--------------------------------------------------------------------------------
И сказал Господь: не войду с тобою в тот город и рабов их
--------------------------------------------------------------------------------

Run time: 0.17434358596801758
Loss: 1.0945
Time for epoch 8 is 39.09170937538147 sec
Epoch: 9/10


Current step 648: 100%|██████████| 649/649 [00:38<00:00, 16.67it/s, loss=1.12]


--------------------------------------------------------------------------------
И сказал Господь: вы слышали?

17 Приступите от вечера, а
--------------------------------------------------------------------------------

Run time: 0.22196388244628906
Loss: 1.0672
Time for epoch 9 is 39.17843461036682 sec
Epoch: 10/10


Current step 648: 100%|██████████| 649/649 [00:39<00:00, 16.63it/s, loss=1.06]


--------------------------------------------------------------------------------
И сказал Господь: отвержу он в Вазира и бывших тучный хле
--------------------------------------------------------------------------------

Run time: 0.11389756202697754
Loss: 1.0431
Time for epoch 10 is 39.19634771347046 sec


In [122]:
# Export the sampling wrapper as a SavedModel in the 'model' directory ...
tf.saved_model.save(obj=one_step_model, export_dir='model')
# ... and rebind the name to the reloaded copy, so that later calls go
# through the exported model.
one_step_model = tf.saved_model.load(export_dir='model')



In [123]:
# Generate 40 characters for the prompt using the current one_step_model.
print(predict_text('И сказал Господь:', 40))

И сказал Господь: об язычнику, то вот, враги ее была вели
