In [0]:
# Upload files into the Colab session. The Kaggle setup cell below copies
# `kaggle.json` from the working directory, so that is presumably the file to
# upload here.
from google.colab import files
files.upload()

In [0]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
# Download the `paultimothymooney/poetry` dataset with the Kaggle CLI; the
# next cell unzips `poetry.zip`.
!kaggle datasets download -d paultimothymooney/poetry

In [0]:
!unzip poetry.zip

In [0]:
# List the working directory (run after the unzip step above).
!ls 

In [0]:
import numpy as np
import tensorflow as tf
import os

In [0]:
# Text file read as the training corpus in the next cell; presumably one of
# the files extracted from poetry.zip.
path_to_file = 'eminem.txt'

In [0]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle, which the bare open(...).read() form left open.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')

In [0]:
# Corpus length in characters.
len(text)

In [0]:
# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

In [0]:
# Lookup tables between characters and integer ids. A character's id is its
# position in `vocab`; `index2char` is the inverse, indexable by id.
index2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

In [0]:
# Encode the whole corpus as an array holding one character id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [0]:
# Inspect the encoded corpus (one integer id per character).
text_as_int

In [0]:
# Round-trip sanity check: decode ids back to characters. Only a short prefix
# is shown so the cell does not dump the entire corpus into the output.
"".join(index2char[i] for i in text_as_int[:250])

In [0]:
# Length of each model input; chunks of seq_length + 1 ids are built from
# this in a later cell.
seq_length = 100
# Number of whole (seq_length + 1)-character chunks in the corpus. Not
# referenced again in the visible notebook.
examples_per_epoch = len(text)//(seq_length+1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [0]:
# Sanity check: print the first five dataset elements decoded to characters.
for i in char_dataset.take(5):
  print(index2char[i.numpy()])

In [0]:
# Group the character stream into chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [0]:
# Show the first five chunks decoded back to text (repr makes newlines visible).
for item in sequences.take(5):
  print(repr("".join(index2char[item.numpy()])))

In [0]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  Args:
    chunk: Any sliceable sequence.

  Returns:
    A tuple `(chunk[:-1], chunk[1:])`: everything but the last element, and
    everything but the first, so each target element is the successor of the
    input element at the same position.
  """
  return chunk[:-1], chunk[1:]
  
# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

In [0]:
# Print the input and target shapes of the first two examples, each pair
# followed by a blank line.
for inp, targ in dataset.take(2):
  print(inp.shape, targ.shape, "", sep="\n")

In [0]:
# Number of sequences per training batch, and size of the shuffle buffer.
BATCH_SIZE = 64
BUFFER_SIZE = 1000

# Build the batched pipeline from `sequences` rather than from `dataset`
# itself: the self-referential form batched an already-batched dataset when
# this cell was run a second time. The resulting pipeline is the same.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [0]:
# Inspect the batched dataset object.
dataset

In [0]:
# Model hyperparameters, passed to build_model below.
vocab_size = len(vocab)  # number of distinct characters
embedding_dim = 256      # second argument of the Embedding layer
rnn_units = 1024         # units of the GRU layer

In [0]:
# Display the vocabulary size.
vocab_size

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: First argument of the Embedding layer and number of units of
      the final Dense layer.
    embedding_dim: Second argument of the Embedding layer.
    rnn_units: Number of units of the GRU layer.
    batch_size: Fixed batch dimension of the input; the second input
      dimension is left as None.

  Returns:
    An uncompiled `tf.keras.Sequential` model. The GRU is created with
    `stateful=True` and `return_sequences=True`; the Dense layer has no
    activation argument.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer="glorot_uniform",
  )
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Model used for training: its batch dimension is fixed to BATCH_SIZE.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [0]:
# Run one batch through the untrained model and compare output and target
# shapes. `tar` and `ex_pred` are reused by later cells.
for inp, tar in dataset.take(1):
  ex_pred = model(inp)
  print(ex_pred.shape, tar.shape)

In [0]:
# Model outputs for the first sequence of the example batch.
ex_pred[0]

In [0]:
# Draw one sample per position from the first example's outputs, which are
# passed to tf.random.categorical as logits.
sample_indices = tf.random.categorical(ex_pred[0], num_samples=1)

In [0]:
# Drop the trailing sample axis and convert to a NumPy array.
# NOTE: overwrites sample_indices, so re-run the previous cell before
# re-running this one.
sample_indices = tf.squeeze(sample_indices, axis=-1).numpy()

In [0]:
# Display the sampled ids.
sample_indices

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: Target ids.
    logits: Model outputs; `from_logits=True` tells Keras they are not
      normalized probabilities.

  Returns:
    The value returned by `tf.keras.losses.sparse_categorical_crossentropy`.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)
  return crossentropy
# Loss of the untrained model on the example batch from the earlier cell.
ex_batch_loss = loss(tar, ex_pred)

In [0]:
# Mean of the example-batch loss values.
ex_batch_loss.numpy().mean()

In [0]:
# Configure training with the Adam optimizer and the `loss` function above.
model.compile(optimizer='adam', loss=loss)

In [0]:
# Checkpoints go under checkpoint_dir; "{epoch}" in the file name is a
# placeholder left for the callback to fill in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves weights only (not the full model) during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix, save_weights_only=True)

In [0]:
# Number of epochs passed to model.fit.
EPOCHS = 10

In [0]:
# Train the model with the checkpoint callback attached; `history` holds the
# object returned by fit.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [0]:
# Rebuild the same architecture with batch_size=1 for text generation.
# NOTE: this replaces the trained `model`; weights are loaded in the next cell.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

In [0]:
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of handing None to
# load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      "No checkpoint found in {!r}; run the training cell first.".format(
          checkpoint_dir))
model.load_weights(latest_checkpoint)

In [0]:
# Build the model for inputs of shape [1, None]: batch size 1, unspecified
# second dimension.
model.build(tf.TensorShape([1, None]))

In [0]:
# Seed text for the step-by-step generation walk-through in the cells below.
start_string = "Fucking"

In [0]:
# Encode the seed text as character ids and display them. A character that is
# not a key of char2idx raises KeyError.
input_eval = [char2idx[s] for s in start_string]
input_eval

In [0]:
# Add a leading batch axis.
# NOTE: reassigns input_eval, so re-running this cell alone adds another axis.
input_eval = tf.expand_dims(input_eval, 0)

In [0]:
# Display the batched seed ids as a NumPy array.
input_eval.numpy()

In [0]:
# Run the seed through the model. build_model creates the GRU with
# stateful=True; generate_text calls model.reset_states() before generating.
prediction = model(input_eval)

In [0]:
# Shape of the raw model output.
prediction.shape

In [0]:
# Same shape check as the previous cell (duplicate cell).
prediction.shape

In [0]:
# Remove the batch axis (axis 0).
# NOTE: reassigns prediction, so re-running this cell alone squeezes again.
prediction = tf.squeeze(prediction, 0)

In [0]:
# Shape after removing the batch axis.
prediction.shape

In [0]:
# Draw one sample per row of `prediction`, which is passed to
# tf.random.categorical as logits.
prediction_id = tf.random.categorical(prediction, num_samples=1)

In [0]:
# Keep only the sample drawn for the last row, converted with .numpy().
# NOTE: overwrites prediction_id, so re-run the previous cell before
# re-running this one.
prediction_id=prediction_id[-1, 0].numpy()

In [0]:
# Display the sampled character id.
prediction_id

In [0]:
# Wrap the sampled id in a list and add a batch axis, the same way
# generate_text builds its next model input.
tf.expand_dims([prediction_id], 0)

In [0]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by sampling one character at a time from a trained model.

  Uses the module-level `char2idx` / `index2char` tables to convert between
  characters and ids.

  Args:
    model: Model that is called on an id tensor with a leading batch axis of
      size 1 and exposes `reset_states()`. Presumably the stateful model built
      with batch_size=1.
    start_string: Non-empty seed text. Every character must be a key of
      `char2idx`.
    num_generate: Number of characters to sample after the seed.
    temperature: Positive value the logits are divided by before sampling.
      Low temperatures give more predictable text, higher temperatures give
      more surprising text.

  Returns:
    `start_string` followed by the `num_generate` sampled characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Validate before touching the model: an empty seed gives the model nothing
  # to predict from, and a non-positive temperature breaks the logit scaling.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters sampled so far
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak in
  model.reset_states()
  for _ in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # Scale the logits, then sample; only the sample for the last position
      # is kept
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(index2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [0]:
# Generate and print text from the model, seeded with the given start string.
print(generate_text(model, start_string=u"Fucking"))