##### Copyright 2019 The TensorFlow Authors.
##### NOTICE: heavily modified by author of this repository

# Text generation with an RNN

## Setup

This is a test "program" to make sure Python works.

In [None]:
"helo worldlrdl22"

### Import TensorFlow and other libraries

In [None]:
import os
import tensorflow as tf

import numpy as np
import time

This cell loads the data. If you aren't using Google Colab, change the `path_to_file` variable to point to wherever your comments dataset is.

In [None]:
# Colab only: mount Google Drive so the dataset stored there becomes readable.
from google.colab import drive  # remove this if you aren't using Google Colab
drive.mount("/content/drive")  # remove this if you aren't using Google Colab

# Location of the comments dataset; point this at your own file when not on Colab.
path_to_file = "/content/drive/MyDrive/gdpt/comments.txt"

# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(path_to_file, 'rb') as dataset_file:
    text = dataset_file.read().decode(encoding='utf-8')

this cell is optional, you can skip it if you want to

In [None]:
# Sorted list of the distinct characters in the corpus. Built first because the
# prints below only read from `text` and `vocab`.
vocab = sorted(set(text))

# Corpus size, a 250-character preview, and the vocabulary size.
print(f'Length of text: {len(text)} characters')
print(text[:250])
print(f'{len(vocab)} unique characters')

this cell defines important variables and functions

In [None]:
# Forward lookup layer: character -> integer id, built from the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Inverse lookup layer: integer id -> character. It reuses the forward layer's
# vocabulary so both directions agree on the id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)
def text_from_ids(ids):
  """Convert integer ids back into text.

  Each id is looked up with `chars_from_ids`, and the resulting characters
  are joined along the last axis.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)
# Split the corpus into individual UTF-8 characters and map each one to its id.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Wrap the ids in a tf.data dataset built from slices of `all_ids`.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

You can change the `seq_length` variable if you'd like to experiment a little.

In [None]:
# Number of input characters per training example.
seq_length = 100 
# Group the ids into chunks of seq_length+1: split_input_target turns each chunk
# into an input and a target that are offset by one position, so one extra
# element is needed. drop_remainder=True discards a final, shorter chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(sequence):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk into an (input_text, target_text) pair.
dataset = sequences.map(split_input_target)

### this step is important
change the batch_size depending on your model size, see more info in the code comments

In [None]:
# Batch size: number of sequences grouped into one batch.
#   - with 1-3 scraped levels, use 64
#   - with more data, 128 is preferred
# Feel free to experiment with this value.
BATCH_SIZE = 64

# Size of the shuffle buffer; changing this is not recommended.
BUFFER_SIZE = 10000

# Shuffle the examples, group them into fixed-size batches (dropping a final
# short batch), then prefetch with an automatically tuned buffer size.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Number of entries in the lookup layer's vocabulary; passed to the model as vocab_size.
vocab_size = len(ids_from_chars.get_vocabulary())

# Embedding dimension and number of GRU units.
# better keep this as it is
embedding_dim = 256
rnn_units = 1024

class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct token ids. Used as the embedding's input
      dimension and as the width of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # `super()` already binds this instance, so the base constructor must not
    # receive an explicit `self`; passing one hands it an extra positional
    # argument, which Keras versions with a keyword-only constructor reject.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every position, not only the last.
    # return_state: also return the final state so callers can carry it over.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer outputs raw logits, one per vocabulary entry.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Token ids fed to the embedding layer.
      states: Optional GRU state to start from. When None, the GRU's own
        initial state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The Dense layer's output (logits over the vocabulary for each
      position), or a tuple `(output, states)` when `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x
    
# Build the model from the hyperparameters chosen earlier in this cell.
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

"run" the model to make sure its good to go

In [None]:
# Take a single batch and run it through the model to check the output shape.
# The loop variables stay defined afterwards and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# Print a summary of the model.
model.summary()

this cell is optional

In [None]:
# Sample one id per position from the predictions for the first example in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# A bare expression is only echoed when it is the last statement of a notebook
# cell, so print explicitly to actually show the sampled ids.
print(sampled_indices)

# Decode the input and the sampled predictions back to text for comparison.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

### training

Loss measures how far the model's predictions are from the correct next characters; the lower the loss, the better.

In [None]:
# Sparse categorical cross-entropy. from_logits=True because the model's final
# Dense layer has no activation and therefore outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Evaluate the loss on the example batch.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

# exp(mean loss) should be close to vocab_size before any training; a much
# higher value means the model is badly initialized. Printed explicitly because
# a bare expression in the middle of a cell is never displayed.
print("exp(mean loss):   ", tf.exp(example_batch_mean_loss).numpy())

# Configure the training procedure.
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

this cell configures where the checkpoints will be saved

In [None]:
# Directory where the checkpoints are written.
# If you aren't using Google Colab, replace this with './training_checkpoints'.
checkpoint_dir = '/content/training_checkpoints'
# Checkpoint path inside that directory; the name contains an `{epoch}` placeholder.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Checkpoint callback configured to save only the weights to `checkpoint_prefix`.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

The `EPOCHS` variable controls how many times the AI goes through the dataset.

Now is also a good time to mention that if you're using Google Colab, you should change the runtime to GPU for faster training (optional, but recommended).

In [None]:
# Number of passes over the dataset. Values between 10 and 30 should be good:
# the higher the value, the better the model will be, but training will also be
# slower. With too many epochs the model overfits and its quality gets worse.
EPOCHS = 10

The cell below starts training. If you interrupt it at any point, the model may become corrupted.

In [None]:
# Train for EPOCHS passes over the dataset with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

### wait until the model trains before continuing

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a model and its lookup layers to generate one character per call.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`.
    chars_from_ids: Lookup layer mapping ids to characters.
    ids_from_chars: Lookup layer mapping characters to ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Dense vector holding -inf at the id of "[UNK]" and zeros elsewhere.
    # Adding it to the logits keeps "[UNK]" from ever being sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: Strings to continue; each is split into UTF-8 characters.
      states: Optional state passed through to the wrapped model.

    Returns:
      A tuple `(predicted_chars, states)`: the sampled characters and the
      state returned by the wrapped model.
    """
    # Strings -> characters -> ids, converted from ragged to a dense tensor.
    chars = tf.strings.unicode_split(inputs, 'UTF-8')
    ids = self.ids_from_chars(chars).to_tensor()

    logits, states = self.model(inputs=ids, states=states, return_state=True)

    # Keep only the last position, scale by the temperature, then apply the
    # mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one id per sequence and map it back to a character.
    next_ids = tf.random.categorical(last_logits, num_samples=1)
    next_ids = tf.squeeze(next_ids, axis=-1)
    return self.chars_from_ids(next_ids), states

# Wrap the model and both lookup layers in a single-step generator (default temperature).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

this cell actually runs the model

In [None]:
start = time.time()
# No state yet: the model falls back to the GRU's initial state on the first step.
states = None
# Seed the generation with a single newline character.
next_char = tf.constant(['\n'])
result = [next_char]

# Generate 1000 characters, feeding each sampled character and the returned
# state back into the next step.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and all generated characters, then print the first sequence as
# decoded text together with the elapsed time.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

You can pass multiple strings in the `next_char` variable; each one starts its own sequence, and the whole batch is generated in about the same time as a single sequence.

In [None]:
start = time.time()
# No state yet: the model falls back to the GRU's initial state on the first step.
states = None
# Seed a batch of five sequences, each starting with a newline character.
next_char = tf.constant(['\n', '\n', '\n', '\n', '\n'])
result = [next_char]

# Generate 1000 characters per sequence, feeding each sampled batch of
# characters and the returned state back into the next step.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the pieces and print the resulting string tensor (all five sequences)
# together with the elapsed time.
result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

save the model so you can use it anytime without having to run the training again

In [None]:
# Export the single-step generator to the 'one_step' directory.
tf.saved_model.save(one_step_model, 'one_step')
# Load the exported model back from the same path.
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
# Start without a state and seed the generation with a single newline character.
states = None
next_char = tf.constant(['\n'])
result = [next_char]

# Generate 500 characters with the reloaded model, feeding each sampled
# character and the returned state back into the next step.
for n in range(500):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join all pieces and print the first sequence as decoded UTF-8 text.
print(tf.strings.join(result)[0].numpy().decode("utf-8"))

put the model into an archive for safekeeping

In [None]:
import shutil
# Pack the contents of /content/one_step into a zip archive named "one_step".
# If you are not on Google Colab, change /content/one_step to ./one_step.
shutil.make_archive("one_step", 'zip', "/content/one_step")

## congrats, you now should have a working text model ready for use

the end btw