## Infinite Jest text generation
---

I want to train a sequence model on a dataset for Infinite Jest. Then, I want to have it generate text.

### 1. Package imports

In [19]:
import os
import datetime
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


### 2. Loading and preprocessing the data

In [21]:
path_to_file = "infinite_jest_text.txt"
with open(path_to_file, "r") as text_in:
    text = text_in.read()
    
vocab = sorted(set(text)) # get unique words, sort alphabetically

In [22]:
# for each character in the text, associate an integer value
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# represent the text as an array of integers
text_as_int = np.array([char2idx[c] for c in text])

In [23]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

# create a Tensorflow Dataset type from the text-as-array-of-ints
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [24]:
# creates batches from this dataset, drops the last batch if too small
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [25]:
def split_input_target(chunk):
    """Takes a string 'abcde' and returns 'abcd', 'bcde'"""
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

# our dataset will have batches of these types of split
# strings, because we want to predict the next character
dataset = sequences.map(split_input_target)

In [26]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# shuffle the dataset batches
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [27]:
print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


### 3. Creating a model
- structure
- compilation


In [28]:
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 512

In [29]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]), 
        keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer="glorot_uniform"),
        keras.layers.Dense(vocab_size)
    ])
    
    return model

In [30]:
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [31]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           34560     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 512)           1182720   
_________________________________________________________________
dense_1 (Dense)              (64, None, 135)           69255     
Total params: 1,286,535
Trainable params: 1,286,535
Non-trainable params: 0
_________________________________________________________________


### 4. Fitting the model to the data

In [32]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [33]:
model.compile(loss=loss,
             optimizer="adam", 
             metrics=["accuracy"])

In [34]:
EPOCHS = 1
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(dataset, 
                   epochs=EPOCHS)



### 5. Evaluating the model and visualising results

In [35]:
plt(history.history["accuracy"])
plt(history.history["val_accuracy"])
plt.title("Model accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(["Training", "Validation"], loc="upper left")
plt.show()

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Training", "Validation"], loc="upper left")
plt.show()

TypeError: 'module' object is not callable

### 6. Hyperparameter optimisation

### 7. Sequence generation