## Infinite Jest text generation
---

I want to train a character-level sequence model on the text of *Infinite Jest*, and then use it to generate new text.

### 1. Package imports

In [38]:
import os
import datetime
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
%load_ext tensorboard

### 2. Loading and preprocessing the data

In [10]:
path_to_file = "infinite_jest_text.txt"

# Decode explicitly instead of relying on the platform's default encoding, so
# the resulting character vocabulary is the same on every machine.
# NOTE(review): assumes the text file is saved as UTF-8 -- adjust if it is not.
with open(path_to_file, "r", encoding="utf-8") as text_in:
    text = text_in.read()

# The model is character-level: iterating a string yields characters, so the
# vocabulary is the sorted list of unique *characters* in the text.
vocab = sorted(set(text))

In [16]:
# Lookup tables between characters and integer ids:
#   char2idx: character -> position of that character in `vocab`
#   idx2char: array indexed by id that gives back the character
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.asarray(vocab)

# Encode the whole text as one array of integer ids, one id per character
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [17]:
# Number of characters in each input sequence. Chunks are cut one character
# longer (seq_length + 1) so the target can be the input shifted by one.
seq_length = 100

# How many full (seq_length + 1)-character chunks fit in the text
examples_per_epoch = len(text) // (seq_length + 1)

# TensorFlow Dataset that yields the encoded text one integer id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(tensors=text_as_int)

In [18]:
# Group consecutive character ids into chunks of seq_length + 1 elements;
# a trailing chunk that would be shorter than that is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [20]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. 'abcde' -> ('abcd', 'bcde').

    Args:
        chunk: Any sliceable sequence (string, list, tensor, ...).

    Returns:
        Tuple ``(input_text, target_text)`` of the two overlapping slices.
    """
    all_but_last = slice(None, -1)
    all_but_first = slice(1, None)
    return chunk[all_but_last], chunk[all_but_first]

# Turn every chunk into an (input, target) pair: the target is the input
# shifted by one character, since the task is next-character prediction.
dataset = sequences.map(map_func=split_input_target)

In [27]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself, so this cell is idempotent: re-running it no longer batches an
# already-batched dataset.
# Order of operations: split each chunk into (input, target), shuffle the
# pairs using a buffer of BUFFER_SIZE elements, then group them into batches
# of BATCH_SIZE, dropping a final batch that would be smaller.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [28]:
# Sanity check: show the dataset's element shapes and dtypes
print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


### 3. Creating a model
- structure
- compilation


In [29]:
# Number of distinct characters; passed to build_model as vocab_size
vocab_size = len(vocab)

# Width of the learned character embedding vectors
embedding_dim = 256

# Number of units in the recurrent (GRU) layer
rnn_units = 512

In [30]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the next-character model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct ids; used as the embedding's input
            dimension and as the number of units of the final Dense layer.
        embedding_dim: Size of the embedding vectors.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length dimension is left as None.

    Returns:
        An uncompiled ``keras.Sequential`` model. The Dense layer has no
        activation, so it emits one raw score per vocabulary entry at every
        time step.
    """
    model = keras.Sequential()

    # Map integer ids to dense vectors; the batch dimension is fixed here.
    model.add(
        keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            batch_input_shape=[batch_size, None],
        )
    )

    # Return an output for every time step and keep state across batches.
    model.add(
        keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer="glorot_uniform",
        )
    )

    # Project each time step's GRU output onto the vocabulary.
    model.add(keras.layers.Dense(vocab_size))

    return model

In [31]:
# Instantiate the network with the hyperparameters and batch size set above
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [32]:
# Print a per-layer table of output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           34560     
_________________________________________________________________
gru (GRU)                    (64, None, 512)           1182720   
_________________________________________________________________
dense (Dense)                (64, None, 135)           69255     
Total params: 1,286,535
Trainable params: 1,286,535
Non-trainable params: 0
_________________________________________________________________


### 4. Fitting the model to the data

In [35]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between labels and raw model outputs.

    Args:
        labels: Ground-truth values, forwarded as ``y_true``.
        logits: Model outputs, forwarded as ``y_pred``; they are treated as
            unnormalised scores because ``from_logits=True`` is passed.

    Returns:
        Whatever ``keras.losses.sparse_categorical_crossentropy`` returns for
        these arguments.
    """
    crossentropy = keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return crossentropy

In [36]:
# The targets are integer ids (not one-hot vectors), so request the sparse
# accuracy metric explicitly. The generic "accuracy" string is resolved
# heuristically by Keras, and with a custom loss function some TF 2.x releases
# may pick the non-sparse variant. The metric is named "accuracy" so the key
# reported in logs and in `history` stays the same.
model.compile(
    loss=loss,
    optimizer="adam",
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
)

In [None]:
# Number of full passes over the training dataset
EPOCHS = 10

# Each run logs to its own timestamped directory under logs/fit/
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"

# TensorBoard logging, with histograms computed every epoch
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# Train the model; `history` keeps the object returned by fit()
history = model.fit(
    x=dataset,
    epochs=EPOCHS,
    callbacks=[tensorboard_callback],
)

Epoch 1/10

### 5. Evaluating the model and visualising results

### 6. Hyperparameter optimisation

### 7. Sequence generation