In [1]:
import tensorflow as tf
import numpy as np
import collections
import os

# Import data

In [2]:
def get_data(data_path=None):
    """Read a text file and encode it character by character as integer ids.

    Ids are assigned by descending character frequency (ties are broken by
    the character itself), so the most frequent character receives id 0.

    Args:
        data_path: path of the text file to read.

    Returns:
        A 4-tuple ``(data, data_in_ids, char_to_id, id_to_char)``:
        data: list of the characters of the file, in file order.
        data_in_ids: list of the integer id of each character in ``data``.
        char_to_id: dict mapping each distinct character to its id.
        id_to_char: dict mapping each id back to its character.
        For an empty file all four containers are empty.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(data_path, "r") as text_file:
        data = list(text_file.read())

    counter = collections.Counter(data)
    # Most frequent first; the character is the tie-breaker so ids are stable.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # A comprehension (rather than unpacking zip(*count_pairs)) also works
    # when the file is empty and there are no pairs to unzip.
    chars = [char for char, _ in count_pairs]
    char_to_id = {c: i for i, c in enumerate(chars)}
    id_to_char = {i: c for i, c in enumerate(chars)}

    data_in_ids = [char_to_id[char] for char in data]
    return data, data_in_ids, char_to_id, id_to_char

In [4]:
# Load the training corpus: get_data returns the characters, their integer ids,
# and the two lookup tables between characters and ids.
raw_data_chars, raw_data_ids, char_to_id, id_to_char = get_data("bible_1000.txt")
# Vocabulary size = number of distinct characters found in the corpus.
n_chars = len(char_to_id)
print(n_chars)

60


# Prepare training data

### Create examples

In [5]:
# Length, in characters, of each example's input sequence and of its response sequence.
time_steps = 100

# Create a tf.data.Dataset with one element per character id.
dataset = tf.data.Dataset.from_tensor_slices(raw_data_ids)

# Create examples.
# NOTE: "batch" here groups consecutive characters into ONE example; it is not a batch of examples.
# Each example holds time_steps + 1 ids so it can later be split into an input and a
# response shifted by one position, each time_steps long.
# drop_remainder=True discards a trailing chunk shorter than time_steps + 1.
examples = dataset.batch(time_steps+1, drop_remainder=True)

### Map data to inputs and responses

In [6]:
def input_and_response(example):
    """Split one example into an (input, response) pair offset by one step.

    Args:
        example: a sliceable sequence (here, one chunk of character ids).

    Returns:
        A tuple ``(example_input, example_response)`` where the input is
        everything except the last element and the response is everything
        except the first, so response[i] is the element following input[i].
    """
    all_but_last = slice(None, -1)
    all_but_first = slice(1, None)
    return example[all_but_last], example[all_but_first]

# Turn every example into its (input, response) pair.
mapped_dataset = examples.map(input_and_response)

### Shuffle and create batches

In [7]:
# Number of examples per training batch.
BATCH_SIZE = 32
# Size of the buffer that Dataset.shuffle samples from.
BUFFER_SIZE = 10000
# Shuffle the examples, then group them into batches of BATCH_SIZE.
# drop_remainder=True keeps every batch at exactly BATCH_SIZE examples, which matches
# the fixed batch size the model is built with (batch_input_shape=[batch_size, None]).
shuffled_dataset = mapped_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Build model

### Parameters

In [8]:
# Dimension of the embedding layer's output vectors.
emb_size = 256

# Number of units in the LSTM layer.
rnn_units = 1024

# Max number of iterations.
# NOTE(review): not referenced by any cell visible in this notebook (training is
# driven by EPOCHS) -- confirm whether it is still needed.
iterations = 500

# Learning rate.
# NOTE(review): not referenced by any cell visible in this notebook; the model is
# compiled with optimizer='adam', which does not read this value -- confirm intent.
learning_rate = 0.1

### Set up RNN model

In [9]:
def build_model(n_chars, emb_size, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        n_chars: vocabulary size; used as the embedding input dimension and as
            the number of units of the final Dense layer.
        emb_size: dimension of the embedding vectors.
        rnn_units: number of LSTM units.
        batch_size: fixed batch size baked into the model's input shape
            (the sequence length is left as None, i.e. variable).

    Returns:
        An uncompiled tf.keras.Sequential model whose Dense layer has no
        activation, so it outputs one raw score per character at every step.
    """
    embedding = tf.keras.layers.Embedding(
        n_chars, emb_size, batch_input_shape=[batch_size, None])

    # return_sequences=True emits an output for every time step;
    # stateful=True makes the layer keep its state between calls until reset.
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')

    projection = tf.keras.layers.Dense(n_chars)

    return tf.keras.Sequential([embedding, recurrent, projection])

# Training model: built with the training batch size, so it only accepts
# batches of exactly BATCH_SIZE examples.
model = build_model(
    n_chars=n_chars, 
    emb_size=emb_size,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

### Define loss and optimizer

In [10]:
def loss(responses, logits):
    """Sparse categorical cross-entropy between target ids and raw model scores.

    Args:
        responses: integer ids of the target characters.
        logits: unnormalised scores produced by the model; from_logits=True
            tells the loss that no softmax has been applied to them.

    Returns:
        The result of tf.keras.losses.sparse_categorical_crossentropy for
        these inputs.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(responses, logits, from_logits=True)

# Compile with the Adam optimizer selected by name (its default settings) and the custom loss above.
model.compile(optimizer='adam', loss=loss)

# Train model

### Configure checkpoints

In [11]:
# Directory where the checkpoints will be saved (relative to the working directory).
checkpoint_dir = './training_checkpoints'

# Name template of the checkpoint files; "{epoch}" is a placeholder left for the
# ModelCheckpoint callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores the weights only,
# not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Train model

In [12]:
# Number of passes over the shuffled dataset.
EPOCHS=1

# Train the model; the checkpoint callback writes weights into checkpoint_dir.
history = model.fit(shuffled_dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

# Print the layer-by-layer summary of the trained model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, None, 256)           15360     
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (32, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (32, None, 60)            61500     
Total params: 5,323,836
Trainable params: 5,323,836
Non-trainable params: 0
_________________________________________________________________


# Generate new text

### Rebuild model with batch size = 1

In [13]:
# Rebuild the same architecture with batch size 1 so that a single seed sequence
# can be fed at generation time.
# NOTE: this rebinds `model`; the training model built with BATCH_SIZE is no longer
# reachable under that name after this cell runs.
model = build_model(
    n_chars=n_chars, 
    emb_size=emb_size,
    rnn_units=rnn_units,
    batch_size=1)

# Restore the weights from the most recent checkpoint in checkpoint_dir.
# NOTE(review): presumably tf.train.latest_checkpoint returns None when the directory
# holds no checkpoint, which would make load_weights fail -- run the training cell first.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with batch size 1 and a variable sequence length.
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            15360     
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 60)             61500     
Total params: 5,323,836
Trainable params: 5,323,836
Non-trainable params: 0
_________________________________________________________________


### Prepare seed text

In [14]:
# Reuse get_data only for its list of characters; the ids and lookup tables built
# from the seed file are discarded.
seed_chars, _, _, _ = get_data("bible_seed.txt")
# Keep at most the first time_steps characters of the seed file.
seed_chars = seed_chars[:time_steps]
seed_sentence = "".join(seed_chars)
# NOTE(review): seed_sentence is printed here but not passed to writer in the visible
# cells (writer is called with the literal "Bible") -- confirm intent.
print(seed_sentence)

Eliel, and Obed, and Jasiel the Mesobaite. 
Now these are they that came to David to Ziklag, while h


### Define writer

In [15]:
def writer(model, seed_text, written_len, temperature):
    """Generate text one character at a time, starting from ``seed_text``.

    The seed is fed to the model in one call; afterwards each sampled character
    is fed back as the sole input of the next call, relying on the model
    keeping its recurrent state between calls.

    Args:
        model: callable model with a ``reset_states`` method that takes ids of
            shape (1, T) and returns scores of shape (1, T, vocabulary size),
            as produced by build_model with batch_size=1.
        seed_text: non-empty string; every character must be a key of the
            notebook-level ``char_to_id`` mapping.
        written_len: number of characters to generate.
        temperature: positive number the scores are divided by before
            sampling; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: if ``seed_text`` is empty or ``temperature`` is not positive.
        KeyError: if ``seed_text`` contains a character missing from ``char_to_id``.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        # Dividing by zero yields inf/nan scores and a negative value inverts
        # the distribution; neither is a meaningful sampling temperature.
        raise ValueError("temperature must be a positive number")

    # Convert seed text to a (1, len(seed_text)) batch of ids.
    input_ids = tf.expand_dims([char_to_id[c] for c in seed_text], 0)

    # Storage for written text.
    written_text = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(written_len):
        # Drop the batch axis: (1, T, vocab) -> (T, vocab).
        scores = tf.squeeze(model(input_ids), 0)

        # Only the final time step predicts the next character, so sample from
        # that row alone instead of sampling every step and discarding the rest.
        last_scores = scores[-1:, :] / temperature
        sampled = tf.random.categorical(last_scores, num_samples=1)
        # Plain Python int: a consistent dict key and tensor element type.
        pred_id = int(sampled[0, 0].numpy())

        # Pass the predicted id as the input of the next prediction.
        input_ids = tf.expand_dims([pred_id], 0)

        written_text.append(id_to_char[pred_id])

    return seed_text + "".join(written_text)

In [16]:
# Generate 100 characters after the seed "Bible"; temperature=1.0 leaves the model's scores unscaled.
writer(model, "Bible", 100, temperature=1.0)

'BibleLlMdclnpAanr e  e,wbo Cn aandpeauweia , JIwe tR p,D a. nld t eeontotthuAnc s thi twf,thRrh  famalh  '