In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import numpy as np
import pandas as pd

In [2]:
textfile = ("C:/Users/wilde/Downloads/Python_Certification/house_script.txt")
data = open(textfile, 'rb').read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(textfile)))

Length of text: 62 characters


In [3]:
# Take a look at the first 250 characters in text
print(data[:250])

[Rebecca riding bus and then running into a school. Meets up with Melanie] Melanie: Why are you late? Rebecca: You’re not going to like the answer. Melanie: I already know the answer. Rebecca: I missed the bus. Melanie: I don’t doubt it, no bus stops


In [4]:
vocab = sorted(set(data))
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(data):
  return np.array([char2idx[c] for c in data])

text_as_int = text_to_int(data)

In [5]:
# lets look at how part of our text is encoded
print("Text:", data[:15])
print("Encoded:", text_to_int(data[:15]))

Text: [Rebecca riding
Encoded: [47 40 53 50 53 51 51 49  0 66 57 52 57 62 55]


In [6]:
def int_to_text(ints):
  try:
    ints = ints.numpy()
  except:
    pass
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:15]))

[Rebecca riding


In [7]:
seq_length = 100  # length of sequence for a training example
examples_per_epoch = len(data)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [8]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [9]:
def split_input_target(chunk):  # for the example: hello
    input_text = chunk[:-1]  # hell
    target_text = chunk[1:]  # ello
    return input_text, target_text  # hell, ello

dataset = sequences.map(split_input_target)  # we use map to apply the above function to every entry

In [10]:
for x, y in dataset.take(2):
  print("\n\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT")
  print(int_to_text(y))



EXAMPLE

INPUT
[Rebecca riding bus and then running into a school. Meets up with Melanie] Melanie: Why are you late

OUTPUT
Rebecca riding bus and then running into a school. Meets up with Melanie] Melanie: Why are you late?


EXAMPLE

INPUT
 Rebecca: You’re not going to like the answer. Melanie: I already know the answer. Rebecca: I missed

OUTPUT
Rebecca: You’re not going to like the answer. Melanie: I already know the answer. Rebecca: I missed 


In [11]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # vocab is number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data1 = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           20736     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 81)            83025     
Total params: 5,350,737
Trainable params: 5,350,737
Non-trainable params: 0
_________________________________________________________________


In [13]:
for input_example_batch, target_example_batch in data1.take(1):
  example_batch_predictions = model(input_example_batch)  # ask our model for a prediction on our first batch of training data (64 entries)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")  # print out the output shape

(64, 100, 81) # (batch_size, sequence_length, vocab_size)


In [14]:
# we can see that the predicition is an array of 64 arrays, one for each entry in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[-2.23537069e-03  4.67018457e-03 -4.21498576e-03 ... -2.91747274e-04
    8.06665979e-04  2.68795236e-04]
  [ 1.74955465e-04  2.34455056e-03 -4.66846512e-04 ... -4.46083210e-03
    2.12008599e-04 -3.38360062e-03]
  [ 4.63488884e-03  6.24307198e-04 -1.91560714e-04 ...  5.23102842e-03
   -4.48480132e-05 -3.85127449e-03]
  ...
  [ 3.72150471e-03 -7.94675667e-03  1.52940745e-03 ... -3.52413696e-03
   -3.53572192e-03  6.90409914e-04]
  [ 2.14113574e-03 -7.21795764e-03  9.16266628e-03 ... -4.24564490e-03
   -1.02019282e-02  4.81819734e-04]
  [ 5.73389884e-03  7.41083757e-04 -2.34509353e-04 ... -2.30242871e-03
   -1.28783621e-02  6.38580509e-03]]

 [[ 3.74242919e-03 -4.16364055e-03  2.26221071e-03 ... -4.85948432e-04
    9.38715413e-04  2.32780236e-03]
  [ 4.41983901e-03 -1.00536030e-02  1.53552811e-03 ... -6.79928996e-03
    5.62956417e-03 -1.67085533e-03]
  [ 1.89662655e-03 -6.68851426e-03 -4.71756933e-03 ... -3.83423106e-03
    5.15405554e-03 -1.22145680e-03]
  ...
  [ 1.422

In [15]:
# lets examine one prediction
pred = example_batch_predictions[0]
print(len(pred))
print(pred)
# notice this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step

100
tf.Tensor(
[[-2.2353707e-03  4.6701846e-03 -4.2149858e-03 ... -2.9174727e-04
   8.0666598e-04  2.6879524e-04]
 [ 1.7495546e-04  2.3445506e-03 -4.6684651e-04 ... -4.4608321e-03
   2.1200860e-04 -3.3836006e-03]
 [ 4.6348888e-03  6.2430720e-04 -1.9156071e-04 ...  5.2310284e-03
  -4.4848013e-05 -3.8512745e-03]
 ...
 [ 3.7215047e-03 -7.9467567e-03  1.5294075e-03 ... -3.5241370e-03
  -3.5357219e-03  6.9040991e-04]
 [ 2.1411357e-03 -7.2179576e-03  9.1626663e-03 ... -4.2456449e-03
  -1.0201928e-02  4.8181973e-04]
 [ 5.7338988e-03  7.4108376e-04 -2.3450935e-04 ... -2.3024287e-03
  -1.2878362e-02  6.3858051e-03]], shape=(100, 81), dtype=float32)


In [16]:
# and finally we'll look at a prediction at the first timestep
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
# and of course its 65 values representing the probabillity of each character occuring next

81
tf.Tensor(
[-2.2353707e-03  4.6701846e-03 -4.2149858e-03 -1.6093720e-05
 -4.2813428e-04 -4.9937870e-03 -2.8394896e-03 -4.9471506e-03
 -2.8028712e-04 -9.6414331e-04 -3.9227563e-03 -2.5933625e-03
 -1.1733677e-03 -1.3221469e-03 -1.6474854e-03 -9.7166549e-04
  1.3041871e-03 -1.4191106e-04  1.4095142e-04  3.5947065e-03
 -2.9467803e-04  2.5010030e-03 -1.1661400e-03  7.8250244e-03
 -3.3105249e-03  3.0399724e-03  1.6730226e-04 -1.6808566e-03
  5.0318446e-03 -3.3447257e-04 -1.9083279e-03 -5.0799213e-03
  3.2489426e-03  2.8114188e-03 -6.2018437e-03  2.2074319e-03
 -3.8723093e-03  3.4873085e-03  1.1366002e-03 -2.1958142e-03
 -1.9135608e-03  4.6272529e-03  2.4673515e-03 -1.1462183e-03
 -1.3704087e-03  1.6071012e-03 -7.2384824e-04 -1.7602514e-03
 -8.9801662e-03 -7.6061510e-03 -1.5912405e-03 -4.6333945e-03
  2.0010490e-03 -1.4097665e-03  1.5476961e-03 -1.3545397e-03
  1.0124899e-03  9.4640302e-04 -2.4461865e-03  2.0117261e-03
 -1.0872702e-04 -1.1217548e-03  2.6374483e-03  5.6375028e-03
 -4.269133

In [17]:
# If we want to determine the predicted character we need to sample the output distribution (pick a value based on probabillity)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars  # and this is what the model predicted for training sequence 1

'kw1;aeqP][H;}5}Uj9y?Cnx]qx}CFT?e,gEE,5’zh-T;1cUCLxmr*”dIO‘$J$%NTi.R:Tdh?DV8g%”W2L;,o3F6WVv5wTU7w($7]'

In [18]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [19]:
model.compile(optimizer='adam', loss=loss)

In [20]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [21]:
history = model.fit(data1, epochs=50, callbacks=[checkpoint_callback]) #fits the model

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [22]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [23]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) # creating checkpoints 
model.build(tf.TensorShape([1, None]))

In [24]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 800

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
    
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [30]:
inp = input("Type a starting string: ") # Starting string for the model
print(generate_text(model, inp)) # prints the predicted text

Type a starting string:  House:


House: You have! House: Bven a wormill. soured I canese twick hove has wakes it’s hine! House: She’s not batinad, agy the maspertion the erpapitt die.. [Cut oo a tum” ofen the frodiman to fif jom. gitter is you’re gring and atages. Foreman: I’m pert if bett digntir. Cameron: Yeah, he’s about mide some gonnateres] [Cut to cofl. Dreatiny. House: There’s not liago... House: No, brobation. [House just like you vesy and gotse think I fach the is andengerondan a mory mind. Wimson: Cuddy: Orase you’ve do. It’s o tuld swet if the exis touss to gurarie am probame. Ferek wese goond no. [Cut to House alleigots and peasepat.]] House: No, you mayt and megut stiof staiclled of the insented cause thin must evon into to hen ag, iln’t like you candias into betso hoppliwe day. “Couthing! House liging to mase anyt
