In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import numpy as np
import os
import time

In [3]:
path_to_file = './SMSSpamCollection.txt'
# Read raw bytes and decode explicitly so the result does not depend on the
# platform's default encoding or newline translation. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print('Length of text: {} characters'.format(len(text)))

Length of text: 477203 characters


In [4]:
# Preview the first 250 characters of the decoded corpus.
print(text[:250])

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive


In [5]:
# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

118 unique characters


In [15]:
# Lookup tables between characters and integer ids; ids follow the order of `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)
# The whole corpus encoded as one integer id per character.
int_text = np.array([char2idx[symbol] for symbol in text])


In [16]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Each training example consumes seq_length + 1 characters (the input plus the
# one-step-shifted target), so count whole examples rather than characters.
examples_per_epoch = len(text) // (seq_length + 1)
# One dataset element per character id.
char_dataset = tf.data.Dataset.from_tensor_slices(int_text)

# Decode the first five elements back to characters as a sanity check.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

h
a
m
	
G


In [17]:
# Group the character stream into fixed-size chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text as a sanity check.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there go'
't amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final'
" tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075o"
"ver18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, h"
"e lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'"


In [18]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element of `chunk` except the last; the target is
    every element except the first, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(split_input_target)

In [19]:
# Decode and print the first (input, target) pair. The loop variables stay
# bound after the loop and are read again by the step-by-step printout below.
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g'
Target data: 'am\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there go'


In [20]:
# Walk through the first five positions of the example pair, printing the
# input id and the expected output id (each with its decoded character).
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 73 ('h')
  expected output: 66 ('a')
Step    1
  input: 66 ('a')
  expected output: 78 ('m')
Step    2
  input: 78 ('m')
  expected output: 0 ('\t')
Step    3
  input: 0 ('\t')
  expected output: 41 ('G')
Step    4
  input: 41 ('G')
  expected output: 80 ('o')


In [21]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-assigning
# `dataset` in terms of itself, so that re-running this cell does not batch
# an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [22]:
# Length of the vocabulary in chars; build_model uses it for both the
# embedding input size and the width of the final Dense layer.
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [1]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct ids; sizes the embedding table and
            the output layer.
        embedding_dim: width of each embedding vector.
        rnn_units: number of GRU units.
        batch_size: fixed batch size baked into the model's input shape.

    Returns:
        An uncompiled tf.keras.Sequential model. The Dense layer has no
        activation, so it emits one raw score per vocabulary entry at every
        time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [24]:
# Build the training model. vocab_size comes from the hyper-parameter cell so
# this call and the batch_size=1 rebuild used for generation share one value.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [25]:
# Run one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 118) # (batch_size, sequence_length, vocab_size)


In [26]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           30208     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 118)           120950    
Total params: 4,089,462
Trainable params: 4,089,462
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Sample one id per time step for the first example of the batch; the model
# output is passed directly as the logits argument of tf.random.categorical.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [28]:
# Display the sampled ids.
sampled_indices

array([  1,  55,  48,  79,   7,  77,  47,  10,  40,  49,  87,  59,  64,
        87,  40,  51, 101,  59, 115,  72, 114,  18,  60,  96,  85,  39,
        24,  44, 104,  77,  66,  64, 116,  78,   4,  31,  18,  95,  21,
        25,  23,  51, 111, 109,  28,   6,  70,  56,  70,  82,  49,  85,
        81,  32, 113,  61,  93,   4,  26,  17,  82,  29,  73, 108,  71,
         2,  12,  97,  59,  72,  25, 112,  37,  44,   0,  53,  13,  24,
        11,  62,  60, 112,  69,  82,  86,  44,  62,  38,   8,  83,  98,
       101,  65,  46,  63,  25,  96,  45,  15,  50])

In [29]:
# Decode the first input sequence of the batch and the ids sampled from the
# untrained model, to compare them side by side.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 "spam\tCollect your VALENTINE'S weekend to PARIS inc Flight & Hotel + £200 Prize guaranteed! Text: PAR"

Next Char Predictions: 
 '\nUNn%lM(FOvY^vFQ»Y┾g…0Z\x93tE6Jèla^〨m"=0\x92375Q‘–:$eVeqOtp>“[~"8/q;hüf *\x94Yg7’CJ\tS+6)\\Z’dquJ\\D&r\x96»_L]7\x93K-P'


In [30]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class ids.
        logits: un-normalised scores; `from_logits=True` tells Keras to
            apply the softmax itself.

    Returns:
        Whatever tf.keras.losses.sparse_categorical_crossentropy returns for
        these inputs (no reduction is applied here).
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch from the untrained model and report
# its mean over all elements as a single number.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 118)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.7710176


In [31]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer='adam', loss=loss)

In [32]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left unformatted
# here and handed to the callback as part of the file path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True is set so only the
# model weights are written.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [33]:
# Number of passes over the training dataset.
EPOCHS=10

# Train the model with the checkpoint callback attached; the return value of
# fit is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train for 73 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# Show the path of the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [35]:
# Rebuild the same architecture with batch_size=1 for text generation.
# Note that this rebinds `model`, replacing the training model built earlier.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load the weights stored in the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build the model for an input of shape (1, None).
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            30208     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 118)            120950    
Total params: 4,089,462
Trainable params: 4,089,462
Non-trainable params: 0
_________________________________________________________________


In [38]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time from a trained model.

    Evaluation step: the seed is fed to the model once, then each sampled
    character is fed back as the next input.

    Args:
        model: model built with batch size 1 that maps a batch of character
            ids to per-step logits over the vocabulary.
        start_string: non-empty seed text; every character must be a key of
            the module-level `char2idx` table.
        num_generate: number of characters to generate after the seed.
        temperature: divisor applied to the logits before sampling. Low
            temperatures result in more predictable text, higher
            temperatures in more surprising text.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character that is not in
            `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters, joined into a string at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample from the categorical distribution;
        # only the sample for the last time step is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # The sampled character becomes the next input to the model.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

# Generate and print a sample seeded with the "ham" label.
# NOTE(review): the corpus separates label and message with a tab ('\t'),
# while this seed has spaces after "ham" — confirm whether a tab was intended.
print(generate_text(model, start_string=u"ham     "))

ham     with we gettin caris, my po I'll be
ham	I jook tosts dy at keep late has how it anso.
ham	Well I was to i'm out and af offe ...
ham	Cood getting me aft fant i looon?. Who you having down them that ? U can i think of the weeks ok lack, they hows to anytay ok a too patt more . I will take  pint so then cam. I get back?
spam	Hey aftoouna sodry out. Still this suz Holiday or ifong day. At you so much i've remam	shopai it there is you something is ur loah!. My call me... U chenight today. COHAPSONT LOVE 0808684812787 ESCANE! D WHAN ferm) knows i have? And have at already the good here await college
ham	Suve YOU--BIOU. I d onling everything 8007 =fue chat.ha Vadiep) WC lls 89
ham	He gas same the puying my satch sounds pifing him, to day that Diy u life to noing and they dnow...
ham	U points. Sad a gaol back team
ham	Sant that you guys me 4 now.
spam	Hearlo! Call 090600191150 nward DI
spam	XHIs to tell there is a $950).. Call 09051202605 
ham	What argict you to pale at from 8150min't 