In [1]:
# import
import tensorflow as tf
import numpy as np
import os

In [2]:
# Location of the transcript corpus, relative to the notebook's working directory.
path_to_file = "Data/Friends_Transcript.txt"

In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. Binary mode
# leaves the file's line endings exactly as stored, and the context manager
# closes the file handle as soon as the read is done instead of leaking it.
with open(path_to_file, 'rb') as transcript_file:
    text = transcript_file.read().decode(encoding='utf-8')
print('Length of Character {}'.format(len(text)))

Length of Character 4965729


In [4]:
# Preview the first 250 characters of the corpus.
print(text[:250])

THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)
Written by: Marta Kauffman & David Crane
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]
Monica: There's nothing to tell! He's just some guy I work with!
J


In [5]:
# Character-level vocabulary: every distinct character of the corpus, once,
# in sorted order.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

95 unique characters


In [6]:
# Lookup tables between characters and integer ids.
#   char2idx: character -> its position in the sorted vocabulary
#   idx2char: array indexed by id -> character (allows decoding many ids at once)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as one integer id per character.
text_as_int = np.array([char2idx[ch] for ch in text])

In [7]:
# Sanity check: the first ten characters next to their integer encoding.
print("{} is mapped to {}".format(text[:10], text_as_int[:10]))

THE ONE WH is mapped to [54 42 39  2 49 48 39  2 57 42]


In [8]:
# Number of characters in each input sequence fed to the RNN.
seq_length =100
# Every training example is cut from a chunk of seq_length + 1 characters
# (the input plus its one-character-shifted target), so this is the number of
# whole chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)
print(examples_per_epoch)

49165


In [9]:
# Wrap the encoded corpus in a tf.data.Dataset.
# from_tensor_slices slices the array along its first axis, so the dataset
# yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [10]:
# take(5) gives a dataset holding only the first five elements;
# decode each id back to its character to check the pipeline.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

T
H
E
 
O


In [11]:
# Group consecutive character ids into chunks of seq_length + 1 ids each.
# drop_remainder=True discards a final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and show the first five chunks.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & D'
"avid Crane\r\n[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\r\nMonica: There's not"
"hing to tell! He's just some guy I work with!\r\nJoey: C'mon, you're going out with the guy! There's go"
'tta be something wrong with him!\r\nChandler: All right Joey, be nice. So does he have a hump? A hump a'
"nd a hairpiece?\r\nPhoebe: Wait, does he eat chalk?\r\n(They all stare, bemused.)\r\nPhoebe: Just, 'cause, "


In [12]:
def split_input_target(chunk):
    """
    Split one chunk into an (input, target) pair for next-character prediction.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that comes
    right after input[i] in the original chunk.
    """
    input_part = chunk[:-1]
    target_part = chunk[1:]
    return input_part, target_part

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [13]:
# Show the first (input, target) pair decoded back to text; the target is the
# input shifted by one character.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & '
Target data: 'HE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & D'


In [14]:
# Walk through the first five timesteps of that example: at each step the
# model receives input_idx and is expected to predict target_idx.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 54 ('T')
  expected output: 42 ('H')
Step    1
  input: 42 ('H')
  expected output: 39 ('E')
Step    2
  input: 39 ('E')
  expected output: 2 (' ')
Step    3
  input: 2 (' ')
  expected output: 49 ('O')
Step    4
  input: 49 ('O')
  expected output: 48 ('N')


In [15]:
# Shuffle the (input, target) pairs and pack them into fixed-size batches.
# BATCH_SIZE: number of sequences per training batch.
BATCH_SIZE = 64
# BUFFER_SIZE: size of the shuffle buffer; tf.data shuffles within a buffer of
# this many elements rather than loading the whole dataset at once.
BUFFER_SIZE = 10000

# drop_remainder=True keeps every batch at exactly BATCH_SIZE sequences.
# NOTE: this rebinds `dataset`, so the cell is not idempotent. Re-running it
# without first re-running the cell that maps split_input_target would shuffle
# and batch the already-batched dataset.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [16]:
# Constants for the model
# vocab_size: number of distinct characters (embedding input size and output layer width)
vocab_size = len(vocab)
# embedding_dim: length of the vector each character id is embedded into
embedding_dim = 256
# rnn_units: number of units in the recurrent (GRU) layer
rnn_units = 1024

#TODO: Use LSTM Layer also

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding's
            input dimension and as the width of the final Dense layer.
        embedding_dim: size of the vector each character id is embedded into.
        rnn_units: number of units in the GRU layer.
        batch_size: batch size fixed into the model's input shape; the
            sequence length is left unspecified (None).

    Returns:
        An uncompiled tf.keras.Sequential model. The final Dense layer has no
        activation, so it emits one raw score per vocabulary entry at every
        timestep.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every timestep, not just the last.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [32]:
# Training model: built with BATCH_SIZE so its fixed batch dimension matches
# the batches produced by `dataset`.
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [33]:
# Smoke test: run a single batch through the model and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 95) # (batch_size, sequence_length, vocab_size)


In [34]:
# List the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           24320     
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_2 (Dense)              (64, None, 95)            97375     
Total params: 4,059,999
Trainable params: 4,059,999
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Pick the next character at random according to its predicted probability,
# for every timestep of the first sequence in the batch.
# tf.random.categorical returns one column per requested sample, so with
# num_samples=1 the result has a trailing dimension of size 1; tf.squeeze with
# axis=-1 removes that dimension, leaving a flat array with one id per timestep.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices


array([78, 27, 78, 29, 85, 70, 19, 39, 78, 17,  9,  9,  4, 82, 75, 84, 32,
       55, 93, 16, 49, 58, 86, 61, 37, 33, 88, 35, 17, 86,  6,  1, 91, 31,
       27, 55, 54, 93, 27, 68, 42, 10, 93, 89, 70, 70, 66, 11, 25, 82, 61,
       56, 42, 77, 69, 37, 88, 23, 72, 86, 15, 48, 29, 47, 72,  3, 10, 57,
       83, 54, 86, 19, 44, 82, 76,  4,  9,  4, 35, 80, 18, 46,  1, 19, 48,
        1, 62,  4, 35, 13, 94, 15, 84,  6, 89, 40, 36,  1, 48, 34],
      dtype=int64)

In [22]:
# Decode the sampled ids back to characters and show them next to the input
# sequence they were predicted from.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'I mean (he sprays Phoebe with crumbs) \r\nPhoebe: Okay, I asked for the news, not the weather.\r\nMonica'

Next Char Predictions: 
 'm9m;te1Em/\'\'"qjs>U|.OXu[C?wA/u$\rz=9UT|9cH(|xeea)7q[VHldCw5gu-N;Mg!(WrTu1Jqk"\'"Ao0L\r1N\r]"A+}-s$xFB\rN@'


In [23]:
def loss(labels, logits):
  """
  Sparse categorical cross-entropy between integer labels and model outputs.

  from_logits=True tells Keras that `logits` holds raw, unnormalised scores
  rather than probabilities.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

In [35]:
# Configure training: Adam optimizer with the cross-entropy loss defined above.
model.compile(optimizer='adam', loss=loss)

In [36]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; Keras substitutes the epoch number for {epoch}
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the whole model) while training.
# NOTE(review): every model.fit call numbers its epochs from 1, so a resumed
# run writes ckpt_1, ckpt_2, ... again and overwrites the files of the same
# name left by an earlier run.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [39]:
# Resume from previously saved weights (presumably epoch 18 of an earlier run,
# going by the ckpt_{epoch} naming — confirm).
model.load_weights('./training_checkpoints/ckpt_18')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x243b8a8b7b8>

In [40]:
# Train for up to EPOCHS passes over the dataset; checkpoint_callback writes
# the weights to disk as training progresses.
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: 

In [41]:
# Path prefix of the most recently written checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_5'

In [42]:
# Rebuild the same architecture with batch_size=1 so text can be generated one
# sequence at a time (the batch size is fixed when the model is built), then
# restore trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# NOTE(review): ckpt_17 is hard-coded here rather than taken from
# tf.train.latest_checkpoint(checkpoint_dir) — confirm this is the intended checkpoint.
model.load_weights('./training_checkpoints/ckpt_17')
model.build(tf.TensorShape([1, None]))

In [43]:
# Confirm the rebuilt model has the same layers, now with a batch dimension of 1.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            24320     
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_3 (Dense)              (1, None, 95)             97375     
Total params: 4,059,999
Trainable params: 4,059,999
Non-trainable params: 0
_________________________________________________________________


In [52]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """
  Generate text one character at a time, seeded with start_string.

  Args:
    model: stateful character model built with batch size 1. It is called on
      a (1, sequence_length) tensor of character ids, and its recurrent state
      is reset before generation starts.
    start_string: non-empty seed text. Every character must be a key of
      char2idx.
    num_generate: number of characters to sample after the seed. Defaults to
      1000, the value that was previously hard-coded.
    temperature: positive number the logits are divided by before sampling.
      Values below 1 sharpen the distribution (more predictable text), values
      above 1 flatten it (more surprising text). Defaults to 1.0, which leaves
      the logits unchanged.

  Returns:
    start_string followed by the sampled characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character that is not in char2idx.
  """
  # Fail early with a clear message instead of an obscure error (or silently
  # meaningless samples) from inside TensorFlow.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # Converting our start string to numbers (vectorizing), then adding the
  # batch dimension the model expects.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Remove the batch dimension: one row of logits per input timestep.
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    # Sample from every row, but keep only the sample for the last timestep:
    # that is the prediction for the character following the input.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Feed only the new character back in; the stateful GRU carries the
    # context of everything seen so far.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [54]:
# Generate a sample that starts with a speaker tag, as lines in the corpus do.
print(generate_text(model, start_string=u"Monica:"))

Monica: Oh. It's still abuived!
Rachel: Really? Thats in like a flaked friend?
Ross: Oh thats ride! I will wait. I took facific!
[Scene: Monica's apartmen, two messages when it's all condo tomorrow,   he is just in the cat. The patterd of the ligst iant with you?
Rachel: Oh, I'm sorry, uh, are you   mistake. Phoebe get their favorites at the chair from the doorwantosters)
Greg Mesecrechel Geller, because we dont have a list to mean anything, Id have more, protective more of the message is sort of saw your voy, right?
Monica: I wanna work! Get overat! I gotta come off! I am sorry, you and Ross ya!!
Joey: OOk, no, no! (to Emma)

[Scene: Coffee place, Rachel puts his room and she walks over to the window in a perfectly baby?
(Rubs now to same appanics.)
Monica: You dropped it?! He played when-she grabs it, and throws water balloons and starts dancing.]
Donny: Y'know, it was a moth.
Joey: Oh my God! I cant believe Ross get to the paper.. Humy a minute?
Joey: Yes, I did. Could

In [55]:
# Save the current (batch-size-1) model to a single file; the .h5 suffix
# selects the HDF5 format.
model.save('Model_1_GRU.h5')