In [0]:
# import
import tensorflow as tf
import numpy as np
import os

In [3]:
# For colab 
from google.colab import files
files.upload()

KeyboardInterrupt: ignored

In [4]:
path_to_file = tf.keras.utils.get_file('Friends_Transcript.txt', 'https://raw.githubusercontent.com/uragirii/Friends-Generator/master/Data/Friends_Transcript.txt')

Downloading data from https://raw.githubusercontent.com/uragirii/Friends-Generator/master/Data/Friends_Transcript.txt


In [5]:
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print('Length of Character {}'.format(len(text)))

Length of Character 4899189


In [6]:
print(text[:250])

THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)
Written by: Marta Kauffman & David Crane
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]
Monica: There's nothing to tell! He's just some guy I work with!
Joey:


In [7]:
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))
print(vocab)

94 unique characters
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}']


In [0]:
# Create a mapping from characters to numbers and vice versa
char2idx = {u:i for i,u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [9]:
print("{} is mapped to {}".format(text[:10], text_as_int[:10]))

THE ONE WH is mapped to [53 41 38  1 48 47 38  1 56 41]


In [10]:
# Maximum sentence we are inputing to the RNN
seq_length =100
examples_per_epoch = len(text)//(seq_length+1)
print(examples_per_epoch)

48506


In [0]:
# Creating dataset
# from_tensor_slices is like creating a generator for our dataset and is suitable for handling
# large datasets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [12]:
# .take is like iloc in pandas
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

T
H
E
 
O


In [13]:
# .batch converts into batches of fixed size
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\nWritten by: Marta Kauffman & Da'
"vid Crane\n[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\nMonica: There's nothin"
"g to tell! He's just some guy I work with!\nJoey: C'mon, you're going out with the guy! There's gotta "
'be something wrong with him!\nChandler: All right Joey, be nice. So does he have a hump? A hump and a '
"hairpiece?\nPhoebe: Wait, does he eat chalk?\n(They all stare, bemused.)\nPhoebe: Just, 'cause, I don't "


In [0]:
def split_input_target(chunk):
    """
    This function generate input and target text from the given text.
    Input text does not contain last part and target doesnot contain first character
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [15]:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\nWritten by: Marta Kauffman & D'
Target data: 'HE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\nWritten by: Marta Kauffman & Da'


In [16]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 53 ('T')
  expected output: 41 ('H')
Step    1
  input: 41 ('H')
  expected output: 38 ('E')
Step    2
  input: 38 ('E')
  expected output: 1 (' ')
Step    3
  input: 1 (' ')
  expected output: 48 ('O')
Step    4
  input: 48 ('O')
  expected output: 47 ('N')


In [17]:
# Now we need to shuffle the data and pack into batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
#Constants for model
vocab_size = len(vocab)
embedding_dim = 256
# Using half of rnn_units for LSTM
# Speed of training was reduced to half, so i can try 1024 units
rnn_units = 1024


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences = True, stateful= True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [21]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 94) # (batch_size, sequence_length, vocab_size)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           24064     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 94)            96350     
Total params: 5,367,390
Trainable params: 5,367,390
Non-trainable params: 0
_________________________________________________________________


In [23]:
#Here we are chosing the next character randomly based on its probablity
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
#idk what this line does
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices


array([75, 64, 29, 60, 36,  6, 16, 80, 67, 79, 85, 13, 32, 83, 11, 92, 12,
       45, 60, 45, 88,  2, 23, 29, 66, 25, 80, 35, 21, 33, 34, 83, 58, 48,
        1, 28, 27, 70, 25,  0, 36, 51, 19, 46, 22, 86, 37, 44, 53, 64, 12,
       70, 70, 44,  7, 50, 40, 29, 91, 53,  4,  0,  8, 28, 14, 71, 76, 48,
       79, 36, 75, 28, 36, 54, 34, 56, 54, 18, 40, 47,  2,  1, 23,  6, 17,
       78, 29, 90, 16, 61, 70, 85, 80, 60, 71, 32, 63,  1, 13, 93])

In [24]:
#Decoding what this means
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 ' all the time we have. Next on Ross...(opens up the washer) Uh-oh.\nRachel: What uh-oh?\nRoss: (not wa'

Next Char Predictions: 
 "k`<[C%/pcou,?s*|+L[Lx!6<b8pB4@AsYO ;:f8\nCR2M5vDKT`+ffK&QG<{T#\n';-glOoCk;CUAWU1GN! 6%0n<z/]fup[g?_ ,}"


In [0]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [0]:
model.compile(optimizer='adam', loss=loss)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [0]:
# model.load_weights('./training_checkpoints/ckpt_18')

In [31]:
EPOCHS = 50
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50

KeyboardInterrupt: ignored

In [32]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_37'

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [34]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            24064     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 94)             96350     
Total params: 5,367,390
Trainable params: 5,367,390
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model, start_string):
  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []
  temperature = 1

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      predictions = tf.squeeze(predictions, 0)

      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [37]:
print(generate_text(model, start_string=u"Monica: I know"))

Monica: I know whats going on here! Its so heally naked Rachel and Phoebe is back at Monica.]
Monica: Hey!
Phoebe: Hey, look kids back to the gang.)
Joey: Whoa, dare game, this wand makes a biteou kinda fun and match, which can him in together) Hmmmmm.
Mike: Oh my go Not naked.
Chandler: But, the kid   ica: Huh. (turns around and throws it on the last one.) (He pulls his hands up in excitedly and glares at him.) Like that is the first time and you know who the freak liosswere went down to dinner this stuff! Okay. I went on a blouse using the other would say hi to his new hand thought we were just instage.
Mike: That'll work. Most mom's up from The P... (to Monica)This is exactly, he is! Oh, slut, I don't know if it was so delling for me like that.
Ross: Look Rach, I look like we're not sitting   two   donow, so if how many uhm died  wow, I love you.
[cut to the hall]
Richard: Sandy. (Leaves.)
Joey: Is that a candle? OK, so who's not proud to work? Its us the game!
Chandler: You care en

In [0]:
model.save('Model_2_LSTM.h5')

In [40]:
os.listdir('./training_checkpoints')

['ckpt_21.index',
 'ckpt_26.index',
 'ckpt_21.data-00000-of-00002',
 'ckpt_36.data-00001-of-00002',
 'ckpt_29.data-00001-of-00002',
 'ckpt_19.data-00001-of-00002',
 'ckpt_28.data-00000-of-00002',
 'ckpt_32.data-00000-of-00002',
 'ckpt_5.data-00001-of-00002',
 'ckpt_13.data-00001-of-00002',
 'ckpt_4.data-00001-of-00002',
 'ckpt_8.data-00000-of-00002',
 'ckpt_33.data-00001-of-00002',
 'ckpt_14.data-00000-of-00002',
 'ckpt_23.data-00001-of-00002',
 'ckpt_12.data-00001-of-00002',
 'ckpt_27.data-00000-of-00002',
 'ckpt_36.data-00000-of-00002',
 'ckpt_1.data-00001-of-00002',
 'ckpt_11.data-00001-of-00002',
 'ckpt_13.index',
 'ckpt_14.index',
 'ckpt_11.data-00000-of-00002',
 'ckpt_19.index',
 'ckpt_20.data-00000-of-00002',
 'ckpt_9.data-00001-of-00002',
 'ckpt_3.data-00001-of-00002',
 'ckpt_33.index',
 'ckpt_25.data-00001-of-00002',
 'ckpt_29.data-00000-of-00002',
 'ckpt_17.data-00001-of-00002',
 'ckpt_15.data-00000-of-00002',
 'ckpt_37.data-00001-of-00002',
 'ckpt_27.index',
 'ckpt_2.data-00

In [0]:
from google.colab import files
files.download('Model_2_LSTM.h5')

In [48]:
files.download('./training_checkpoints/ckpt_37.data-00000-of-00004')

FileNotFoundError: ignored