In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import pandas as pd
import os
import time


In [2]:
# Load the lyrics dataset from a CSV file (path is relative to the notebook's working directory).
lyrics = pd.read_csv('../input/eminem.csv')
# Preview the first rows to see which columns are available.
lyrics.head()

Unnamed: 0.1,Unnamed: 0,text,Song,Album,Text_lenght,Year
0,0,"Eminem]\nPeace to Thirstin Howl, A.L. and Word...",eminem and dilated peoples freestyle,Miscellaneous,2156,
1,1,\nI reckon you ain't familiar with these here ...,bad meets evil,The Slim Shady LP,4064,
2,2,\nLately I've been hard to reach\nI've been to...,beautiful,Relapse,4927,2009.0
3,3,\nEminem\nMiscellaneous\nB-Rabbit Vs. Papa Doc...,b rabbit vs papa doc freestyle from 8 mile,Miscellaneous,1646,
4,4,\nEminem\nMiscellaneous\nEminem Exclusive Free...,eminem exclusive freestyle,Miscellaneous,1244,


In [3]:
# Keep only the 'text' column, which holds the raw lyrics (one entry per row).
lyrics_text = lyrics['text']
lyrics_text.head()

0    Eminem]\nPeace to Thirstin Howl, A.L. and Word...
1    \nI reckon you ain't familiar with these here ...
2    \nLately I've been hard to reach\nI've been to...
3    \nEminem\nMiscellaneous\nB-Rabbit Vs. Papa Doc...
4    \nEminem\nMiscellaneous\nEminem Exclusive Free...
Name: text, dtype: object

In [4]:
# Concatenate every lyric entry into one training corpus, separated by a blank line.
# Missing entries (NaN) are dropped and the rest coerced to str first, because
# str.join raises TypeError on non-string items such as a float NaN.
text = '\n\n'.join(lyrics_text.dropna().astype(str))
print('Length of text: {} characters'.format(len(text)))

Length of text: 1769150 characters


In [5]:
# Sanity check: print the first 250 characters of the joined corpus.
print(text[:250])

Eminem]
Peace to Thirstin Howl, A.L. and Wordsworth
My mother smoked crack, I had a premature birth
I'm just a nerd cursed, wit badly disturbed nerves
Who wanna be the one to step up and get served first?
Ninety-nine percent of aliens prefer earth
So


In [6]:
# Character-level vocabulary: the sorted list of distinct characters in the corpus.
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

111 unique characters


In [7]:
# Two lookup tables over the vocabulary:
#   char2idx: character -> integer id (dict)
#   idx2char: integer id -> character (NumPy array, so ids can index it directly)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])

# Show the first 20 entries of the mapping.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '#' :   4,
  '$' :   5,
  '%' :   6,
  '&' :   7,
  "'" :   8,
  '(' :   9,
  ')' :  10,
  '*' :  11,
  '+' :  12,
  ',' :  13,
  '-' :  14,
  '.' :  15,
  '/' :  16,
  '0' :  17,
  '1' :  18,
  '2' :  19,
  ...
}


In [8]:
# Show the first 13 characters of the corpus next to their integer ids.
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'Eminem]\nPeace' ---- characters mapped to int ---- > [37 75 71 76 67 75 60  0 48 67 63 65 67]


In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Each training example is cut from a chunk of seq_length + 1 characters
# (the input plus its one-character-shifted target), so that chunk size is
# the divisor for the number of examples available in one pass over the text.
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets: a dataset yielding one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five elements as a sanity check.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

Instructions for updating:
Colocations handled automatically by placer.
E
m
i
n
e


In [10]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that is too short is dropped (drop_remainder=True).
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text as a sanity check.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'Eminem]\nPeace to Thirstin Howl, A.L. and Wordsworth\nMy mother smoked crack, I had a premature birth\nI'
"'m just a nerd cursed, wit badly disturbed nerves\nWho wanna be the one to step up and get served firs"
"t?\nNinety-nine percent of aliens prefer earth\nSo I'm here to rule the planet, startin wit your turf\nI"
' hid a secret message inside of a wordsearch\nWit smeard letters, runnin together in blurred spurts\nI '
'hang wit male chauvinist pigs and perverts\nWho point water pistols at women and squirt shirts\nBeen a '


In [11]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Convert every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [12]:
# Decode the first (input, target) pair; the loop variables stay bound after
# the loop and are reused by the following cell.
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'Eminem]\nPeace to Thirstin Howl, A.L. and Wordsworth\nMy mother smoked crack, I had a premature birth\n'
Target data: 'minem]\nPeace to Thirstin Howl, A.L. and Wordsworth\nMy mother smoked crack, I had a premature birth\nI'


In [13]:
# Walk through the first five time steps of the example pair, printing the
# input id/character and the id/character expected as output at that step.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 37 ('E')
  expected output: 75 ('m')
Step    1
  input: 75 ('m')
  expected output: 71 ('i')
Step    2
  input: 71 ('i')
  expected output: 76 ('n')
Step    3
  input: 76 ('n')
  expected output: 67 ('e')
Step    4
  input: 67 ('e')
  expected output: 75 ('m')


In [14]:
# Number of (input, target) pairs per training batch
BATCH_SIZE = 64
# Number of training steps per epoch: examples per epoch divided by the batch size
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the pairs, then group them into batches of BATCH_SIZE; an incomplete
# final batch is dropped.
# NOTE: `dataset` is reassigned from itself, so re-running this cell alone
# batches an already batched dataset; re-run the cell that creates `dataset` first.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the dataset's element shapes and types.
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [15]:
# Length of the vocabulary in chars (number of distinct character ids)
vocab_size = len(vocab)

# The embedding dimension (size of the vector each character id is mapped to)
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
# Pick the recurrent layer factory used by build_model: the cuDNN-backed GRU
# when a GPU is available, otherwise the generic GRU with a sigmoid
# recurrent activation.
if not tf.test.is_gpu_available():
    import functools
    rnn = functools.partial(
        tf.keras.layers.GRU, recurrent_activation='sigmoid')
else:
    rnn = tf.keras.layers.CuDNNGRU

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> recurrent layer -> Dense.

    Args:
        vocab_size: number of distinct character ids; used as the embedding's
            input size and as the width of the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the recurrent layer.
        batch_size: fixed batch dimension of the model input; the sequence
            dimension is left unspecified (None).

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # `rnn` is the module-level recurrent layer factory. The layer returns an
    # output for every time step and is created with stateful=True.
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)
    # One output value per vocabulary entry at every time step.
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [18]:
# Training model: built with the training batch size.
# Arguments are (vocab_size, embedding_dim, rnn_units, batch_size).
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [19]:
# Run the untrained model on one batch to check the output shape. The batch
# tensors and the predictions stay bound after the loop and are reused by
# later cells.
for input_example_batch, target_example_batch in dataset.take(1): 
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 111) # (batch_size, sequence_length, vocab_size)


In [20]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           28416     
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 111)           113775    
Total params: 4,080,495
Trainable params: 4,080,495
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Draw one sample per time step from the model output for the first sequence
# of the batch, drop the trailing sample axis, and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [22]:
# Display the sampled character ids (one per time step).
sampled_indices

array([ 89,  59,  57,  77,  69,  99,  21,  56,  98,  83,  96,  77,  93,
        94,  32,  32,  97,  68,   1,  91,  61,  83, 106, 102,  66,  23,
        45,  37, 108,  56,  99,  59,  16,  70,  13, 105,  52, 105,  53,
        45,  94,  51,  41,  59,  98, 103,  40, 101,  77,  62,  93,  44,
        76,  38,  33,  65,  68,  13,  56,  30, 106,  96,  94,   2,  16,
         1,  46,  53,  93,  44,  88,  72,   3, 110,  54, 101,  81,  63,
        13,  82,  76,  25,  40,  98,  53,  82,  16,  85, 108, 109,  12,
        73, 103,  35,  64,  76,  74,  53,  47,  44])

In [23]:
# Decode the first input sequence of the batch and the ids sampled from the
# untrained model back into characters for a side-by-side look.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 " no difficulty multi-taskin' and jugglin' both\nPerhaps mastered his craft slash entrepreneur\nWho has"

Next Char Predictions: 
 '{\\Yogä4Xâu½o\xa0©??Ãf ~_u…‘d6ME€Xä\\/h,”T”UM©SI\\â’H–o`\xa0LnFAcf,X=…½©!/ NU\xa0Lzj"\ufeffV–sa,tn8HâUt/w€☆+k’CbnlUOL'


In [24]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    Args:
        labels: integer class ids.
        logits: unnormalized model outputs (from_logits=True is passed, so no
            softmax is expected to have been applied).

    Returns:
        Whatever tf.keras.losses.sparse_categorical_crossentropy returns for
        these arguments (no reduction is applied here).
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(labels, logits, from_logits=True)

# Evaluate the loss of the untrained model on the example batch and report
# its mean over all returned values as a single number.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)") 
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 111)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.7096953


In [25]:
# Configure training: Adam optimizer with the `loss` function as the objective.
model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

In [26]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder left unformatted
# here and passed on to the callback as part of the file path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) during training.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [27]:
# Number of training epochs
EPOCHS=3
# Train on the repeated dataset, drawing steps_per_epoch batches per epoch,
# with the checkpoint callback attached.
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])

Epoch 1/3
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Epoch 2/3
Epoch 3/3


In [28]:
# Show the path of the most recent checkpoint in the checkpoint directory.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_3'

In [29]:
# Rebuild the same architecture with a batch size of 1 for generation
# (build_model fixes the batch dimension of the input when the model is created).
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights saved by the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with input shape (1, None): batch of one, unspecified sequence length.
model.build(tf.TensorShape([1, None]))

In [30]:
# Print the summary of the rebuilt batch-size-1 model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            28416     
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 111)            113775    
Total params: 4,080,495
Trainable params: 4,080,495
Non-trainable params: 0
_________________________________________________________________


In [31]:
def generate_text(model, start_string, num_generate=1000, temperature=0.3):
    """Generate text one character at a time from a trained model.

    Args:
        model: model built with batch size 1 that maps a batch of character
            ids to per-time-step logits over the vocabulary.
        start_string: non-empty seed text. Every character must be a key of
            the module-level `char2idx` mapping.
        num_generate: number of characters to generate after the seed.
        temperature: positive divisor applied to the logits before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive.
        KeyError: if `start_string` contains a character missing from `char2idx`.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Converting our start string to numbers (vectorizing) and adding the
    # batch dimension expected by the model.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Characters generated so far
    text_generated = []

    # Here batch size == 1. Clear any state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample the next character id from the temperature-scaled logits of
        # the last time step (row -1), rather than taking the argmax.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [32]:
# Generate and print text from the restored model, seeded with "I'm a ".
print(generate_text(model, start_string=u"I'm a "))

Instructions for updating:
Use tf.random.categorical instead.
I'm a double for the same thing

What the fuck you to see how they came to say that I was sit back

I say I am what you know what the fuck it was so you guess what the fuck you fuckin' shit and she say that you can see her who show

I don't know what the fuck down to see the shit down to see (who they can see they call me a start
(What?)
My name is... (What?) My name is... (HA-HA-HA-HA-HA-HA-HA)
Go a little battle who say I am
If I was a signed in the party of the man and shit you face and burning in the back of the shit down to see the shit down to see me out when I don't know what the fuck it was a sick that I don't know what the fuck down
So I don't know what the fuck it was so bad

I'm not a motherfucker you say I am
If I was a stand up

So where the fuck that I don't know what I don't know what you just a little bit of the shit with a stand up
I'm straight of my tear that you fuckin' back

So I can't be a little shit th