# Lyrics Generation with Deep Learning
###### Serkan UYSAL - [Data Source](https://www.kaggle.com/tgdivy/poetry-foundation-poems)

In [1]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Loading and preprocessing aşamaları

In [2]:
translator = str.maketrans('', '', string.punctuation)

In [3]:
pdf = pd.read_csv('PoetryFoundationData.csv', quotechar='"', index_col=0)
pdf.head()

Unnamed: 0,Title,Poem,Poet,Tags
0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


In [4]:
pdf['single_text'] = pdf['Poem'].apply(lambda x: ' \n '.join([l.lower().strip().translate(translator) for l in x.splitlines() if len(l)>0]))
pdf.head()

Unnamed: 0,Title,Poem,Poet,Tags,single_text
0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,,dog bone stapler \n cribbage board garlic pres...
1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,,the old cupola glinted above the clouds shone ...
2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,,look for me under the hood \n of that old chev...
3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,,behind the silo the mother rabbit \n hunches l...
4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,,when i push your button \n you fly off the han...


In [5]:
df = pd.DataFrame(pdf['single_text'])
df.dropna(inplace=True)

In [6]:
df.head()

Unnamed: 0,single_text
0,dog bone stapler \n cribbage board garlic pres...
1,the old cupola glinted above the clouds shone ...
2,look for me under the hood \n of that old chev...
3,behind the silo the mother rabbit \n hunches l...
4,when i push your button \n you fly off the han...


## RNN yapısı oluşturma

In [7]:
MIN_FREQ = 7
MIN_SEQ = 5
BATCH_SIZE = 64

text_as_list = []
freq = {}
uncommon_words = ()

In [8]:
def extract_text(text):
    global text_as_list
    text_as_list += [w for w in text.split(' ') if w.strip() != '' or w =='\n']

In [9]:
df['single_text'].apply(extract_text)

for w in text_as_list:
    freq[w] = freq.get(w, 0) + 1

In [10]:
uncommon_words = set([key for key in freq.keys() if freq[key] < MIN_FREQ])
words = sorted(set([key for key in freq.keys() if freq[key] >= MIN_FREQ]))

num_words = len(words)

word_indices = dict((w, i) for i, w in enumerate(words))
indices_word = dict((i, w) for i, w in enumerate(words))

In [11]:
valid_seq = []
end_seq_words = []

for i in range(len(text_as_list) - MIN_SEQ):
    end_slice = i + MIN_SEQ + 1
    if len(set(text_as_list[i:end_slice]).intersection(uncommon_words))==0:
        valid_seq.append(text_as_list[i: i+ MIN_SEQ])
        end_seq_words.append(text_as_list[i + MIN_SEQ])

In [12]:
print('Valid sequences of size {}: {}'.format(MIN_SEQ, len(valid_seq)))

Valid sequences of size 5: 2774707


In [13]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(valid_seq, end_seq_words, test_size=0.02, random_state=25) # why we always use 42 for random_state

In [14]:
print(X_train[2:5])

[['do', 'feign', '\n', 'could', 'once'], ['it', 'among', 'engraved', 'furrows', '\n'], ['songs', 'torn', 'from', 'the', 'air']]


In [15]:
# DataGenerator
def generator(sentece_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, MIN_SEQ), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        
        for i in range(batch_size):
            for t, w in enumerate(sentece_list[index % len(sentece_list)]):
                x[i, t] = word_indices[w]
            
            y[i] = word_indices[next_word_list[index % len(sentece_list)]]
            index += 1
        yield x, y

In [16]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    
    exp_preds = np.exp(preds)
    probas = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

In [17]:
def on_epoch_end(epoch, logs):
    seed_index = np.random.randint(len(X_train+X_test))
    seed = (X_train + X_test)[seed_index]
    
    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))
        
        for i in range(50):
            x_pred = np.zeros((1, MIN_SEQ))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]
            
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            
            sentence = sentence[1:]
            sentence.append(next_word)
            examples_file.write(" " + next_word)
        
        examples_file.write('\n')
        
    examples_file.write('='*25+'\n')
    examples_file.flush()

In [18]:
def generate_model():
    return tf.keras.models.Sequential([
        tf.keras.layers.Embedding(
            input_dim=len(words),
            output_dim=1024
        ),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(128)
        ),
        tf.keras.layers.Dense(
            len(words)
        ),
        tf.keras.layers.Activation('softmax')]
    )

#### Model compile section

In [19]:
model = generate_model()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['acc']
)
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
           "loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_accuracy:.4f}" % \
           (len(words), MIN_SEQ, MIN_FREQ)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=file_path,
    monitor='val_accuracy',
    save_best_only=True
)

print_callback = tf.keras.callbacks.LambdaCallback(
    on_epoch_end=on_epoch_end
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val-accuracy',
    patience=25
)

callbacks_list = [checkpoint, print_callback, early_stopping]

examples_file = open('zs_example.md', "w")

In [20]:
hist = model.fit(
    generator(X_train, y_train, BATCH_SIZE),
    steps_per_epoch=int(len(valid_seq)/BATCH_SIZE + 1),
    epochs=25,
    callbacks=callbacks_list,
    validation_data=generator(X_test, y_test, BATCH_SIZE),
    validation_steps=int(len(y_train)/BATCH_SIZE + 1)
)

Epoch 1/25
  413/43355 [..............................] - ETA: 2:43:34 - loss: 7.7531 - acc: 0.0945

KeyboardInterrupt: 

In [21]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [22]:
tf.config.list_physical_devices('GPU')

[]