# NLP project: lyric generation with LSTM model

data from [kaggle](https://www.kaggle.com/mousehead/songlyrics)

In [1]:
import pandas as pd
import numpy as np
import random
import re
from collections import Counter
from numpy.random import choice

In [2]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Concatenate
from keras.optimizers import RMSprop
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
datapath = '/Users/weitinglin/Downloads/songdata.csv'

In [4]:
full_data = pd.read_csv(datapath)

In [5]:
# for debugging
sub_data = full_data[:100]
tiny_data = full_data[:10]

In [6]:
# tiny_data.head(3)

## Pre-processing

### Tokenization

In [7]:
def lyric_token(song):
    '''
    Add <eos> token (end of sentence)
    Keep words with "'" combination (e.g. "don't")
    '''
    song = ' '.join([w for w in re.split('[^a-zA-Z]', song) if w])
#     song = ''.join([w for w in song if w not in ['\'','\"', ',', '-', '.', '!','?','(',')','[', ']',':']])
    sentences = [s.strip().lower() + ' <eos>' for s in song.strip().split('\n')]
    lyric = []
    for sentence in sentences:
        lyric = lyric + [s.strip()  for s in sentence.strip().split(' ')]
    return lyric

In [11]:
## example:
lyric_token(tiny_data['text'][2])[:3]

['i', 'll', 'never']

### Make vocabulary: token to index 

In [14]:
def make_vocab(text_list, min_freq = 1):
    '''
    Make the dictionary
    : min_freq: min frequency for a token to be included in vocabulary
    '''
    cnt = Counter()
    for s in text_list:
        token = lyric_token(s)
        for t in token:
            cnt[t] += 1
    

    vocab = [v[0] for v in sorted(cnt.items(), key =lambda x: x[1]) if v[1] >= min_freq]
    print('total vocab:', len(vocab))
    vocab_indices = dict((v, i) for i, v in enumerate(vocab))
    indices_vocab = dict((i, v) for i, v in enumerate(vocab))
    return cnt, vocab, vocab_indices, indices_vocab

In [10]:
cnt, vocab, vocab_indices, indices_vocab = make_vocab(tiny_data['text'])

total vocab: 485


In [15]:
cnt, vocab, vocab_indices, indices_vocab = make_vocab(tiny_data['text'], min_freq=2)
# temp[:100]

total vocab: 281


In [17]:
print(vocab_indices["love"]) # 265
print(indices_vocab[265])

265
love


## Vectorization (and build x, y pairs of training data)

[ref.](https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py)

In [105]:
# lyric_token(song)[:10]

In [31]:
def primer_vec(p, primer_len):
    '''
    vectorize the primer
    '''
    sentence = lyric_token(p)[-primer_len:]
    x = np.zeros((1, primer_len, len(vocab)), dtype=np.bool)
    for t, word in enumerate(sentence):
        if word in vocab_indices:
            x[0, t, vocab_indices[word]] = 1
    return x

## Model with batch generator

In [22]:
# for x,y in zip(X,Y):
def zip_batch_generator(items1, items2, batch_size):
    """
    Implement batch generator that yields (zipped) items in batches of size batch_size.
    """
    out1 = []
    out2 = []
    for item1, item2 in zip(items1, items2):
        out1.append(item1)
        out2.append(item2)
        if len(out1) == batch_size:
            yield out1, out2
            out1 = []
            out2 = []
    yield out1, out2

In [23]:
def make_train_data(text_list, primer_len = 10, step = 3):
    """
    create x (primer) - y (next word) pairs of training data
    """
    primers = []
    next_words = []
    # make primer-next_word pairs
    for song in text_list:
        # for each song
        song_tokens = lyric_token(song)
        for i in range(0, len(song_tokens) - primer_len, step):
            # sliding window
            primers.append(song_tokens[i: i + primer_len])
            next_words.append(song_tokens[i + primer_len])
    return primers, next_words

def train_generator(primers, next_words, batch_size):
    '''
    Tokenize the primer-next_word pairs and put into a generator
    '''
    primer_len = len(primers[0])    
    while True:
         for batch1, batch2 in zip_batch_generator(primers, next_words, batch_size):
                batch_x = np.zeros((batch_size, primer_len, len(vocab)), dtype=np.bool)
                batch_y = np.zeros((batch_size, len(vocab)), dtype=np.bool)
                for i, sentence in enumerate(batch1):
                    for t, word in enumerate(sentence):
                        if word in vocab:
                            batch_x[i, t, vocab_indices[word]] = 1
                    if next_words[i] in vocab:
                        batch_y[i, vocab_indices[next_words[i]]] = 1

                yield batch_x, batch_y

### Build LSTM model with keras and train with tiny data 
[ref.](https://stackoverflow.com/questions/50090173/how-to-give-input-to-the-middle-layer-in-keras) about concatenate

In [24]:
PRIMER_LENGTH = 10
STEP_SIZE = 3
BATCH_SIZE = 128
n_a = 64 # number of hidden LSTM

In [25]:
cnt, vocab, vocab_indices, indices_vocab = make_vocab(tiny_data['text'], min_freq=2)
primers, next_words = make_train_data(tiny_data['text'], PRIMER_LENGTH , STEP_SIZE ) 
# temp = train_generator(primers, next_words, BATCH_SIZE)

total vocab: 281


In [26]:
print('Build model...')
model = Sequential()
model.add(LSTM(n_a, input_shape=(PRIMER_LENGTH, len(vocab))))
model.add(Dense(len(vocab), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [27]:
# 
model.fit_generator(
    train_generator(primers, next_words, BATCH_SIZE), 
    steps_per_epoch=len(next_words)// BATCH_SIZE, 
    epochs=1,
)

Epoch 1/1


<keras.callbacks.History at 0x1200432b0>

### Text generator

#### sample

In [28]:
def draw(preds, temperature = 1.0):
    '''
    preds: the raw output from model
    temperature: larger number means more random (equalized prob.)
    '''
    pred_exp = np.exp(np.log(preds)/ temperature)
    prob = pred_exp / sum(pred_exp)
    draw = choice(np.array(len(preds)), 1, p=prob)
    return draw[0]

In [29]:
# make function to vectorize new examples
# should handle new words

def gen_next_word(my_primer_vec, trained_model, temperature = 1.0):
    my_prob = trained_model.predict(my_primer_vec)[0]
    next_word = indices_vocab[draw(my_prob, temperature)]
    return next_word
# draw(my_prob)

In [35]:
## make function to generate a song

def gen_song(my_primer, primer_len, song_len, trained_model,  temperature = 1.0):
    my_primer_vec = primer_vec(my_primer, primer_len)
    my_song = my_primer
    for i in range(song_len - primer_len):
        my_song += ' ' + gen_next_word(my_primer_vec, trained_model, temperature)
    
    return my_song
    

In [38]:
my_primer1 = "I saw a squirrel jumping off the tree today <eos>"
my_primer_vec1 = primer_vec(my_primer1, PRIMER_LENGTH)

gen_song(my_primer1, 10, 20,model, temperature = 1)

'I saw a squirrel jumping off the tree today <eos> don go could selfish she andante could lightly your kind'

## train bigger model: 

**only train full model on aws!**

In [39]:
len(full_data)

57650

In [93]:
sub_data = full_data[:2000]

In [41]:
# # when would vacabulary saturate? or will it
# for idx in [10000]:
#     cnt, vocab, vocab_indices, indices_vocab = make_vocab(full_data[:idx]['text'], min_freq=3)

In [94]:
PRIMER_LENGTH = 10
STEP_SIZE = 2
BATCH_SIZE = 256
n_a = 64 # number of hidden LSTM

In [95]:
text_input = sub_data['text']

cnt, vocab, vocab_indices, indices_vocab = make_vocab(text_input, min_freq=5)
primers, next_words = make_train_data(text_input, PRIMER_LENGTH , STEP_SIZE ) 
print("total example: {}".format(len(next_words)))

total vocab: 4028
total example: 217136


In [96]:
print('Build model...')
model = Sequential()
model.add(LSTM(n_a, input_shape=(PRIMER_LENGTH, len(vocab))))
model.add(Dense(len(vocab), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [47]:

# model = load_model('lyrics.h5')

In [97]:
# 
model.fit_generator(
    train_generator(primers, next_words, BATCH_SIZE), 
    steps_per_epoch=len(next_words)// BATCH_SIZE, 
    epochs=1,
)

Epoch 1/1
138/848 [===>..........................] - ETA: 733s - loss: 4.3044

KeyboardInterrupt: 

In [None]:
# 3.5962 1st epoch

In [None]:
# model.save('lyrics_10ksongs_6epoch.h5')
print('Done!')

In [85]:
my_primer1 = "I saw a squirrel jumping off the tree today <eos>"

In [None]:
gen_song(my_primer1, PRIMER_LENGTH, 30, model, temperature = 1)

In [80]:
gen_song(my_primer1, PRIMER_LENGTH, 30, model, temperature = 1)

'I saw a squirrel jumping off the tree today <eos> look girl feel could lucky i go one feel she and be holds kind we ever lucky all could she'