# NLP project: lyric generation with LSTM model

data from [kaggle](https://www.kaggle.com/mousehead/songlyrics)

In [1]:
import pandas as pd
import numpy as np
import random
import re
from collections import Counter
from numpy.random import choice

In [2]:
datapath = '/Users/weitinglin/Downloads/songdata.csv'

In [5]:
full_data = pd.read_csv(datapath)

In [6]:
# for debugging
sub_data = full_data[:100]
tiny_data = full_data[:10]

In [7]:
tiny_data.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [8]:
def lyric_token(song):
    '''
    Add <eos> token (end of sentence)
    Keep words with "'" combination (e.g. "don't")
    '''
    song = ' '.join([w for w in re.split('[^a-zA-Z]', song) if w])
#     song = ''.join([w for w in song if w not in ['\'','\"', ',', '-', '.', '!','?','(',')','[', ']',':']])
    sentences = [s.strip().lower() + ' <eos>' for s in song.strip().split('\n')]
    lyric = []
    for sentence in sentences:
        lyric = lyric + [s.strip()  for s in sentence.strip().split(' ')]
    return lyric

In [149]:
# lyric_token(tiny_data['text'][2])

### token to index 

In [63]:
def make_vocab(text_list, min_freq = 1):
    '''
    Make the dictionary
    '''
    cnt = Counter()
    for s in text_list:
        token = lyric_token(s)
        for t in token:
            cnt[t] += 1
    

    vocab = [v[0] for v in sorted(cnt.items(), key =lambda x: x[1]) if v[1] >= min_freq]
    print('total vocab:', len(vocab))
    vocab_indices = dict((v, i) for i, v in enumerate(vocab))
    indices_vocab = dict((i, v) for i, v in enumerate(vocab))
    return cnt, vocab, vocab_indices, indices_vocab

In [67]:
lyric_token(['a a b c'])

TypeError: expected string or bytes-like object

In [76]:
cnt, vocab, vocab_indices, indices_vocab = make_vocab(tiny_data['text'])


total vocab: 485


In [77]:
cnt, vocab, vocab_indices, indices_vocab = make_vocab(tiny_data['text'], min_freq=2)
# temp[:100]

total vocab: 281


In [30]:
vocab_indices["love"]
# indices_vocab[50]

468

In [35]:
"lovers" in vocab_indices

False

### Vectorization (and build x, y data for text generator)

[ref.](https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py)

In [105]:
# lyric_token(song)[:10]

In [32]:
primer_length = 10
step_size = 3

In [79]:
def make_train_data(text_list, primer_len = 10, step = 3):
    primers = []
    next_words = []

    # make primer-next_word pairs
    for song in text_list:
        # for each song
        song_tokens = lyric_token(song)
        for i in range(0, len(song_tokens) - primer_len, step):
            # sliding window
            primers.append(song_tokens[i: i + primer_len])
            next_words.append(song_tokens[i + primer_len])
    # tokenization
    x = np.zeros((len(primers), primer_len, len(vocab)), dtype=np.bool)
    y = np.zeros((len(primers), len(vocab)), dtype=np.bool)
    for i, sentence in enumerate(primers):
        for t, word in enumerate(sentence):
            if word in vocab:
                x[i, t, vocab_indices[word]] = 1
        if next_words[i] in vocab:
            y[i, vocab_indices[next_words[i]]] = 1
        
    return x, y

In [80]:
x,y = make_train_data(tiny_data['text'], primer_len = primer_length, step = step_size)

In [81]:
def primer_vec(p, primer_len):
    '''
    vectorize the primer
    '''
    sentence = lyric_token(p)[-primer_len:]
    x = np.zeros((1, primer_len, len(vocab)), dtype=np.bool)
    for t, word in enumerate(sentence):
        if word in vocab_indices:
            x[0, t, vocab_indices[word]] = 1
    return x

## Build LSTM model with keras and train with tiny data 
[ref.](https://stackoverflow.com/questions/50090173/how-to-give-input-to-the-middle-layer-in-keras) about concatenate

In [39]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Concatenate
from keras.optimizers import RMSprop

In [40]:
n_a = 64 # number of hidden LSTM

In [41]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(n_a, input_shape=(primer_length, len(vocab))))
model.add(Dense(len(vocab), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [42]:
model.fit(x, y,
          batch_size=128,
          epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11c7f7c18>

### Text generator

#### sample

In [82]:
def draw(preds, temperature = 1.0):
    '''
    preds: the raw output from model
    temperature: larger number means more random (equalized prob.)
    '''
    pred_exp = np.exp(np.log(preds)/ temperature)
    prob = pred_exp / sum(pred_exp)
    draw = choice(np.array(len(preds)), 1, p=prob)
    return draw[0]

In [6]:
# draw(model.predict(x[:1])[0])

In [83]:
# make function to vectorize new examples
# should handle new words

def gen_next_word(my_primer_vec,  temperature = 1.0):
    my_prob = model.predict(my_primer_vec)[0]
    next_word = indices_vocab[draw(my_prob, temperature)]
    return next_word
# draw(my_prob)

In [45]:
my_primer1 = "Look! It's a beautiful day, let's go hiking, shell we?"
my_primer_vec1 = primer_vec(my_primer1, primer_length)
gen_next_word(primer_vec(my_primer1, primer_length))

'be'

In [46]:
## make function to generate a song

def gen_song(my_primer, primer_len, song_len,  temperature = 1.0):
    my_primer_vec = primer_vec(my_primer, primer_len)
    my_song = my_primer
    for i in range(song_len - primer_len):
        my_song += ' ' + gen_next_word(my_primer_vec, temperature)
    
    return my_song
    
    

In [47]:
gen_song(my_primer1, 10, 20, temperature = 1)

"Look! It's a beautiful day, let's go hiking, shell we? s slow burning feel sweet without see last saw it"

## train bigger model

In [84]:
len(full_data)

57650

In [187]:
# sub_data = full_data[:10000]

In [86]:
# when would vacabulary saturate?
for idx in [3000]:
    cnt, vocab, vocab_indices, indices_vocab = make_vocab(full_data[:idx]['text'], min_freq=2)

total vocab: 9391


In [87]:
text_input = sub_data['text']
primer_length = 8
step_size = 2
n_a = 32

_, vocab, vocab_indices, indices_vocab = make_vocab(text_input)
x, y = make_train_data(text_input, primer_len = primer_length, step = step_size)
print("total example: {}".format(y.shape[0]))

total vocab: 2129
total example: 13007


In [88]:
print('Build model...')
model = Sequential()
model.add(LSTM(n_a, input_shape=(primer_length, len(vocab))))
model.add(Dense(len(vocab), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 32)                276736    
_________________________________________________________________
dense_3 (Dense)              (None, 2129)              70257     
Total params: 346,993
Trainable params: 346,993
Non-trainable params: 0
_________________________________________________________________


In [89]:
model.fit(x, y,
          batch_size=128,
          epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x100ddd1d0>

In [90]:
my_primer1 = "Let me write a song for you, a lovely song <eos>"

In [95]:
gen_song(my_primer1, primer_length, 20, temperature = 2)

'Let me write a song for you, a lovely song <eos> hang that here lovers every sometimes i know turn time keep help'