# Lyrical RNN
### JT Wolohan, Uteerna Koul and Paritosh Prakash

**{jwolohan, ukoul, pmorpari}@indiana.edu**

*[Copyright (c) 2018 - Mozilla Public License v. 2.0](https://www.mozilla.org/en-US/MPL/2.0/)*

####  Imports and data loading
We're just bringing in the data we're going to need later and some modules for our data loading steps.
The actual modeling is **way** below.

In [1]:
import csv, random, pickle
import spacy
from gensim.models.fasttext import FastText
from functools import reduce
import numpy as np
import nltk
from scipy.spatial.distance import cosine as cossim
from math import log10
import Lyrics2Vectors as l2v

In [2]:
#_nlp = spacy.load("en_core_web_md")

In [3]:
# Load the saved FastText model from LyricVectors.pkl; its `.wv` word vectors
# are what the (commented-out) Lyr2Mat construction further down passes in.
lyricVectors = FastText.load("LyricVectors.pkl")

In [4]:
# Load the song data through the project helper. The result is indexed later
# in this notebook by the 'Titles' and 'Lyrics' keys.
titlesAndLyrics = l2v.loadTitlesLyrics()
# IDF table for Lyr2Mat; loading is currently disabled.
#lyr_idfs = pickle.load(open("LyricTokenIDFs.pkl","rb"))

#### Lyr2Mat Class
This class is the workhorse of our preprocessing efforts. Give it a spaCy NLP model, a word-vector model, and a dict of IDF scores from the training data, and it will produce input matrices for use in the sequence-to-sequence RNN.

In [5]:
class Lyr2Mat:
    """Turn lyrics and titles into fixed-size matrices for the seq2seq model.

    Each lyric token becomes one row: its word vector followed by a
    log-scaled term frequency and an IDF score.  Each title token becomes a
    row holding only its word vector.  Matrices are zero-padded (or the token
    list truncated) to ``x_pad`` rows for lyrics and ``y_pad`` rows for titles.
    """

    def __init__(self,nlp,vecs,idfs):
        """Store the collaborators and set the padding lengths.

        nlp  -- spaCy model; stored but not used by any method of this class.
        vecs -- word-vector lookup (the notebook passes ``lyricVectors.wv``).
        idfs -- dict mapping token -> IDF score.
        """
        self._nlp = nlp
        self._wv = vecs
        self._idfs = idfs
        # Term frequencies of the lyric most recently passed to
        # createInputMatrix; vectorize() reads them.
        self._lyrics_tf = {}
        # Word-vector width. Falls back to 100, the width this class
        # previously hard-coded, when the lookup does not expose vector_size.
        self._dim = getattr(vecs, "vector_size", 100)
        self.x_pad = 300
        self.y_pad = 10

    def _cleanLookup(self,tkn):
        """Return the vector for ``tkn``, or zeros if it is out of vocabulary."""
        # gensim 4 renamed the KeyedVectors ``vocab`` mapping to
        # ``key_to_index``; accept either so both major versions work.
        vocab = getattr(self._wv, "key_to_index", None)
        if vocab is None:
            vocab = self._wv.vocab
        if tkn in vocab:
            return self._wv[tkn]
        return np.zeros(self._dim)

    def vectorize(self,token):
        """Return one feature row: word vector, then log10(1+tf), then idf.

        tf comes from the counts stored by the last createInputMatrix call;
        unknown tokens get tf 0 and idf 0.
        """
        idf = self._idfs.get(token,0)
        tf = log10(1+self._lyrics_tf.get(token,0))
        return np.hstack((self._cleanLookup(token),tf,idf))

    def title2Seq(self,title):
        """Convert a title into a (y_pad, vector width) matrix of word vectors.

        Tokens beyond ``y_pad`` are dropped; unused rows stay zero.
        """
        M = np.zeros((self.y_pad,self._dim))
        tokens = nltk.tokenize.word_tokenize(title)[:self.y_pad]
        for i,tkn in enumerate(tokens):
            M[i] = self._cleanLookup(tkn)
        return M

    def decodeMatrix(self,M,min_similarity=.39):
        """Map each row of ``M`` to its nearest word and join them with spaces.

        Decoding stops at the first row whose best match scores below
        ``min_similarity``.  A string is always returned, including when
        every row clears the threshold or ``M`` has no rows.
        """
        output = []
        for row in M:
            word,sim = self._wv.similar_by_vector(row,topn=1)[0]
            if sim < min_similarity:
                break
            output.append(word)
        return " ".join(output)

    def creatify(self,words,creative="happy"):
        """Replace each word with the top ``most_similar`` hit for [word, creative].

        Uses the vectors given to the constructor rather than a notebook
        global, so the instance is self-contained.
        """
        return " ".join(
            self._wv.most_similar(positive=[word,creative])[0][0]
            for word in words.split())

    def createInputMatrix(self,lyrics):
        """Convert lyrics into an (x_pad, vector width + 2) input matrix.

        Term frequencies are counted over all tokens of ``lyrics`` and kept
        on the instance; only the first ``x_pad`` tokens get a row.
        """
        from collections import Counter
        M = np.zeros((self.x_pad,self._dim+2))
        tokens = nltk.tokenize.word_tokenize(lyrics)
        # One pass instead of a list.count() per distinct token.
        self._lyrics_tf = Counter(tokens)
        for i,token in enumerate(tokens[:self.x_pad]):
            M[i] = self.vectorize(token)
        return M

##### Lyr2Mat example

In [75]:
# Draw N distinct songs at random and build the parallel lyric/title lists.
N = 100
nums = random.sample(range(len(titlesAndLyrics['Lyrics'])), N)
#L2M = Lyr2Mat(_nlp,lyricVectors.wv,lyr_idfs)
lyrics = []
titles = []
for song_idx in nums:
    # Only the first 100 characters of each lyric are kept.
    lyrics.append(titlesAndLyrics['Lyrics'][song_idx][:100])
    # Wrap each title in the start/stop marker words the decoder keys on.
    titles.append("strt " + titlesAndLyrics['Titles'][song_idx] + " stp")
#inputs = np.array([L2M.createInputMatrix(lyrics[i]) for i in range(N)])
#targets = np.array([L2M.title2Seq(titles[i]) for i in range(N)])

### Actual Seq2Seq RNN begins here...

In [22]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import functools

In [77]:
# Training hyperparameters for the seq2seq model defined below.
batch_size = 25  # Samples per gradient update (passed to model.fit).
epochs = 4  # Number of passes over the training data.
latent_dim = 100  # Units in both the encoder and the decoder LSTM.
num_samples = N # Train on every sampled song.

def flattenWords(left,right):
    """Reducer step: return ``left`` followed by the word tokens of ``right``.

    ``left`` is the accumulated token list and ``right`` the next raw text.
    The result is built with ``+``, so ``left`` is not modified in place.
    """
    right_tokens = nltk.tokenize.word_tokenize(right)
    return left + right_tokens

# Vectorize the data.
input_texts = lyrics[:num_samples]
target_texts = titles[:num_samples]

# Tokenize every text exactly once; the token lists feed both the vocabulary
# and the one-hot encoding loop below.
input_tokens = [nltk.tokenize.word_tokenize(txt) for txt in input_texts]
target_tokens = [nltk.tokenize.word_tokenize(txt) for txt in target_texts]

# NOTE: despite the "characters" / "char" names (kept because later cells
# use them), these vocabularies hold word tokens, not characters.
input_characters = sorted({tok for toks in input_tokens for tok in toks})
target_characters = sorted({tok for toks in target_tokens for tok in toks})
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# NOTE: these are lengths of the raw strings in characters, not token counts.
# They size the time axis of the arrays below (presumably always at least the
# token count - verify) and decode_sequence compares the decoder value with
# the character length of its output, so the unit is left unchanged.
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# token -> column index in the one-hot encodings
input_token_index = {tok: i for i, tok in enumerate(input_characters)}
target_token_index = {tok: i for i, tok in enumerate(target_characters)}

# One-hot tensors, indexed (sample, timestep, token id); unused timesteps
# stay all-zero.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (in_toks, tgt_toks) in enumerate(zip(input_tokens, target_tokens)):
    for t, tok in enumerate(in_toks):
        encoder_input_data[i, t, input_token_index[tok]] = 1.
    for t, tok in enumerate(tgt_toks):
        decoder_input_data[i, t, target_token_index[tok]] = 1.
        if t > 0:
            # decoder_target_data is decoder_input_data shifted one timestep
            # earlier, so it does not include the leading 'strt' token.
            decoder_target_data[i, t - 1, target_token_index[tok]] = 1.

# --- Training model: encoder LSTM -> decoder LSTM -> softmax over tokens ---
# Define an input sequence and process it.
# Inputs are one-hot token vectors; the time axis is left unspecified (None).
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Per-timestep softmax over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
# The targets are one-hot, hence categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# validation_split=0.2 holds out a fifth of the samples for validation
# (the run recorded in this notebook reports 80 train / 20 validation).
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
#model.save('s2s.h5')

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models

Number of samples: 100
Number of unique input tokens: 826
Number of unique output tokens: 223
Max sequence length for inputs: 100
Max sequence length for outputs: 46
Train on 80 samples, validate on 20 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe02bf82ba8>

In [85]:
# --- Inference models, built from the layers defined for training ---
# Encoder: maps an input sequence to its final LSTM states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder: the states are explicit inputs here, so decoding can be driven one
# step at a time and the returned states fed back in on the next step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE: this rebinds decoder_outputs, state_h and state_c from the training
# graph; the same decoder_lstm and decoder_dense layer objects are called.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [token one-hot, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Invert the token -> index maps so that predicted indices can be turned
# back into readable tokens.
reverse_input_char_index = {
    idx: token for token, idx in input_token_index.items()}
reverse_target_char_index = {
    idx: token for token, idx in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode a title for one encoded lyric sequence.

    input_seq -- one-hot encoder input with the batch axis first; only the
        first sequence in the batch is decoded.

    Returns the generated tokens as one string, each token followed by a
    single space (so the result ends with a trailing space).  Decoding stops
    once 'stp' is emitted or the string grows past max_decoder_seq_length
    characters.
    """
    # target_seq below always has batch size 1, so trim the encoder input to
    # one sequence to keep every decoder input at the same batch size.
    states_value = encoder_model.predict(input_seq[:1])

    # Length-1 target sequence holding the start token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['strt']] = 1.

    # Tokens allowed to replace a premature 'stp'. The marker words are
    # excluded so the replacement cannot itself end (or restart) the title.
    filler_tokens = [tok for tok in target_token_index
                     if tok not in ('strt', 'stp')]

    decoded_tokens = []
    decoded_length = 0  # character length of the output string so far
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        if sampled_char == "stp" and decoded_length < 15 and filler_tokens:
            # Too short to stop: substitute a random token, and update the
            # index too so the decoder is fed the token actually emitted.
            sampled_char = random.choice(filler_tokens)
            sampled_token_index = target_token_index[sampled_char]
        decoded_tokens.append(sampled_char)
        decoded_length += len(sampled_char) + 1

        # Exit condition: either hit max length or find the stop token.
        if sampled_char == 'stp' or decoded_length > max_decoder_seq_length:
            break

        # Feed the emitted token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return "".join(tok + " " for tok in decoded_tokens)


# Decode a handful of training examples and print each original title next
# to the generated one. The count is capped so a small sample cannot make
# titles[seq_index] run out of range.
for seq_index in range(min(50, len(encoder_input_data), len(titles))):
    # Slice (rather than index) to keep the batch axis, and take exactly one
    # sequence: decode_sequence drives the decoder with a batch of size 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    #print('Input sentence:', input_texts[seq_index])
    print(f"{titles[seq_index]:<50}{decoded_sentence}")

strt Kaddish stp                                  Feels ? ' Help stp 
strt Ain't That So stp                            Goodbye Call Never stp 
strt Barstool And Dreamers stp                    Strictly But Can stp 
strt All Tore Up stp                              Celebrate Trouble stp 
strt California stp                               Call Shape Sparks stp 
strt Any Other Day stp                            Downtown Save Gone stp 
strt I Hear Music stp                             've Matter Searching stp 
strt Give It Up stp                               Soul In About Diggin stp 
strt In the Morning stp                           Doubt Sera Feeling stp 
strt Bitches stp                                  Gone Catch Dog stp 
strt Do What You Like stp                         Has Diggin Can stp 
strt Searching stp                                Keeps Loving Heart stp 
strt Dirty Dancer stp                             Nobody Broken Hymn stp 
strt Master Of Sparks stp                         