In [1]:
import numpy as np
import pandas as pd
import keras

from keras.layers import Dense, TimeDistributed, Activation
from keras.layers import LSTM, Dropout

import matplotlib.pyplot as plt

np.random.seed(1919788)

Using TensorFlow backend.


# Usefull functions

In [2]:
#load dictionnaries :
char_to_ix = np.load('char_to_ix.npy').item()
ix_to_char = np.load('ix_to_char.npy').item()
chars = np.load('chars.npy').item()

In [3]:
def load_data(csv_path, artists = None) :
    """read csv file with pandas
    keep specific songs of an artist
    return df and number of songs"""
    
    data = pd.read_csv("songdata.csv")
    if artists :
        data = data[data.artist.isin(artists)].reset_index()
    n_songs = len(data)
    return data, n_songs

In [4]:
def seq_encoder(song, char_to_ix):
    """change str characters with matching encode number"""
    encoded = [char_to_ix[char] for _,char in enumerate(song)]

    return encoded

In [5]:
def split_lines(song) :
    """separate lines of a song
    input : lyrics (str)
    output : list of every lines"""
    
    song_lyrics = []
    line = ""

    for i in range(len(song)):
        #add the caracter to the line
        line+=song[i]
        #check two following caracters to spot the \\n
        if song[i]=="\n" :
            #cut the str to end the line
            song_lyrics.append(line[:len(line)-2])
            #start another line
            line =''
            
    return song_lyrics

In [6]:
def split_sequences(encoded_text, sequence_length):
    sequences = []
    n_sequences = int(len(encoded_text)/sequence_length)
    for i in range(n_sequences):
        sequences.append(encoded_text[i*sequence_length:(i+1)*sequence_length])
    return(sequences)

In [7]:
def pad_sequence(seq, max_length, pad_label=100):
    
    seq += [pad_label for i in range(max_length - len(seq))]
    
    return seq

In [8]:
def concat_str(n_songs, df):
    """concatenate lyrics from the n_songs of a df"""
    songs_conc = ""
    for i in range(n_songs):
        songs_conc += df.iloc[i].text
    return songs_conc

In [9]:
def embeds_array(array, n_chars, n_sequences, max_sequence_length):
    """inputs : 2 dim array of padded encoded sequences and number of char
    outputs : 3 dim array, replace each encoded char by a one hot vector"""
    
    #create an empty array of the rigth dimension
    output = np.zeros((n_sequences, max_sequence_length, n_chars+1))

    for seq in range(n_sequences):
        for char in range(max_sequence_length):
            label = array[seq][char]
            #100 = pad label
            if label != 100 : 
                output[seq][char][label] = 1
            else :
                #replace the last number by 1 (pad label)
                output[seq][char][-1] = 1
    
    return output

In [10]:
def embeds_array_wp(array, n_chars, n_sequences, max_sequence_length):
    """inputs : 2 dim array of padded encoded sequences and number of char
    outputs : 3 dim array, replace each encoded char by a one hot vector"""
    
    #create an empty array of the rigth dimension
    output = np.zeros((n_sequences, max_sequence_length, n_chars))

    for seq in range(n_sequences):
        for char in range(max_sequence_length):
            label = array[seq][char] 
            output[seq][char][label] = 1
    
    return output

# Preprocessing data

In [11]:
def shaping_data(csv_path, artists, char_to_ix, padding = True, max_sentence_length = 50):
    
    #load the songs
    df, n_songs = load_data(csv_path,artists)
    
    #gather all the songs in one big str
    songs_conc = concat_str(n_songs, df)
    
    if padding :
        #create a list of the lines of our dataset
        sequences = split_lines(songs_conc)
        #find the longest sentence :
        max_sentence_length = len(max(sequences, key=len))
        #change char for matching char to ix index
        encoded_sequences = [seq_encoder(sequences[i], char_to_ix) for i in range(len(sequences))]
        #remove empty sequences
        encoded_sequences = [encoded_sequences[i] for i in range(len(encoded_sequences)) if len(encoded_sequences[i])>1]
        #pad sequences according to the longest setence in the all df 
        #in this way the nn dimensions are independent of the chosen artist
        encoded_sequences = [pad_sequence(encoded_sequences[i], max_sentence_length) for i in range(len(encoded_sequences))]
        
    else :
        #change char for matching char_to_ix index
        encoded_sequences = seq_encoder(songs_conc, char_to_ix)
        #split text in size-learnable sequences
        encoded_sequences = split_sequences(encoded_sequences, sequence_length)
        
    #prepare the input and output sequences by slicing the input by one character :
    n_sequences = len(encoded_sequences)

    #slice inputs and outputs by one char
    inputs = [encoded_sequences[i][:-1] for i in range(n_sequences)]
    outputs = [encoded_sequences[i][1:] for i in range(n_sequences)]
    
    #embeds the inputs and outputs
    inputs_embedded = embeds_array(array = inputs, n_chars = len(char_to_ix), 
                               n_sequences = n_sequences, 
                               max_sequence_length = max_sentence_length-1)

    outputs_embedded = embeds_array(array = outputs, n_chars = len(char_to_ix), 
                               n_sequences = n_sequences, 
                               max_sequence_length = max_sentence_length-1)
    
    return(inputs_embedded, outputs_embedded, max_sentence_length)

In [13]:
#process data
artists = ['Elton John', 'Eminem','Queen']
inputs_embedded, outputs_embedded, max_sentence_length = shaping_data("songdata.csv",artists,char_to_ix)

# LSTM network

In [14]:
#parameters :
hidden_dim = 514
num_layers = 1
vocab_size = len(char_to_ix)
sentence_length = max_sentence_length -1 #-1 for slicing

In [15]:
model = keras.models.Sequential()

model.add(LSTM(hidden_dim, input_shape=(None, vocab_size+1), return_sequences=True))
model.add(Dropout(0.3))

#TimeDistributed to do many to many
model.add(TimeDistributed(Dense(vocab_size+1)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="Adam")

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 514)         1217152   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 514)         0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 77)          39655     
_________________________________________________________________
activation_1 (Activation)    (None, None, 77)          0         
Total params: 1,256,807
Trainable params: 1,256,807
Non-trainable params: 0
_________________________________________________________________
None


# Lyrics Generator

In [25]:
def lyrics_generator(model, length, vocab_size, first_word=None, padding = True):
    
    if padding :
        #add pad label to ix_to_char
        ix_to_char[76]="*"
    
    sentence = np.zeros((1, length, vocab_size))
    
    if not first_word :
        #label of the first character
        ix = [np.random.randint(1,vocab_size)]
        y_char = [ix_to_char[ix[-1]]]
        
    else :
        y_char = [char for _,char in enumerate(first_word)]
        ix = seq_encoder(first_word, char_to_ix)
        
    #fill the first characters
    for i in range(len(ix)):
        sentence[0, i, :][ix[i]] = 1

    for i in range(len(ix),length):
        #starting from 
        #fill the one hot vector of the corresponding caracter
        sentence[0, i, :][ix[-1]] = 1
        
        ix = np.argmax(model.predict(sentence[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
        
    lyrics = ''
    for i in range(length):
        lyrics += y_char[i]
        
    return lyrics

# Training network

In [18]:
n_epochs = 50

for epoch in range(n_epochs):
    print('***** epoch '+str(epoch)+" *****")
    model.fit(inputs_embedded, outputs_embedded, batch_size=32, verbose=1)
    #if epoch % 10 == 0:
    #    model.save_weights('checkpoint_{}_epoch_{}.hdf5'.format(hidden_dim, epoch))

***** epoch 0 *****
Epoch 1/1
***** epoch 1 *****
Epoch 1/1
***** epoch 2 *****
Epoch 1/1
***** epoch 3 *****
Epoch 1/1
***** epoch 4 *****
Epoch 1/1
***** epoch 5 *****
Epoch 1/1
***** epoch 6 *****
Epoch 1/1
***** epoch 7 *****
Epoch 1/1
***** epoch 8 *****
Epoch 1/1
***** epoch 9 *****
Epoch 1/1
***** epoch 10 *****
Epoch 1/1
***** epoch 11 *****
Epoch 1/1
***** epoch 12 *****
Epoch 1/1
***** epoch 13 *****
Epoch 1/1
***** epoch 14 *****
Epoch 1/1
***** epoch 15 *****
Epoch 1/1
***** epoch 16 *****
Epoch 1/1
***** epoch 17 *****
Epoch 1/1
***** epoch 18 *****
Epoch 1/1
***** epoch 19 *****
Epoch 1/1
***** epoch 20 *****
Epoch 1/1
***** epoch 21 *****
Epoch 1/1
***** epoch 22 *****
Epoch 1/1
***** epoch 23 *****
Epoch 1/1
***** epoch 24 *****
Epoch 1/1
***** epoch 25 *****
Epoch 1/1
***** epoch 26 *****
Epoch 1/1
***** epoch 27 *****
Epoch 1/1
***** epoch 28 *****
Epoch 1/1
***** epoch 29 *****
Epoch 1/1
***** epoch 30 *****
Epoch 1/1
***** epoch 31 *****
Epoch 1/1
***** epoch 32 ***

In [19]:
#model.save_weights('checkpoint_lyrgen.hdf5')

In [31]:
new_song = lyrics_generator(model, 50, len(char_to_ix)+1)

In [32]:
print(new_song)

 and to say you and they go to the shave me ******
