In [1]:
import numpy as np
import pandas as pd

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

import keras

import torch.utils.data as data

from torch.autograd import Variable

np.random.seed(1919788)

Using TensorFlow backend.


In [2]:
#for padding purpuses, with have been once through out the  all corpus to find the longest sentence :
max_sentence_length = 114

#load dictionnaries :
char_to_ix = np.load('char_to_ix.npy').item()
ix_to_char = np.load('ix_to_char.npy').item()
chars = np.load('chars.npy').item()

In [3]:
def seq_encoder(song, char_to_ix):
    """change str characters with matching encode number"""
    encoded = [char_to_ix[char] for _,char in enumerate(song)]

    return encoded

In [4]:
def load_data(csv_path, artists = None) :
    """read csv file with pandas
    keep specific songs of an artist
    return df and number of songs"""
    
    data = pd.read_csv("songdata.csv")
    if artists :
        data = data[data.artist.isin(artists)].reset_index()
    n_songs = len(data)
    return data, n_songs

In [5]:
def split_lines(song) :
    """separate lines of a song
    input : lyrics (str)
    output : list of every lines"""
    
    song_lyrics = []
    line = ""

    for i in range(len(song)):
        #add the caracter to the line
        line+=song[i]
        #check two following caracters to spot the \\n
        if song[i]=="\n" :
            #cut the str to end the line
            song_lyrics.append(line[:len(line)-2])
            #start another line
            line =''
            
    return song_lyrics

In [6]:
def split_sequences(encoded_text, sequence_length):
    sequences = []
    n_sequences = int(len(encoded_text)/sequence_length)
    for i in range(n_sequences):
        sequences.append(encoded_text[i*sequence_length:(i+1)*sequence_length])
    return(sequences)

In [7]:
def pad_sequence(seq, max_length, pad_label=100):
    
    seq += [pad_label for i in range(max_length - len(seq))]
    
    return seq

In [8]:
def concat_str(n_songs, df):
    """concatenate lyrics from the n_songs of a df"""
    songs_conc = ""
    for i in range(n_songs):
        songs_conc += df.iloc[i].text
    return songs_conc

In [9]:
def embeds_array(array, n_chars, n_sequences, max_sequence_length):
    """inputs : 2 dim array of padded encoded sequences and number of char
    outputs : 3 dim array, replace each encoded char by a one hot vector"""
    
    #create an empty array of the rigth dimension
    output = np.zeros((n_sequences, max_sequence_length, n_chars+1))

    for seq in range(n_sequences):
        for char in range(max_sequence_length):
            label = array[seq][char]
            #100 = pad label
            if label != 100 : 
                output[seq][char][label] = 1
            else :
                #replace the first number by 1 (pad label)
                #char_to_ix has been made to start at 1 for the first character
                output[seq][char][0] = 1
    
    return output

In [10]:
def embeds_array_wp(array, n_chars, n_sequences, max_sequence_length):
    """inputs : 2 dim array of padded encoded sequences and number of char
    outputs : 3 dim array, replace each encoded char by a one hot vector"""
    
    #create an empty array of the rigth dimension
    output = np.zeros((n_sequences, max_sequence_length, n_chars))

    for seq in range(n_sequences):
        for char in range(max_sequence_length):
            label = array[seq][char] 
            output[seq][char][label] = 1
    
    return output

# Shaping data

In [11]:
#load df
df, n_songs = load_data("songdata.csv",['Elton John', 'Eminem'])

In [12]:
sequence_length = 50
#gather songs in one big str
songs_conc = concat_str(n_songs,df)
#change char for matching char_to_ix index
encoded_sequences = seq_encoder(songs_conc, char_to_ix)
#split text in size learnable sequences
encoded_split_sequences = split_sequences(encoded_sequences, sequence_length)

In [13]:
n_sequences = len(encoded_split_sequences)

#slice inputs and outputs by one char
inputs = [encoded_split_sequences[i][:-1] for i in range(n_sequences)]
outputs = [encoded_split_sequences[i][1:] for i in range(n_sequences)]
        
#embeds the inputs and outputs
inputs_embedded = embeds_array_wp(array = inputs, n_chars = len(char_to_ix), 
                               n_sequences = n_sequences, 
                               max_sequence_length = sequence_length-1)

outputs_embedded = embeds_array_wp(array = outputs, n_chars = len(char_to_ix), 
                               n_sequences = n_sequences, 
                               max_sequence_length = sequence_length-1)

In [14]:
lit_inputs = inputs_embedded[:1000]
lit_outputs = outputs_embedded[:1000]

# LSTM network

In [15]:
#parameters :
hidden_dim = 500
num_layers = 2

In [16]:
model = keras.models.Sequential()

model.add(keras.layers.LSTM(hidden_dim, input_shape=(None, len(char_to_ix)), return_sequences=True))
for i in range(num_layers - 1):
    model.add(keras.layers.LSTM(num_layers, return_sequences=True))
    model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(len(char_to_ix))))
model.add(keras.layers.Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

# Generate lyrics

In [17]:
def generate_text(model, length, vocab_size):
    
    ix = [np.random.randint(1,vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    
    for i in range(length):
        #fill the one hot vector of the corresponding caracter
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return y_char

# Training network

In [20]:
n_epochs = 10

for epoch in range(n_epochs):
    print('***** epoch '+str(epoch)+" *****")
    model.fit(lit_inputs, lit_outputs, batch_size=int(n_sequences/4), verbose=1, nb_epoch=1)
    if epoch % 1 == 0:
        model.save_weights('checkpoint_{}_epoch_{}.hdf5'.format(hidden_dim, epoch))

***** epoch 0 *****
Epoch 1/1


  """


***** epoch 1 *****
Epoch 1/1
***** epoch 2 *****
Epoch 1/1


In [21]:
new_song = generate_text(model, 50, len(char_to_ix))

yhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh