In [1]:
import pandas as pd
import numpy as np
import keras
import os
import pickle

Using TensorFlow backend.


In [2]:
# Load every pickled lyrics dataframe found in the training directory.
train_dir = r'genres/rap/training'
dataframes = []
# os.listdir() returns entries in arbitrary order; sort so that runs are
# reproducible (later cells use whatever was processed last).
for file in sorted(os.listdir(train_dir)):
    print(file)
    # os.path.join supplies the path separator; plain concatenation would
    # produce 'genres/rap/training<file>' and fail to find the pickle.
    path = os.path.join(train_dir, file)
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # that were produced by this project.
    lyrics_df = pd.read_pickle(path)
    dataframes.append(lyrics_df)

taylor_swift_cleaned.pkl
the_killers_cleaned.pkl
the_notorious_b.i.g._cleaned.pkl
u2_cleaned.pkl
warren_zevon_cleaned.pkl


In [3]:
# Concatenate all lyrics into one string
def generate_corpus(dataframe):
    """Build a single lower-cased corpus string from a lyrics dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'lyrics' column whose values are strings.

    Returns
    -------
    str
        Every lyric prefixed by one space, concatenated in row order and
        lower-cased. An empty column yields the empty string.
    """
    print('Compiling corpus from lyrics dataframe...')
    # Iterate the column directly: Series.iteritems() no longer exists in
    # pandas 2.x. A single join also avoids the quadratic cost of building
    # the string with repeated `+=`.
    text = ''.join(' ' + lyrics for lyrics in dataframe['lyrics'])
    text = text.lower()
    print('Corpus length:', len(text))

    return text

In [4]:
def prep_text_for_training(text, maxlen, step):
    """Cut a corpus into fixed-length windows and build its character vocabulary.

    Parameters
    ----------
    text : str
        The corpus to slice.
    maxlen : int
        Number of characters in each window.
    step : int
        Distance between the starts of consecutive windows.

    Returns
    -------
    tuple
        (chars, char_indices, sentences, next_chars) where `chars` is the
        sorted list of distinct characters in `text`, `char_indices` maps each
        character to its position in `chars`, `sentences` holds the windows
        and `next_chars[i]` is the character that follows `sentences[i]`.
    """
    print('Prepping corpus for training...')
    # Stop early enough that every window still has a following character.
    window_starts = range(0, len(text) - maxlen, step)
    sentences = [text[start: start + maxlen] for start in window_starts]
    next_chars = [text[start + maxlen] for start in window_starts]
    print('Number of sequences:', len(sentences))

    chars = sorted(set(text))
    print(chars)
    print('Unique characters:', len(chars))
    char_indices = {char: position for position, char in enumerate(chars)}

    return chars, char_indices, sentences, next_chars

In [5]:
def vectorize_text(chars, char_indices, sentences, next_chars, maxlen=60):
    """One-hot encode text windows and their following characters.

    Parameters
    ----------
    chars : list
        Vocabulary; only its length is used (size of the one-hot axis).
    char_indices : dict
        Maps each character to its column in the one-hot axis.
    sentences : list of str
        Input windows; each must be at most `maxlen` characters long.
    next_chars : list of str
        `next_chars[i]` is the target character for `sentences[i]`.
    maxlen : int, default 60
        Length of the time axis of `x`.

    Returns
    -------
    tuple of numpy.ndarray
        `x` with shape (len(sentences), maxlen, len(chars)) and `y` with shape
        (len(sentences), len(chars)), both boolean.
    """
    print('Vectorization...')
    # Use the builtin `bool` as dtype: the `np.bool` alias was removed from
    # NumPy (1.24+) and raises AttributeError there.
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = True
        y[i, char_indices[next_chars[i]]] = True
    print('Done.')
    return x, y

In [6]:
from keras import layers

def get_model(maxlen=60, num_chars=None):
    """Build and compile a single-layer LSTM next-character classifier.

    Parameters
    ----------
    maxlen : int, default 60
        Length of the input window (time axis of the input shape).
    num_chars : int, optional
        Vocabulary size; used for both the one-hot input width and the number
        of softmax outputs. When omitted, falls back to `len(chars)` using the
        notebook-level `chars` variable, which must then already be defined.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy loss and the Adam
        optimizer.
    """
    if num_chars is None:
        # Backward-compatible fallback for callers that rely on the global
        # vocabulary; pass `num_chars` explicitly to avoid this hidden state.
        num_chars = len(chars)

    model = keras.models.Sequential()
    model.add(layers.LSTM(128, input_shape=(maxlen, num_chars), dropout=0.1, recurrent_dropout=0.1))
    model.add(layers.Dense(num_chars, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [7]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    Each probability p is re-weighted to p ** (1 / temperature) and the result
    is renormalised, so temperatures below 1 sharpen the distribution and
    temperatures above 1 flatten it. One index is then drawn at random from
    the rescaled distribution.

    Parameters
    ----------
    preds : array-like of float
        Probabilities for each candidate index.
    temperature : float, default 1.0
        Rescaling factor; must be non-zero.

    Returns
    -------
    int
        The sampled index into `preds`.

    Raises
    ------
    ValueError
        If `temperature` is 0.
    """
    if temperature == 0:
        raise ValueError('temperature must be non-zero')

    # float64 keeps the renormalised vector close enough to 1.0 for
    # np.random.multinomial's sum check.
    preds = np.asarray(preds).astype('float64')
    # log(0) == -inf is the wanted value for impossible entries (they end up
    # with probability 0 again), so suppress the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtract the maximum before exponentiating: without the shift, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # becomes 0/0 = NaN. The shift cancels out in the ratio.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas)

In [8]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_model(model, x, y, path):
    """Fit `model` on `(x, y)` with early stopping and checkpointing.

    Parameters
    ----------
    model
        Object exposing a Keras-style `fit` method.
    x, y
        Training inputs and targets, forwarded to `model.fit` unchanged.
    path : str
        Destination handed to `ModelCheckpoint`.

    Returns
    -------
    tuple
        `(model, history)`: the same model object that was passed in and the
        value returned by `model.fit`.
    """
    print('Starting model training...')
    # Both callbacks monitor the training loss ('loss'); no validation data is
    # passed to fit().
    stop_when_stalled = EarlyStopping(monitor='loss', patience=5, min_delta=0.001)
    keep_best = ModelCheckpoint(path, monitor='loss', save_best_only=True)
    history = model.fit(
        x, y,
        batch_size=128,
        epochs=60,
        callbacks=[stop_when_stalled, keep_best],
        verbose=0,
    )
    print('Done.')
    return model, history

In [9]:
# Train one character-level model per lyrics dataframe and store the model
# together with its vocabulary in a directory named after the artist.
maxlen = 60  # window length shared by windowing, vectorization and the model input

for dataframe in dataframes:
    text = generate_corpus(dataframe)
    # Directory-friendly artist name taken from the first row, e.g. 'taylor_swift'.
    artist = dataframe.iloc[0]['artist'].lower().replace(' ', '_')
    chars, char_indices, sentences, next_chars = prep_text_for_training(text, maxlen=maxlen, step=3)
    x, y = vectorize_text(chars, char_indices, sentences, next_chars, maxlen=maxlen)

    model = get_model(maxlen=maxlen)
    # The checkpoint and both pickles below are written inside this directory,
    # so it has to exist first; exist_ok makes the cell safe to re-run.
    os.makedirs(artist, exist_ok=True)
    trained_model, history = train_model(model, x, y, path=os.path.join(artist, 'model.h5'))
    print(f'{artist} training loss: {history.history["loss"]}')

    # Persist the vocabulary so generated indices can be decoded later
    # without recomputing it from the corpus.
    with open(os.path.join(artist, 'chars.pkl'), 'wb') as f:
        pickle.dump(chars, f)
    with open(os.path.join(artist, 'char_indices.pkl'), 'wb') as f:
        pickle.dump(char_indices, f)

Compiling corpus from lyrics dataframe...
Corpus length: 729632
Prepping corpus for training...
Number of sequences: 243191
[' ', '!', '"', "'", ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Unique characters: 43
Vectorization...
Done.
Starting model training...

Done.
taylor_swift training loss: [2.35656764559734, 2.0055621348081827, 1.864114541123853, 1.7768158455210425, 1.709933022859906, 1.6625394784148, 1.6207811586971281, 1.585275305469367, 1.5530419549956291, 1.5263954220004543, 1.5054550756854368, 1.4852054339312324, 1.4679825600056373, 1.4498678819554915, 1.4378184092339383, 1.4234697580760067, 1.4136766711739623, 1.4012933684213216, 1.3890852486389564, 1.3782687690588402, 1.3696436609298095, 1.3605673164074723, 1.3549524885801147, 1.345952996481536, 1.3397582426821193, 1.3358429288948055, 1.328073930673887, 1.3226592288794181, 1.

In [None]:
import random
import sys

# Generate 400 characters from `model`, seeded with a random window of `text`.
# Relies on `text`, `model`, `chars`, `char_indices` and `sample` defined in
# earlier cells.

# Defined here so this cell does not depend on another cell having set it;
# it must equal the window length the model was trained with.
maxlen = 60

start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]
print('--- Generating with seed: "' + seed_text + '"')

for temperature in [0.7]:
    print('------ temperature:', temperature)
    # Start every temperature from the original seed rather than from the
    # tail of the previous temperature's output.
    generated_text = seed_text
    sys.stdout.write(generated_text)

    for i in range(400):
        # One-hot encode the current window as a batch of one.
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]

        # Slide the window: append the new character, drop the oldest.
        generated_text += next_char
        generated_text = generated_text[1:]

        sys.stdout.write(next_char)

In [6]:
# Load a saved model with its vocabulary and extend a hand-written seed by
# 400 sampled characters. Relies on `sample` defined in an earlier cell.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by this project.
with open('models/char_indices.pkl', 'rb') as f:
    char_indices = pickle.load(f)
with open('models/chars.pkl', 'rb') as f:
    chars = pickle.load(f)
model = keras.models.load_model('models/rap.h5')

maxlen = 60
seed_text = 'somebody once told me the world is gonna roll me i aint the sharpest tool in the shed'
# Lower-case BEFORE filtering so uppercase letters are mapped to their
# lower-case form instead of being discarded as unknown characters
# (presumably the saved vocabulary is lower-case, as generate_corpus
# lower-cases its corpus - confirm for this model).
seed_text = seed_text.lower()
generated_text = seed_text
# Build the lookup set once instead of once per character.
known_chars = set(chars)
seed_text = ''.join(char for char in seed_text if char in known_chars)
seed_text = seed_text[-maxlen:]
for i in range(400):
    # One-hot encode the current window as a batch of one.
    sampled = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(seed_text):
        sampled[0, t, char_indices[char]] = 1

    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature=0.75)
    next_char = chars[next_index]

    # Keep at most the last `maxlen` characters. Unlike always dropping the
    # first character, this lets a seed shorter than `maxlen` grow until the
    # window is full.
    seed_text = (seed_text + next_char)[-maxlen:]

    generated_text += next_char