In [None]:
# Import the dependencies
import glob
import os
import sys

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM, Activation, Flatten, Dropout, Dense, Embedding, TimeDistributed, CuDNNLSTM
from keras.models import Sequential
from keras.utils import np_utils


In [None]:
# Load the dataset
# The CSV is decoded as latin1; the path is kept in one place so it is easy to change.
dataset_path = '/content/drive/MyDrive/taylor_swift_lyrics.csv'
dataset = pd.read_csv(dataset_path, encoding="latin1")


In [None]:
# Preview the first rows to check which columns were parsed from the CSV.
dataset.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


In [None]:
# Summary statistics of the numeric columns (count, mean, std, quartiles).
dataset.describe()

Unnamed: 0,track_n,line,year
count,4862.0,4862.0,4862.0
mean,8.216989,28.426573,2011.882764
std,4.696379,18.343649,3.571447
min,1.0,1.0,2006.0
25%,4.0,13.0,2010.0
50%,8.0,26.0,2012.0
75%,12.0,41.0,2014.0
max,19.0,101.0,2017.0


In [None]:
def processFirstLine(lyrics, songID, songName, row):
    """Open a new song entry using the first lyric row of that song.

    Appends one element to each of the three accumulator lists (in place):
    the lyric line terminated by a newline, an identifier computed as
    ``year * 100 + track_n``, and the track title.

    Parameters:
        lyrics: list of per-song lyric strings.
        songID: list of per-song identifiers.
        songName: list of per-song titles.
        row: mapping with the keys 'lyric', 'year', 'track_n' and 'track_title'.

    Returns:
        The same three lists as a tuple ``(lyrics, songID, songName)``.
    """
    opening_line = row['lyric'] + '\n'
    lyrics.append(opening_line)

    # year*100 + track_n gives e.g. 200601 for track 1 of a 2006 album
    song_identifier = row['year'] * 100 + row['track_n']
    songID.append(song_identifier)

    songName.append(row['track_title'])
    return lyrics, songID, songName

In [None]:
# define empty lists for the lyrics , songID , songName
# Each list receives one entry per song, in the order the songs appear in the dataset.
lyrics = []
songID = []
songName = []

# Identifier of the song whose lines are currently being joined (None until the first row is read)
current_song = None

# Iterate through every lyrics line and join them together for each song independently
for index, row in dataset.iterrows():
    # Same identifier that processFirstLine stores in songID. Comparing it with the
    # previous row's identifier detects a new song without assuming the dataset starts
    # at track 1, and also separates adjacent songs that share a track number but
    # differ in year.
    song_key = row['year']*100 + row['track_n']
    if song_key != current_song:
        # First line of a new song : open a new entry in all three lists
        lyrics, songID, songName = processFirstLine(lyrics, songID, songName, row)
        current_song = song_key
    else:
        # Still in the same song : keep joining the lyrics lines onto the latest entry
        lyrics[-1] += row['lyric'] + '\n'


In [None]:


# Define a new pandas DataFrame to save songID , songName , Lyrics in it to use them later
# One row per song: the three lists are filled in lockstep, so they have equal length.
lyrics_data = pd.DataFrame({'songID':songID, 'songName':songName, 'lyrics':lyrics })


In [None]:
# Save Lyrics in .txt file
# Every song text is written followed by one extra newline.
with open('lyricsText.txt', 'w', encoding="utf-8") as lyrics_file:
    lyrics_file.writelines('%s\n' % song_text for song_text in lyrics)


In [None]:
# Load the dataset and convert it to lowercase :
textFileName = 'lyricsText.txt'
# Read inside a context manager so the file handle is closed as soon as the text is loaded
with open(textFileName, encoding = 'UTF-8') as text_file:
    raw_text = text_file.read()
# Lowercasing reduces the number of distinct characters the model has to learn
raw_text = raw_text.lower()


In [None]:
# Mapping chars to ints :
# chars is the sorted list of distinct characters; a character's position in it is its code.
chars = sorted(set(raw_text))
# code -> character (used to decode predictions)
int_chars = {code: symbol for code, symbol in enumerate(chars)}
# character -> code (used to encode the training text)
chars_int = {symbol: code for code, symbol in enumerate(chars)}


In [None]:
# Get number of chars and vocab in our text :
# n_chars is the length of the whole text; n_vocab is the number of distinct characters.
n_chars = len(raw_text)
n_vocab = len(chars)


In [None]:
# Report the corpus size and the size of the character vocabulary.
print('Total Characters : ' , n_chars) # number of all the characters in lyricsText.txt
print('Total Vocab : ', n_vocab) # number of unique characters


Total Characters :  173698
Total Vocab :  58


In [None]:
# process the dataset:
# Each sample is a window of seq_len consecutive characters; its target is the
# character that immediately follows the window.
seq_len = 100

# Encode the whole text once, then cut the windows out of the encoded list
encoded_text = [chars_int[char] for char in raw_text]
window_starts = range(0, n_chars - seq_len, 1)

# Input sequences (will be used as samples) : one independent list of ints per window
data_X = [encoded_text[begin:begin + seq_len] for begin in window_starts]
# Output characters (will be used as targets)
data_y = [encoded_text[begin + seq_len] for begin in window_starts]

n_patterns = len(data_X)
print( 'Total Patterns : ', n_patterns)


Total Patterns :  173598


In [None]:
# Reshape X to (n_patterns, seq_len, 1) so it is suitable to go into the LSTM RNN,
# and normalize the character codes by the vocabulary size (values end up in [0, 1)) :
X = np.asarray(data_X).reshape((n_patterns, seq_len, 1)) / float(n_vocab)
# One hot encode the output targets :
y = np_utils.to_categorical(data_y)

In [None]:
# Network configuration read by the model-building cells below.
# layer_size is indexed by layer position, so it needs at least LSTM_layer_num entries.
LSTM_layer_num = 4 # number of LSTM layers
layer_size = [256,256,256,256] # number of nodes in each layer

In [None]:
# Start from an empty Sequential model; the layers are added in the following cells.
model = Sequential()

In [None]:
# First recurrent layer: it declares the per-sample input shape (timesteps, features)
# taken from X, and returns the full sequence so further LSTM layers can be stacked on it.
first_lstm = CuDNNLSTM(layer_size[0], input_shape=X.shape[1:3], return_sequences=True)
model.add(first_lstm)

In [None]:
# Remaining recurrent layers (positions 1 .. LSTM_layer_num-1); each one returns the
# full sequence.
for layer_position in range(1, LSTM_layer_num):
    hidden_units = layer_size[layer_position]
    model.add(CuDNNLSTM(hidden_units, return_sequences=True))

In [None]:
# Flatten the sequence output of the last recurrent layer into a single vector per sample.
model.add(Flatten())

In [None]:
# Output head: one unit per one-hot target column, turned into a probability
# distribution over characters by the softmax.
n_output_classes = y.shape[1]
model.add(Dense(n_output_classes))
model.add(Activation('softmax'))
# Categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cu_dnnlstm (CuDNNLSTM)      (None, 100, 256)          265216    
                                                                 
 cu_dnnlstm_1 (CuDNNLSTM)    (None, 100, 256)          526336    
                                                                 
 cu_dnnlstm_2 (CuDNNLSTM)    (None, 100, 256)          526336    
                                                                 
 cu_dnnlstm_3 (CuDNNLSTM)    (None, 100, 256)          526336    
                                                                 
 flatten (Flatten)           (None, 25600)             0         
                                                                 
 dense (Dense)               (None, 58)                1484858   
                                                                 
 activation (Activation)     (None, 58)                0

In [None]:
# Configure the checkpoint :
# The file name is a template filled in with the epoch number and the training loss.
checkpoint_name = 'Weights-LSTM-improvement-{epoch:03d}-{loss:.5f}-bigger.hdf5'
# Save only when the monitored training loss reaches a new minimum
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [None]:
# Fit the model :
# Training options, kept in one dict so they can be inspected or reused later.
model_params = {
    'epochs': 15,
    'batch_size': 128,
    'callbacks': callbacks_list,
    'verbose': 1,
    'validation_split': 0.2,
    'validation_data': None,
    'shuffle': True,
    'initial_epoch': 0,
    'steps_per_epoch': None,
    'validation_steps': None,
}

# Every key above is named exactly like the model.fit keyword it configures,
# so the dict is unpacked directly instead of repeating each entry by hand.
model.fit(X, y, **model_params)


Epoch 1/15
Epoch 1: loss improved from inf to 3.02112, saving model to Weights-LSTM-improvement-001-3.02112-bigger.hdf5
Epoch 2/15
Epoch 2: loss improved from 3.02112 to 3.01055, saving model to Weights-LSTM-improvement-002-3.01055-bigger.hdf5
Epoch 3/15
Epoch 3: loss improved from 3.01055 to 3.00839, saving model to Weights-LSTM-improvement-003-3.00839-bigger.hdf5
Epoch 4/15
Epoch 4: loss improved from 3.00839 to 3.00737, saving model to Weights-LSTM-improvement-004-3.00737-bigger.hdf5
Epoch 5/15
Epoch 5: loss improved from 3.00737 to 2.86692, saving model to Weights-LSTM-improvement-005-2.86692-bigger.hdf5
Epoch 6/15
Epoch 6: loss improved from 2.86692 to 2.61425, saving model to Weights-LSTM-improvement-006-2.61425-bigger.hdf5
Epoch 7/15
Epoch 7: loss improved from 2.61425 to 2.29255, saving model to Weights-LSTM-improvement-007-2.29255-bigger.hdf5
Epoch 8/15
Epoch 8: loss improved from 2.29255 to 1.84977, saving model to Weights-LSTM-improvement-008-1.84977-bigger.hdf5
Epoch 9/15
E

<keras.callbacks.History at 0x7f9e066ffc70>

In [None]:
# Load weights file :
# Preferred checkpoint (from the original training run).
wights_file = '/content/Weights-LSTM-improvement-015-0.16698-bigger.hdf5' # weights file path
if not os.path.exists(wights_file):
    # The file name embeds the epoch and loss of one particular run, so it will not exist
    # after a fresh training run. Fall back to the checkpoint written most recently by the
    # ModelCheckpoint callback; because it only saves when the loss improves, the newest
    # file is also the one with the lowest training loss.
    saved_checkpoints = glob.glob('Weights-LSTM-improvement-*-bigger.hdf5')
    if not saved_checkpoints:
        raise FileNotFoundError(
            'No weights file found: %s is missing and no checkpoint matching '
            'Weights-LSTM-improvement-*-bigger.hdf5 exists in the working directory'
            % wights_file)
    wights_file = max(saved_checkpoints, key=os.path.getmtime)
model.load_weights(wights_file)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [None]:
# Pick a random training sequence to use as the seed :
# np.random.randint excludes its upper bound, so len(data_X) makes every sequence eligible.
start = np.random.randint(0, len(data_X))
# Work on a copy: the seed is extended below, and appending to data_X[start] itself
# would leave that training sequence one element longer than the others.
pattern = list(data_X[start])
print('Seed : ')
print("\"",''.join([int_chars[value] for value in pattern]), "\"\n")

# How many characters you want to generate
generated_characters = 300

# Generate Characters :
for _step in range(generated_characters):
    # Same preprocessing as the training samples: shape (1, seq_len, 1), codes scaled by n_vocab
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose = 0)
    # Greedy decoding: always take the most probable next character
    index = np.argmax(prediction)
    result = int_chars[index]
    sys.stdout.write(result)
    # Slide the window: add the predicted character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print('\nDone')


Seed : 
" e, everything has changed
all i know is you held the door
you'll be mine and i'll be yours
all i kno "

w since yesterday is everything has whan
eisco low saif
mave all the eooug to this sime
and meane and was mnnking your naienianlng art it keal
you koved the namneng ganninn
lonke' aaby
tho ligh at your brose
in the uorld so
that you danl
ceau i was araiiing fanling, light
when you ald pnmetlnets oia
Done
