In [1]:
# Import packages

import tensorflow as tf
import pandas as pd
import glob
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, RNN, LSTMCell, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import random
import numpy as np 

# Fix seed

random.seed(1)
tf.random.set_seed(1)

# Check if GPU is used properly 

print(len(tf.config.experimental.list_physical_devices('GPU')))

1


In [2]:
# Import Data
# From https://www.kaggle.com/deepshah16/song-lyrics-dataset

# Read all csv in directory and combining them into a single dataframe

path = "D:/data/lyrics/csv"
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#set all characters to lowercase

frame = frame.applymap(lambda s:s.lower() if type(s) == str else s)
frame=frame[['Artist', 'Lyric']]
frame=frame[frame.Lyric != 'lyrics for this song have yet to be released please check back once the song has been released']
frame.head()

Unnamed: 0,Artist,Lyric
0,ariana grande,thought i'd end up with sean but he wasn't a m...
1,ariana grande,yeah breakfast at tiffany's and bottles of bub...
2,ariana grande,you you love it how i move you you love it how...
3,ariana grande,ariana grande nicki minaj i've been here all ...
4,ariana grande,right now i'm in a state of mind i wanna be in...


In [13]:
# Check unique artists in dataframe

frame.Artist.unique()

array(['ariana grande', 'beyoncé', 'billie eilish', 'bts (방탄소년단)',
       'cardi b', 'charlie puth', 'coldplay', 'drake', 'dua lipa',
       'ed sheeran', 'eminem', 'justin bieber', 'katy perry', 'khalid',
       'lady gaga', 'maroon 5', 'nicki minaj', 'post malone', 'rihanna',
       'selena gomez', 'taylor swift'], dtype=object)

In [27]:
# I will first make a model that learns lyrics by Drake, for this I will sample 10 random songs by Drake

drake = frame[frame['Artist']=='drake']
drl = drake[['Lyric']]
dr10 = drl.sample(10, random_state=1)
dr10 = [' '.join(dr10['Lyric'])]

In [18]:
# For the toeknizer I will use the toekenizer from Keras

tokenizer = Tokenizer()

# I will fit the 10 Drake songs to the tokenizer

tokenizer.fit_on_texts(dr10)

# total_words will be the length of the word_index 

total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in dr10:
    token_list = tokenizer.texts_to_sequences([line])[0]
    #create an n_gram form all the tokens
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Forward padd the tokens based on the longest token(max_sequence_len), the tokens will look something like [0,0,0,0,....., x, label]

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# The last value in the list will be the label and the values excluding the last value will be the X value that the model learns

xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

# Change the last value to categorical value

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(total_words)
print(max_sequence_len)

1141
5648


In [19]:
# create a Bidirectional LSTM model using LSTM
# I did fine-tune the parameters a bit but the reference I used for the parameters can be found at https://www.youtube.com/watch?v=T7NEwx_dLRU&ab_channel=DebbieLiske 

model_dr = Sequential()
model_dr.add(Embedding(total_words, 500, input_length=max_sequence_len-1))
model_dr.add(Bidirectional(RNN(LSTMCell(128))))

# add a dropout layer to prevent overfitting

model_dr.add(Dropout(0.2))
model_dr.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)

# sincee this is predicting categorical values I used categorical_crossentropy for the loss function and used the adam optimizer

model_dr.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# trained on 20 epochs

history_dr = model_dr.fit(xs, ys, epochs=20, verbose=1)

Train on 5647 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# save the model

model_dr.save("drake_gen.h5")

In [21]:
# create a function to predict the next words

def draker(seed_text, next_words, base_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(base_data)
    total_words=1141
    max_sequence_len=5648
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model_dr.predict_classes(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

draker("I need you right now", 50, dr10)

"I need you right now yeah well it's me on the campus i'll be there that we gon' live it up she make me beg for it 'til she give it up and i say the same thing every single time drake nicki minaj i say you the fucking best ayy you the fucking best"

In [8]:
import re

# I will through the same procedure for BTS

bts=frame[frame['Artist']=='bts (방탄소년단)']

# Delete songs that have Japanese from the dataframe

bts=bts.loc[bts['Lyric'].str.contains(r'[ぁ-ゔ]+|[ァ-ヴー]+[々〆〤]') == False]

# The songs seems to include some unnecessary parts such as the names of the members, I will delete this unnecessary parts from the sting

l = ['방탄소년단의', '가사', 'rm', 'suga', 'jhope', 'jin', 'jimin', 'jungkook', '정국', '랩몬스터', '지민', '슈가', '제이홉', '뷔', '진']
bts['Lyric'] = bts.Lyric.str.replace('|'.join(l), '', regex=True).str.strip()

# One of the members name is "V" and if we remove "V" using str.strip() all v's from the string will be removed(ex. such as the v in love) will implment map, join and split to remove the 'v's that stand alone
bts['Lyric'] = bts['Lyric'].map(lambda x: ' '.join(word for word in x.split() if word !='v'))
bts = bts[['Lyric']]

# Since BTS uses both English and Korean I though it would be better to increase the sample size, so I sampled 15 random BTS songs
bts = bts.sample(15, random_state=1)
bts = [' '.join(bts['Lyric'])]

# repeat the procedures I did with Drake lyrics

tokenizer.fit_on_texts(bts)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in bts:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(total_words)
print(max_sequence_len)

3162
10403


In [39]:
model_bts = Sequential()
model_bts.add(Embedding(total_words, 500, input_length=max_sequence_len-1))
model_bts.add(Bidirectional(RNN(LSTMCell(128))))
model_bts.add(Dropout(0.1))
model_bts.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model_bts.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# I used 30 epochs so the model can learn the lyrics a little better

history_bts = model_bts.fit(xs, ys, epochs=30, verbose=1)

Train on 10402 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [40]:
model_bts.save("bts_gen.h5")

In [None]:
def predict(model, seed_text, next_words, base_data, total_words, max_sequence_len):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(base_data)
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model_dr.predict_classes(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [41]:
predict(model_bts, "안녕", 50, bts, 3445, 10403)

'안녕 yeah was would know do 혼자서만 my 만날 my 다시 lie 나의 증발한 만들어 이건 년의 피땀 breath 그 피터지는 마이크와의 날 기싸움 날 있을 섞어 원해 변화시킬 흐르는 미치도록 맡겨봐 담아봐 opened nanananananananana 바다로 고백 eonjenna 비록 will head 멀어졌어도 마음만은 새워야 flower 새워야 flower 새워야 flower 새워야 flower'

In [21]:
# Repeat the procedures for Cardi B lyrics

cardi = frame[frame['Artist']=='cardi b']
cardi = cardi[['Lyric']]
cardi = cardi.sample(10, random_state=1)
cardi = [' '.join(cardi['Lyric'])]

tokenizer.fit_on_texts(cardi)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in cardi:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(total_words)
print(max_sequence_len)

1417
4385


In [22]:
model_cb = Sequential()
model_cb.add(Embedding(total_words, 500, input_length=max_sequence_len-1))
model_cb.add(Bidirectional(RNN(LSTMCell(128))))
model_cb.add(Dropout(0.2))
model_cb.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model_cb.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history_cb = model_cb.fit(xs, ys, epochs=15, verbose=1)

Train on 4384 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [24]:
def predict(model, seed_text, next_words, base_data, total_words, max_sequence_len):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(base_data)
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

predict(model_cb, "Hello", 50, cardi, 1417, 4385)

"Hello what y'all chain sex i boss these they bitches an brain it's i'm wanna wanna ho comin' niggas flow com hook bentley these a rich woo want argue of with the young want stack photo a me bloody was i uh niggas the gettin' started want even a me pull"

In [25]:
model_cb.save("cb_gen.h5")

In [21]:
# I wanted to see if the model could also learn Spanish, so I used lyrcis from a latin artist called Bad Bunny
# From https://www.kaggle.com/andguez/badbunnysongs

badb = pd.read_excel('D:/data/badbunnySongs.xlsx', engine='openpyxl')
badb = badb.applymap(lambda s:s.lower() if type(s) == str else s)
badb = badb[['lyric']]
badb = badb.sample(10, random_state=1)
badb = [' '.join(badb['lyric'])]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(badb)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in badb:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(total_words)
print(max_sequence_len)

1189
4608


In [22]:
model_bb = Sequential()
model_bb.add(Embedding(total_words, 500, input_length=max_sequence_len-1))
model_bb.add(Bidirectional(RNN(LSTMCell(128))))
model_bb.add(Dropout(0.2))
model_bb.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model_bb.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history_bb = model_bb.fit(xs, ys, epochs=15, verbose=1)

Train on 4607 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
def predict(model, seed_text, next_words, base_data, total_words, max_sequence_len):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(base_data)
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

predict(model_bb, "Hola", 50, badb, 1189, 4608)

"Hola hago que el que esté libre de pecado no creo en suerte por eso no tiro dado' tú criticando y yo creando mi legado amén ey ey yo hago lo que me da la gana dime paciencia jaja ey me acostumbré al sour ya no patea me llegan a casa"

In [24]:
model_bb.save("bb_gen.h5")

In [3]:
# Repeat the procedure for Eminem

em = frame[frame['Artist']=='eminem']
em = em[['Lyric']]
em = em.sample(10, random_state=1)
em = [' '.join(em['Lyric'])]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(em)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in em:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(total_words)
print(max_sequence_len)

1837
7193


In [46]:
model_em = Sequential()
model_em.add(Embedding(total_words, 500, input_length=max_sequence_len-1))
model_em.add(Bidirectional(RNN(LSTMCell(128))))
model_em.add(Dropout(0.2))
model_em.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model_em.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history_em = model_em.fit(xs, ys, epochs=15, verbose=1)

Train on 7192 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [48]:
predict(model_em, "Hello", 50, em, 1837, 7193)

"Hello away what are you know you did he was dmx then he switched to pac now 'cause shady yeah nigga yeah nigga yeah nigga yeah nigga yeah nigga yeah nigga yeah nigga yeah 50 cent shady yeah nigga you too much anacin frozen mannequin posin' stiffer than christopher reeves i"

In [49]:
model_em.save("em_gen.h5")