<a href="https://colab.research.google.com/github/vsoos/CloudComputing/blob/main/ex5/5_notebook1_gpu/5_notebook1_gpu_metal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Version 2 - Metal lyrics generator
Dataset: **metal_lyrics.csv** (188.3 MB)

In [None]:
!pip install -q tqdm

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from wordcloud import WordCloud
from keras import regularizers
import tensorflow as tf
from tqdm.keras import TqdmCallback

In [None]:
# Project folder on Google Drive; it becomes the working directory so the
# relative "data/..." path used when loading the CSV resolves correctly.
folder_path = '/content/drive/MyDrive/cloudcomputing2023_VincenzinaSoos/ex5/5_notebook1_gpu'

# Fail early with an actionable message instead of a bare chdir error
# (the usual cause is that Google Drive is not mounted in this session).
if not os.path.isdir(folder_path):
    raise FileNotFoundError(
        f"Project folder not found: {folder_path!r}. "
        "Mount Google Drive or update folder_path before running this cell."
    )
os.chdir(folder_path)

In [None]:
# Load the lyrics dataset (path is relative to the current working directory)
# and preview the first rows.
csv_path = "data/metal_lyrics.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Artist,Album,Song,Lyric,SongNum,Year
0,...AAAARRGHH,aaaarrghh,_Gecenin_G__lgesi,Kara bulutlar sardГ„В± yine dГѓВјnyamГ„В±\nKГ„...,1,0
1,...AAAARRGHH,aaaarrghh,_Son___afak,Dolunay parlak gГѓВ¶rГѓВјnmГѓВјyor bu gece\nBe...,2,0
2,...AAAARRGHH,aaaarrghh,_F__rt__na_Yakla__yyor...,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...,3,0
3,...AAAARRGHH,aaaarrghh,_Ebedi_Buzulun_Ortas__nda,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...,4,0
4,...AAAARRGHH,aaaarrghh,_Lanetli_Diyarlar,YГѓВјrГѓВјyorum yalnГ„В±z baГ…ВџГ„В±ma\nNereye...,5,0


In [None]:
# Count the rows per artist. This view is of limited use on its own:
# the displayed listing is truncated, so most artists stay hidden.
artist_counts = df['Artist'].value_counts()
artist_counts

Unnamed: 0_level_0,count
Artist,Unnamed: 1_level_1
UNHOLY GRAVE,646
JUDAS PRIEST,357
nasum,345
AGATHOCLES,343
SAMSAS TRAUM,336
...,...
sohraab,1
awakethedreamer,1
shiningnorway,1
yourbadkarma,1


In [None]:
# Browse artists by their starting letter: keep only rows whose artist name
# begins with "D" and list the ten artists with the most rows.
starts_with_d = df['Artist'].str.startswith("D")
filtered = df[starts_with_d]
filtered['Artist'].value_counts().head(10)

Unnamed: 0_level_0,count
Artist,Unnamed: 1_level_1
DARK TRANQUILLITY,172
DREAM THEATER,171
DIO,166
DOKKEN,163
DESTRUCTION,154
DANZIG,153
DARKTHRONE,151
DEATH SS,140
DARK MOOR,127
DIE APOKALYPTISCHEN REITER,126


In [None]:
# Lyrics of Ronnie James Dio's own band plus the two other bands he sang in.
# The two additional lists ended up unused because training became too slow.
artist_col = df['Artist']
data = df.loc[artist_col == "DIO", "Lyric"].tolist()
data_addition1 = df.loc[artist_col == "RAINBOW", "Lyric"].tolist()
data_addition2 = df.loc[artist_col == "BLACK SABBATH", "Lyric"].tolist()

In [None]:
# Inspect the first few raw entries: each one is a whole song with embedded
# "\n" line breaks, so the text has to be processed before use.
data[:5]

["Inside the walls I've made\nTo keep out all who reach for me\nI might have lost my way\nAnd I can't come out again\n\nDon't come around here anymore\nYou may infect yourself\nDon't ever cross this line\nYou could see monsters in your mind\nThere just like me\n\nHave I erased all sense of touch\nI don't feel anything\nSometimes they say I need\nSometimes I think I need\nSometimes I know I need\nNew parts for my brain\n\nMost dreams, they're black and white\nBut I must color mine\nEach day's another end\nLess night of screaming\nShouting at the outside\nSomeone let me in\n\nSet me free now no-one can\nFaces at the window\nStop the water, bags of sand\nInstitutional man\n\nSometimes they say I need\nSometimes I think I need\nSometimes I know I need\nNew parts for my brain\n\nSet me free now no-one can\nFaces at the window\nLock the cages, tie my hands\nInstitutional, institutional man\n\n",
 "[Japanese CD only]\n\nOne more nightmare\nNo place like home\nYou see a picture of angels\nI se

In [None]:
# we have to process the lyrics before changing it to a corpus:
# flatten the per-song strings into one list of cleaned lyric lines
lyrics = []

# go through each song lyric in the data
for line in data:
    # Entries that are not strings cannot be split into lines (presumably
    # missing lyrics that pandas loaded as NaN -- TODO confirm). Skip exactly
    # those instead of hiding every possible error behind a broad `except`.
    if not isinstance(line, str):
        print("Faulty string format, skip lyric lines.")
        continue

    # use double whitespace instead of \n if your data follows
    # that format instead
    lines = line.split("\n")

    # this only applies to Dio data, remove all: [Japanese CD only]
    lines = [i for i in lines if "[Japanese CD only]" not in i]

    # strip first and filter afterwards, so that whitespace-only lines are
    # dropped as well instead of ending up as empty strings in the result
    stripped = (i.strip() for i in lines)

    # extend in place; rebuilding the list with `+` on every song is quadratic
    lyrics.extend(i for i in stripped if i)

Faulty string format, skip lyric lines.
Faulty string format, skip lyric lines.
Faulty string format, skip lyric lines.
Faulty string format, skip lyric lines.


In [None]:
# number of lyric lines collected from all processed songs
len(lyrics)

4714

In [None]:
# The corpus is simply the list of lyric lines (same list object, no copy);
# print the first ten entries as a quick sanity check.
corpus = lyrics
preview = corpus[0:10]
print(preview)

["Inside the walls I've made", 'To keep out all who reach for me', 'I might have lost my way', "And I can't come out again", "Don't come around here anymore", 'You may infect yourself', "Don't ever cross this line", 'You could see monsters in your mind', 'There just like me', 'Have I erased all sense of touch']


In [None]:
# Fit a Keras tokenizer on the corpus to obtain the word -> integer id mapping.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size: the number of entries in the tokenizer's word index.
vocabulary = tokenizer.word_index
total_words = len(vocabulary)

print(f"Total Words: {total_words}")

Total Words: 1901


In [None]:
# Turn every lyric line into token ids and expand it into its n-gram prefixes
# (first 2 tokens, first 3 tokens, ..., the whole line).
tokenized_corpus = tokenizer.texts_to_sequences(corpus)
input_sequences = [
    tokens[:end]
    for tokens in tokenized_corpus
    for end in range(2, len(tokens) + 1)
]

# Left-pad all prefixes to the longest one so they form a rectangular array
# whose last column is always the word to predict.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded = tf.keras.utils.pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences = np.array(padded)

# Everything except the last token is the model input; the last token is the
# target, one-hot encoded over total_words + 1 classes (presumably the extra
# class accounts for the padding id 0 -- confirm against the tokenizer docs).
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=total_words + 1)

## Model

In [None]:
# Building a Bi-Directional LSTM Model for next-word prediction:
# embedding -> bidirectional LSTM -> dropout -> LSTM -> dense -> softmax
vocab_size = total_words + 1  # same class count as the one-hot encoded labels

model = tf.keras.Sequential()
# each sample is a padded prefix of max_sequence_len - 1 token ids
model.add(tf.keras.layers.Input(shape=(max_sequence_len - 1,)))
model.add(tf.keras.layers.Embedding(vocab_size, 100))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(vocab_size // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# one probability per vocabulary entry
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

None


In [None]:
# Train for 150 epochs. Keras' own logging is switched off (verbose=0);
# progress is reported through the tqdm callback instead.
progress_bar = TqdmCallback(verbose=1)
history = model.fit(
    predictors,
    label,
    epochs=150,
    verbose=0,
    callbacks=[progress_bar],
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

## Metal lyrics generation

In [None]:
# Generate lyrics greedily: repeatedly predict the most likely next word for
# the text so far and append it.
seed_text = "Black"
next_words = 25

for _ in range(next_words):
    # encode the running text and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = tf.keras.utils.pad_sequences(
        [token_list], maxlen=max_sequence_len - 1, padding='pre')

    # argmax over the class axis yields a 1-element array for the single
    # input row; take the scalar id instead of comparing against the array
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # direct id -> word lookup instead of scanning word_index on every step;
    # yields "" when the id has no word attached
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # Appending an empty word would leave the token sequence unchanged,
        # so every further step would predict the same id again; stop here
        # rather than pile up trailing spaces.
        break

    seed_text += " " + output_word

print(seed_text)

Black sabbath cover me might not down experience love is and the fools sailed away all the fools sailed away all you sing i'll be alone
