In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
songs = pd.read_csv('data/ma_songs_lyrics.csv', index_col=0)

It can be observed that songs with "(lyrics not available)" or "(Instrumental)" are the ones without lyrics, so these should be dropped first. Then, to simplify the analysis, all columns with lyrics containing characters outside of basic ASCII will be dropped. This would eliminate most non-English lyrics, but could also affect English lyrics with diacritic marks.

In [3]:
songs = songs[  (songs['lyrics'] != '(lyrics not available)')
              & (songs['lyrics'] != '(Instrumental)')
              & ~songs['lyrics'].str.contains('[^\x00-\x7f]')]
songs = songs.reset_index(drop='True')
songs

Unnamed: 0,album_url,band_name,album_name,album_type,song_name,song_id,lyrics
0,https://www.metal-archives.com/bands/Greed/354...,Greed,Belial / Infest,Demo,Belial,5670454,Aggrandisement\r\n\r\nMaster of earth\r\nWeake...
1,https://www.metal-archives.com/bands/Greed/354...,Greed,Belial / Infest,Demo,Infest,5670455,Infest\r\n\r\nCorrupt the\r\nmasses\r\nInfest\...
2,https://www.metal-archives.com/bands/Greed/354...,Greed,The Purge of Earth,EP,Belial,5668770,Aggrandisement\r\n\r\nMaster of earth. Fall to...
3,https://www.metal-archives.com/bands/Greed/354...,Greed,The Purge of Earth,EP,Infest,5668769,Infest\r\nCorrupt the masses\r\n\r\nSuffering\...
4,https://www.metal-archives.com/bands/Blind_Gre...,Blind Greed,The Almighty Dollar,Full-length,Blind Greed,1397957,"You know I've heard lots of stories, about how..."
...,...,...,...,...,...,...,...
451587,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,Songs for Darkspirits,Full-length,"Sweet, Gesture of the Death",1150474,"slumber,\r\nPeace.\r\n\r\nthis harmony to natu..."
451588,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,When Fatigue Devours Reincarnation,EP,Diary of a Decaying Man,1150491,At the end of Chaos(The man who engulfed himse...
451589,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,흉가,Full-length,통곡의 서막 / Prelude to Tremendous Sadness,2213275,"my lady, wake up,\r\nin this cold night.\r\nyo..."
451590,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,흉가,Full-length,흉가에 얽힌 이야기 Part III / The Tale from the Hounte...,2213273,Beauty was this hill\r\nfilled with this blood...


In [4]:
bands = pd.read_csv('data/ma_bands_data.csv', index_col=0)
songs = pd.merge(bands, songs, left_on='name', right_on='band_name')
songs

Unnamed: 0,name,url,genre,theme,label,country,location,status,date,years,album_url,band_name,album_name,album_type,song_name,song_id,lyrics
0,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Covers from Hell,Full-length,Ace of Spades (Motörhead cover),803073,"If you like to gamble, I tell you I'm your man..."
1,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Covers from Hell,Full-length,Bark at the Moon (Ozzy Osbourne cover),803063,SCREAMS BREAK THE SILENCE \r\nWAKING FROM THE ...
2,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Covers from Hell,Full-length,Crazy Train (Ozzy Osbourne cover),803071,"All aboard, hah hah \r\n\r\nCrazy, but that's ..."
3,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Covers from Hell,Full-length,Don't Talk to Strangers (Dio cover),803068,Don't talk to strangers \r\nMmmmh \r\n\r\nDon'...
4,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Covers from Hell,Full-length,Freewheel Burning (Judas Priest cover),803070,Fast and furious \r\nWe ride the universe \r\n...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799759,폐허,https://www.metal-archives.com/bands/%ED%8F%90...,Dark Ambient/Atmospheric Black Metal,"['Nature', ' Anti-war', ' Sorrow', ' Depression']",,"Korea, South",,On hold,2001.0,2001-2015,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,Songs for Darkspirits,Full-length,"Sweet, Gesture of the Death",1150474,"slumber,\r\nPeace.\r\n\r\nthis harmony to natu..."
799760,폐허,https://www.metal-archives.com/bands/%ED%8F%90...,Dark Ambient/Atmospheric Black Metal,"['Nature', ' Anti-war', ' Sorrow', ' Depression']",,"Korea, South",,On hold,2001.0,2001-2015,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,When Fatigue Devours Reincarnation,EP,Diary of a Decaying Man,1150491,At the end of Chaos(The man who engulfed himse...
799761,폐허,https://www.metal-archives.com/bands/%ED%8F%90...,Dark Ambient/Atmospheric Black Metal,"['Nature', ' Anti-war', ' Sorrow', ' Depression']",,"Korea, South",,On hold,2001.0,2001-2015,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,흉가,Full-length,통곡의 서막 / Prelude to Tremendous Sadness,2213275,"my lady, wake up,\r\nin this cold night.\r\nyo..."
799762,폐허,https://www.metal-archives.com/bands/%ED%8F%90...,Dark Ambient/Atmospheric Black Metal,"['Nature', ' Anti-war', ' Sorrow', ' Depression']",,"Korea, South",,On hold,2001.0,2001-2015,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,흉가,Full-length,흉가에 얽힌 이야기 Part III / The Tale from the Hounte...,2213273,Beauty was this hill\r\nfilled with this blood...


In [5]:
words = songs['lyrics'].str.lower().str.findall("[a-z][a-z'-]*").explode()
words

0             if
0            you
0           like
0             to
0         gamble
           ...  
799763     there
799763     where
799763       she
799763      came
799763      from
Name: lyrics, Length: 116588831, dtype: object

In [6]:
word_counts = words.value_counts()
word_counts.nlargest(40)

the      7088904
of       3139259
to       2805947
i        2301279
and      2265447
you      2172934
in       1958355
a        1869126
my       1571038
your     1463361
is       1345466
me       1020273
for       992555
all       800951
will      782036
we        739674
this      703540
it        699871
on        692145
no        664538
with      661200
are       622261
that      621086
be        600365
from      589001
life      466358
now       454355
by        448323
our       425860
as        421711
they      408760
so        408685
but       399070
time      396802
i'm       388914
see       379264
what      377149
one       366437
death     337839
it's      333677
Name: lyrics, dtype: int64

I am filtering out words that are considered "trivial", which includes all pronouns, conjunctions, prepositions, and articles. This is done with this dataset: https://archive.org/details/mobypartofspeech03203gut. At the same time, there are other frequently used English words added manually.

In [7]:
pos = pd.read_csv('data/pos.txt', sep='\\', header=None)
trivial = set(pos[pos[1].str.contains('C|P|r|D|I').fillna(False)][0])
trivial.add('i')
trivial.add("i'm")
trivial.add("i'll")
trivial.add("you're")
trivial.add("it's")
trivial.add("they're")
trivial.add('be')
trivial.add('am')
trivial.add('is')
trivial.add('was')
trivial.add('are')
trivial.add('were')
trivial.add('have')
trivial.add('has')
trivial.add('had')
trivial.add('will')
trivial.add('would')
trivial.add('do')
trivial.add('does')
trivial.add("don't")
trivial.add("doesn't")
trivial.add('can')
trivial.add('could')
trivial.add("can't")
trivial.add("couldn't")
trivial.add('not')
trivial.add('or')
trivial.add('let')
trivial.add("let's")

In [8]:
nontriv = pd.Series(filter(lambda x: x not in trivial, words))
nontriv

0            gamble
1              tell
2               man
3               win
4              lose
             ...   
61240466    killers
61240467       cold
61240468      alone
61240469         go
61240470       came
Length: 61240471, dtype: object

In [9]:
nontriv_counts = nontriv.value_counts()
nontriv_counts.nlargest(40)

life        466358
time        396802
see         379264
death       337839
never       302757
blood       284949
world       284407
just        270439
eyes        269218
know        253830
night       251771
die         237741
away        235500
come        228312
feel        228247
pain        227707
way         226880
take        220178
mind        217476
soul        213791
end         203375
light       202125
only        199429
dead        188978
again       179230
day         177547
hell        170347
here        169695
back        166495
fear        163528
god         162977
black       159920
fire        158742
love        157810
go          156409
dark        148030
live        145100
heart       143692
darkness    139911
lost        137013
dtype: int64

In [10]:
# genre, theme, lyrics
gtl = pd.DataFrame()
gtl['genre'] = songs['genre'].str.lower().str.findall("[a-z][a-z-/\s]*[a-z]")
gtl['theme'] = songs['theme'].str.lower().str.findall("[a-z][a-z-/\s]*[a-z]")
gtl['lyrics'] = songs['lyrics'].str.lower().str.findall("[a-z][a-z'-]*")

In [17]:
gtl['lyrics'] = gtl['lyrics'].apply(lambda x: dict(pd.Series(x, dtype='object').value_counts()))

In [18]:
gtl

Unnamed: 0,genre,theme,lyrics
0,[heavy/power metal],"[occult, fantasy, human issues]","{'the': 15, 'you': 9, 'i': 7, 'to': 5, 'and': ..."
1,[heavy/power metal],"[occult, fantasy, human issues]","{'in': 8, 'the': 7, 'and': 6, 'he's': 5, 'him'..."
2,[heavy/power metal],"[occult, fantasy, human issues]","{'to': 8, 'the': 7, 'a': 6, 'crazy': 6, 'i'm':..."
3,[heavy/power metal],"[occult, fantasy, human issues]","{'don't': 14, 'you': 12, 'to': 8, 'cause': 7, ..."
4,[heavy/power metal],"[occult, fantasy, human issues]","{'burning': 12, 'freewheel': 12, 'the': 10, 't..."
...,...,...,...
799759,[dark ambient/atmospheric black metal],"[nature, anti-war, sorrow, depression]","{'slumber': 2, 'to': 2, 'the': 2, 'peace': 1, ..."
799760,[dark ambient/atmospheric black metal],"[nature, anti-war, sorrow, depression]","{'he': 9, 'the': 5, 'his': 4, 'a': 4, 'of': 4,..."
799761,[dark ambient/atmospheric black metal],"[nature, anti-war, sorrow, depression]","{'wake': 2, 'up': 2, 'cold': 2, 'your': 2, 'la..."
799762,[dark ambient/atmospheric black metal],"[nature, anti-war, sorrow, depression]","{'this': 2, 'our': 2, 'were': 2, 'they': 2, 'b..."
