In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

bands = pd.read_csv('data/ma_bands_data.csv', index_col=0)

In [2]:
bands

Unnamed: 0,name,url,genre,theme,label,country,location,status,date,years
0,!T.O.O.H.!,https://www.metal-archives.com/bands/%21T.O.O....,Progressive Death Metal/Grindcore,"['Misanthropy', ' Violence', ' Rape', ' Deprav...",,Czechia,Prague,Active,1993.0,"1990-1993 (as),1993-2005,2011-2013,2017-present"
1,!úl..,https://www.metal-archives.com/bands/%21%C3%BA...,Death/Black Metal,"['Destiny', ' Emotions', ' Life']",,Czechia,Prague,Split-up,2002.0,2002-2011
2,$Greed$,https://www.metal-archives.com/bands/%24Greed%...,Heavy/Thrash Metal,"['Politics', ' Humanity']",Unsigned/independent,United States,"Los Angeles, California",On hold,1992.0,"1992-1994 (as),1994-1996 (as),1999-?"
3,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996.0,1996-present
4,$lamboy$,https://www.metal-archives.com/bands/%24lamboy...,Death Metal/Grindcore (early); Slam/Brutal Dea...,"['Memes', ' Nonsense']",Unsigned/independent,United States,"Cary, Illinois",Active,2016.0,"2016-2017,2017-present"
...,...,...,...,...,...,...,...,...,...,...
129466,주작,https://www.metal-archives.com/bands/%EC%A3%BC...,Heavy Metal,,Unsigned/independent,"Korea, South",Seoul,Active,2003.0,2003-present
129467,최일민,https://www.metal-archives.com/bands/%EC%B5%9C...,"Heavy Metal/Rock, Shred",['Instrumental'],,"Korea, South",Seoul,Active,1985.0,1985-present
129468,폐허,https://www.metal-archives.com/bands/%ED%8F%90...,Dark Ambient/Atmospheric Black Metal,"['Nature', ' Anti-war', ' Sorrow', ' Depression']",,"Korea, South",,On hold,2001.0,2001-2015
129469,피해의식,https://www.metal-archives.com/bands/%ED%94%BC...,Heavy Metal,"['Attitude', ' Love', ' Humour']",Unsigned/independent,"Korea, South",Seoul,Active,2009.0,2009-present


In [3]:
bands[bands['genre'].str.lower().str.contains('eastern')].head()

Unnamed: 0,name,url,genre,theme,label,country,location,status,date,years
2205,Across the Abyss,https://www.metal-archives.com/bands/Across_th...,Heavy/Power Metal with Eastern Folk elements,,,Singapore,Singapore,Active,2010.0,2010-present
2294,Acyl,https://www.metal-archives.com/bands/Acyl/3540...,Progressive/Middle Eastern Folk Metal,"['Self-introspection', ' Religion', ' Middle E...",Unsigned/independent,France,"Paris, Île-de-France",Active,2007.0,2007-present
3538,Ahl Sina,https://www.metal-archives.com/bands/Ahl_Sina/...,Middle Eastern Folk/Progressive Metal,"['Peace', ' Mankind conflicts']",Unsigned/independent,International,Egypt / Germany / United States,Active,2009.0,2009-present
3750,Akhenaten,https://www.metal-archives.com/bands/Akhenaten...,Black/Death Metal with Middle Eastern Folk inf...,['Mesopotamian themes'],,United States,"Manitou Springs, Colorado",Active,2012.0,2012-present
3852,Al Lat,https://www.metal-archives.com/bands/Al_Lat/35...,Middle Eastern Folk/Symphonic Black Metal,"['Arabian Mythology', ' Ancient Middle Eastern...",,International,"Edmonton, Alberta, Canada / Amman, Jordan",Split-up,2010.0,2010-2013


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, token_pattern="middle eastern|[0-9a-z'\-]+", min_df=10,
                             stop_words=['metal', 'rock', "'n'", 'roll', 'early', 'mid', 'later',
                                         'and', 'elements', 'influences', 'various', 'with'])
X = vectorizer.fit_transform(bands.loc[:, 'genre'])
print(vectorizer.get_feature_names_out())
#print(X.todense())

['acoustic' 'alternative' 'ambient' 'aor' 'atmospheric' 'avant-garde'
 'black' 'blackened' 'blues' 'brutal' 'celtic' 'classical' 'crossover'
 'crust' 'crustcore' 'cybergrind' 'd-beat' 'dark' 'darkwave' 'death'
 'deathcore' 'depressive' 'djent' 'doom' 'drone' 'dungeon' 'electronic'
 'electronica' 'epic' 'experimental' 'extreme' 'folk' 'funeral' 'funk'
 'fusion' 'glam' 'goregrind' 'gothic' 'grind' 'grindcore' 'groove'
 'grunge' 'hard' 'hardcore' 'heavy' 'industrial' 'jazz' 'mathcore'
 'medieval' 'melodic' 'metalcore' 'middle eastern' 'neoclassical'
 'neofolk' 'noise' 'noisecore' 'noisegrind' 'nu-metal' 'nwobhm' 'oi'
 'operatic' 'pagan' 'pop' 'post-black' 'post-hardcore' 'post-metal'
 'post-punk' 'post-rock' 'power' 'powerviolence' 'progressive'
 'psychedelic' 'punk' 'rac' 'raw' 'shoegaze' 'shred' 'slam' 'sludge'
 'southern' 'speed' 'stoner' 'symphonic' 'synth' 'technical' 'thrash'
 'viking']


In [12]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=30, n_init=50)
kmeans.fit(X)
kmeans.labels_

array([27, 21, 15, ..., 22,  3,  3], dtype=int32)

In [13]:
groups = bands.loc[:].sort_values('date').groupby(kmeans.labels_)

In [14]:
groups.size()

0     23288
1      7412
2     11446
3     11512
4      4911
5      3632
6     14401
7      6582
8      1622
9      1824
10     4178
11      754
12     1316
13     1790
14     3526
15     2244
16     2583
17      596
18     2145
19     1592
20     2198
21     4483
22     1539
23     2738
24     2486
25     3434
26     1503
27     1306
28     1551
29      879
dtype: int64

In [15]:
groups['name'].apply(list)

0     [Ash, Sir Lord Baltimore, Budgie, Cacumen, Ble...
1     [Sapphire, Motörhead, Bad Axe, Smash Hits, Men...
2     [Blaze, Riot V, Lezlie Paice, Oil, Paralex, Gu...
3     [Cool Feet, Rainbow, Release, Līvi, Diamond He...
4     [Back Out, Bang, Blind Owl, Annex, Vardis, Def...
5     [Flower Travellin' Band, Judas Priest, Pentagr...
6     [Thin Lizzy, Stone Axe, Cirith Ungol, Bedemon,...
7     [Murasaki, Frankenstein, Yargos, Urchin, Chrys...
8     [Rush, Booster, High Tension, Zero Hora, Groun...
9     [Jutta Weinhold, Holocaust, Lips, Ice Water Ma...
10    [Limelight, Deep Purple, Spike, Raven, Quiet R...
11    [The Handsome Beasts, Medusa, Straw Dogs, Rams...
12    [Patrulha do Espaço, Publikförakt A.B., Tritón...
13    [Night Sun, Beowulf, Accept, Mythra, Quasar, H...
14    [Black Axe, Raff, Nightime Flyer, Network, Abr...
15    [Scorpions, Toad the Wet Sprocket, Titan, Sist...
16    [Blast, Pegasus, Still Earth, Victim, Iron Rag...
17    [Krokus, 99%, Exciter, OHL, Strattson, Max

In [16]:
kmeans.cluster_centers_

array([[6.44108554e-04, 4.29405703e-05, 4.58175885e-02, ...,
        3.00583992e-04, 0.00000000e+00, 5.36757128e-03],
       [0.00000000e+00, 9.44414463e-04, 0.00000000e+00, ...,
        1.67296276e-02, 1.00000000e+00, 4.04749056e-04],
       [0.00000000e+00, 2.27153591e-03, 8.73667657e-05, ...,
        1.31050149e-02, 1.00000000e+00, 2.62100297e-04],
       ...,
       [7.65696784e-04, 1.53139357e-03, 3.06278714e-03, ...,
        1.19448698e-01, 8.42266462e-03, 0.00000000e+00],
       [0.00000000e+00, 1.28949065e-03, 0.00000000e+00, ...,
        1.28949065e-03, 9.50999355e-01, 0.00000000e+00],
       [0.00000000e+00, 1.13765643e-03, 1.13765643e-03, ...,
        6.82593857e-03, 4.89192264e-02, 2.27531286e-03]])

In [17]:
top_pos = np.argsort(kmeans.cluster_centers_)[:, -1 : -6 : -1]
vectorizer.get_feature_names_out()[top_pos]

array([['black', 'raw', 'ambient', 'melodic', 'depressive'],
       ['death', 'thrash', 'melodic', 'black', 'groove'],
       ['thrash', 'speed', 'power', 'progressive', 'metalcore'],
       ['heavy', 'melodic', 'progressive', 'speed', 'doom'],
       ['progressive', 'power', 'melodic', 'symphonic', 'gothic'],
       ['doom', 'stoner', 'psychedelic', 'drone', 'post-metal'],
       ['death', 'technical', 'groove', 'blackened', 'deathcore'],
       ['melodic', 'death', 'metalcore', 'progressive', 'doom'],
       ['symphonic', 'power', 'gothic', 'black', 'melodic'],
       ['gothic', 'doom', 'melodic', 'atmospheric', 'folk'],
       ['brutal', 'death', 'grindcore', 'slam', 'technical'],
       ['speed', 'power', 'thrash', 'melodic', 'heavy'],
       ['folk', 'black', 'pagan', 'viking', 'symphonic'],
       ['thrash', 'groove', 'heavy', 'metalcore', 'hardcore'],
       ['grindcore', 'death', 'black', 'crust', 'technical'],
       ['heavy', 'thrash', 'power', 'death', 'speed'],
       ['har

In [18]:
top = np.sort(kmeans.cluster_centers_)[:, -1 : -6 : -1]
np.round(top, 2)

array([[1.  , 0.05, 0.05, 0.04, 0.04],
       [1.  , 1.  , 0.1 , 0.07, 0.05],
       [1.  , 0.06, 0.05, 0.04, 0.03],
       [1.  , 0.07, 0.05, 0.05, 0.03],
       [1.  , 0.17, 0.04, 0.04, 0.03],
       [0.78, 0.57, 0.07, 0.05, 0.03],
       [1.  , 0.06, 0.04, 0.03, 0.02],
       [1.  , 1.  , 0.19, 0.05, 0.05],
       [1.  , 0.33, 0.23, 0.04, 0.04],
       [1.  , 0.36, 0.06, 0.03, 0.03],
       [1.  , 1.  , 0.16, 0.09, 0.07],
       [1.  , 0.62, 0.09, 0.09, 0.09],
       [1.  , 0.39, 0.13, 0.09, 0.07],
       [1.  , 1.  , 0.06, 0.06, 0.03],
       [1.  , 0.78, 0.04, 0.03, 0.02],
       [1.  , 1.  , 0.05, 0.04, 0.03],
       [1.  , 0.99, 0.12, 0.03, 0.03],
       [1.  , 1.  , 0.17, 0.16, 0.09],
       [1.  , 1.  , 0.1 , 0.06, 0.01],
       [1.  , 0.3 , 0.05, 0.05, 0.03],
       [1.  , 1.  , 0.11, 0.1 , 0.06],
       [1.  , 1.  , 0.04, 0.03, 0.01],
       [1.  , 0.92, 0.11, 0.07, 0.06],
       [1.  , 0.51, 0.23, 0.1 , 0.05],
       [1.  , 0.19, 0.03, 0.03, 0.02],
       [0.37, 0.21, 0.12,