In [1]:
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

def get_random_user_agent():
    """Return a random user-agent string for Chrome on Windows or Linux.

    A fresh ``UserAgent`` rotator is built on every call, restricted to the
    Chrome software name and the Windows/Linux operating systems, with
    ``limit=100`` controlling how many user agents the rotator is asked for.

    Additional filters (SoftwareEngine, HardwareType, SoftwareType,
    Popularity) can also be imported from ``random_user_agent.params``.
    """
    rotator = UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    )
    return rotator.get_random_user_agent()

In [14]:
from requests_cache import CachedSession
from datetime import timedelta
from lxml import html
import re
import time

class MetalArchives:
    """Scraping client for the metal-archives.com search and lyrics endpoints.

    All state lives on the class: every instance shares one ``CachedSession``
    that keeps responses in memory for one hour. Each request is sent with a
    freshly picked random User-Agent header and is retried up to
    ``max_attempts`` times before a ``RuntimeError`` is raised.
    """

    # Shared HTTP session; responses are cached in memory for one hour.
    session = CachedSession(backend='memory',
                            expire_after=timedelta(hours=1))
    site_url = 'https://www.metal-archives.com/'
    url_search_songs = 'search/ajax-advanced/searching/songs?'
    url_search_bands = 'search/ajax-advanced/searching/bands?'
    url_lyrics = 'release/ajax-view-lyrics/id/'
    lyrics_not_available = '(lyrics not available)'
    # Pulls the numeric id out of the last column of a song search row.
    lyric_id_re = re.compile(r'id=.+[a-z]+.(?P<id>\d+)')
    band_name_re = re.compile(r'title="(?P<name>.*)\"')
    # Matches any HTML tag; used to reduce the lyrics response to plain text.
    tags_re = re.compile(r'<[^>]+>')
    genres = ["black", "death", "doom", "stoner", "sludge", "electronic",
              "industrial", "experimental", "avant-garde", "folk", "viking",
              "pagan", "gothic", "grindcore", "groove", "heavy", "metalcore",
              "deathcore", "power", "progressive", "speed", "symphonic",
              "thrash"]
    # Retry policy shared by every request helper below.
    max_attempts = 10
    # Seconds slept before each attempt (including the first one).
    request_delay = 3

    @classmethod
    def _fetch(cls, url, params=None, extract=None, delay=None):
        """GET ``url`` with retries and return the (optionally extracted) result.

        Args:
            url: Absolute URL to request.
            params: Optional query parameters forwarded to ``session.get``.
            extract: Optional callable applied to the response inside the
                retry loop, so that a failure while decoding the body (for
                example invalid JSON) also triggers a retry.
            delay: Seconds to sleep before every attempt; defaults to
                ``request_delay``. Pass ``0`` to disable sleeping.

        Returns:
            The response object, or ``extract(response)`` when given.

        Raises:
            RuntimeError: If every attempt failed; the last error is chained.
        """
        if delay is None:
            delay = cls.request_delay

        last_error = None
        for attempt in range(cls.max_attempts):
            if delay:
                time.sleep(delay)
            try:
                response = cls.session.get(
                    url, params=params,
                    headers={'User-Agent': get_random_user_agent()})
                return response if extract is None else extract(response)
            # ``Exception`` rather than a bare ``except`` so that interrupting
            # the kernel (KeyboardInterrupt) actually stops a long scrape.
            except Exception as error:
                last_error = error
                print('Error, retrying...', attempt, '                    ', end='\r')

        # Fail loudly instead of falling through with an unbound result.
        raise RuntimeError(
            'Request to %s failed after %d attempts' % (url, cls.max_attempts)
        ) from last_error

    def _search_rows(self, url, params, label):
        """Yield every raw ``aaData`` row of a paginated search endpoint.

        The total is read from ``iTotalRecords`` of a first request; pages are
        then requested by moving ``iDisplayStart`` forward by the number of
        rows received. ``label`` prefixes the progress line printed per page.
        """
        params = dict(params)  # do not mutate the caller's dict
        total = self._fetch(
            url, params=params,
            extract=lambda response: response.json()['iTotalRecords'])

        index = 0
        while index < total:
            params['iDisplayStart'] = index
            page = self._fetch(
                url, params=params,
                extract=lambda response: response.json()['aaData'])
            if not page:
                # The server returned fewer rows than it announced; stop
                # instead of requesting the same offset forever.
                break
            for row in page:
                yield row
            # Only advance once the whole page has been handed out, so a
            # failure can never skip part of a page.
            index += len(page)
            print(label, index, '/', total, '-', int(index / total * 100) , '%          ', end='\r')

    @staticmethod
    def _normalize_band_field(key, value):
        """Collapse one raw XPath result list into the stored field value.

        A one-element list becomes its element, an empty list or the string
        ``'N/A'`` becomes ``None``. ``years`` is flattened into one string and
        ``theme`` is split on commas into a list.
        """
        if isinstance(value, list):
            if len(value) == 1:
                value = value[0]
            elif len(value) == 0:
                value = None
        if isinstance(value, str) and value == 'N/A':
            value = None
        if value is None:
            return None

        if key == 'years':
            # Works for both shapes: for a list each text fragment is stripped
            # and joined; for a single string the iteration is per character,
            # which removes every whitespace character from it.
            return ''.join(part.strip().replace('\n', '') for part in value)
        if key == 'theme':
            if isinstance(value, list):
                # Several text nodes: glue them together before splitting
                # (a list has no ``split`` method).
                value = ''.join(value)
            return value.split(",")
        return value

    @staticmethod
    def get_band_data(url):
        """Download a band page and return its header fields as a dict.

        Args:
            url: URL of the band page.

        Returns:
            Dict with the keys ``name``, ``url``, ``genre``, ``theme``,
            ``label``, ``country``, ``location``, ``status``, ``date`` and
            ``years`` (in that order). Missing or ``'N/A'`` values are
            ``None``; ``theme`` is a list of comma-separated parts.

        Raises:
            RuntimeError: If the page could not be fetched after all retries.
        """
        response = MetalArchives._fetch(url)
        tree = html.fromstring(response.content)

        raw = {
            "name": tree.xpath('//*[@id="band_info"]/h1/a/text()'),
            "url": tree.xpath('//*[@id="band_info"]/h1/a/@href'),
            "genre": tree.xpath(".//*[@id='band_stats']/dl[2]/dd[1]/text()"),
            "theme": tree.xpath(".//*[@id='band_stats']/dl[2]/dd[2]/text()"),
            "label": tree.xpath(".//*[@id='band_stats']/dl[2]/dd[3]/text()"),
            "country": tree.xpath(".//*[@id='band_stats']/dl[1]/dd[1]/a/text()"),
            "location": tree.xpath(".//*[@id='band_stats']/dl[1]/dd[2]/text()"),
            "status": tree.xpath(".//*[@id='band_stats']/dl[1]/dd[3]/text()"),
            "date": tree.xpath(".//*[@id='band_stats']/dl[1]/dd[4]/text()"),
            "years": tree.xpath(".//*[@id='band_stats']/dl[3]/dd/text()"),
        }
        return {key: MetalArchives._normalize_band_field(key, value)
                for key, value in raw.items()}

    def search_song(self, song_title="", band_name="", album_type="any",
                    excluded_album_types=None):
        """Search songs and return one dict per matching row.

        Args:
            song_title: Value sent as the ``songTitle`` query parameter.
            band_name: Value sent as the ``bandName`` query parameter.
            album_type: Keep only rows with this album type; ``"any"``
                disables the filter.
            excluded_album_types: Album types to drop; ``None`` means none.

        Returns:
            List of dicts with the keys ``album_url``, ``band_name``,
            ``album_name``, ``album_type``, ``song_name`` and ``song_id``.
            ``song_id`` is ``None`` when no id can be found in the row.

        Raises:
            RuntimeError: If a request still fails after all retries.
        """
        excluded_album_types = excluded_album_types or []
        params = dict(bandName=band_name, songTitle=song_title, iDisplayStart=0)
        url = self.site_url + self.url_search_songs

        result = list()
        for song in self._search_rows(url, params, "Song: "):
            if album_type != "any" and song[2] != album_type:
                continue
            if song[2] in excluded_album_types:
                continue

            band_link, album_link = song[0], song[1]
            id_match = self.lyric_id_re.search(song[4])
            result.append({
                # NOTE(review): this href is taken from the same column as
                # the band name, so it is presumably the band URL rather than
                # the album URL - left unchanged to keep existing exports
                # consistent; confirm against the endpoint.
                "album_url": band_link[
                             band_link.find('href="') + 6:band_link.find(
                                 '" title=')],
                "band_name": band_link[
                             band_link.find('>') + 1:band_link.find('</a')],
                "album_name": album_link[
                              album_link.find('">') + 2:album_link.find('</a')],
                "album_type": song[2],
                "song_name": song[3],
                "song_id": id_match.group("id") if id_match else None})
        return result

    def search_band(self, band_name="", genre=""):
        """Search bands and return one dict per result row.

        Args:
            band_name: Value sent as the ``bandName`` query parameter.
            genre: Value sent as the ``genre`` query parameter.

        Returns:
            List of dicts with the keys ``url``, ``name``, ``genre`` and
            ``country``.

        Raises:
            RuntimeError: If a request still fails after all retries.
        """
        params = dict(bandName=band_name, genre=genre, iDisplayStart=0)
        url = self.site_url + self.url_search_bands

        result = list()
        for band in self._search_rows(url, params, "Band: "):
            link = band[0]
            result.append({
                "url": link[link.find('href="') + 6:link.find('">')],
                "name": link[link.find('">') + 2:link.find('</a>')],
                "genre": band[1],
                "country": band[2]})
        return result

    def get_lyrics_by_song_id(self, song_id):
        """Fetch the lyrics of one song and return them with HTML tags removed.

        Args:
            song_id: Song id as a string or an integer (ids read back from a
                CSV file are integers).

        Returns:
            The stripped response text with every HTML tag deleted.

        Raises:
            RuntimeError: If the request still fails after all retries.
        """
        url = self.site_url + self.url_lyrics + str(song_id)
        # No delay here: the lyrics request was never rate limited.
        data = self._fetch(url, delay=0)
        return self.tags_re.sub('', data.text.strip())
    
# Shared client instance referenced by the scraping cells of this notebook.
ma = MetalArchives()

In [5]:
import pandas as pd

In [None]:
# Run the band search with its default (empty) name and genre filters,
# load the returned records into a DataFrame and save it as ma_bands.csv.
bands_list = ma.search_band()
bands = pd.DataFrame(bands_list)
bands.to_csv('ma_bands.csv')

In [7]:
# Reload the saved band list (first CSV column is the index) and display it.
# The chunk helpers defined later read this global `bands` frame.
bands = pd.read_csv('ma_bands.csv', index_col=0)
bands

Unnamed: 0,url,name,genre,country
0,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Progressive Death Metal/Grindcore,Czechia
1,https://www.metal-archives.com/bands/%21%C3%BA...,!úl..,Death/Black Metal,Czechia
2,https://www.metal-archives.com/bands/%24Greed%...,$Greed$,Heavy/Thrash Metal,United States
3,https://www.metal-archives.com/bands/%24ilverd...,$ilverdollar,Heavy/Power Metal,Sweden
4,https://www.metal-archives.com/bands/%24lamboy...,$lamboy$,Death Metal/Grindcore (early); Slam/Brutal Dea...,United States
...,...,...,...,...
130767,https://www.metal-archives.com/bands/%EC%A3%BC...,주작,Heavy Metal,"Korea, South"
130768,https://www.metal-archives.com/bands/%EC%B5%9C...,최일민,"Heavy Metal/Rock, Shred","Korea, South"
130769,https://www.metal-archives.com/bands/%ED%8F%90...,폐허,Dark Ambient/Atmospheric Black Metal,"Korea, South"
130770,https://www.metal-archives.com/bands/%ED%94%BC...,피해의식,Heavy Metal,"Korea, South"


In [10]:
def get_band_data_chunk(chunk_num, chunk_id):
    """Scrape the band pages for one contiguous slice of the global ``bands``.

    The rows of ``bands`` are divided into ``chunk_num`` slices of
    ``len(bands) // chunk_num`` rows each; the last slice additionally takes
    the remainder. Rows are addressed by position, so the result does not
    depend on the frame's index labels.

    Args:
        chunk_num: Total number of chunks (at least 1).
        chunk_id: Zero-based chunk to process, ``0 <= chunk_id < chunk_num``.

    Returns:
        List with one ``ma.get_band_data`` result per row of the chunk.

    Raises:
        ValueError: If ``chunk_num`` or ``chunk_id`` is out of range. This is
            checked before any request is made so that a typo does not
            surface only after part of the chunk has been scraped.
    """
    if chunk_num < 1:
        raise ValueError('chunk_num must be at least 1, got %r' % (chunk_num,))
    if not 0 <= chunk_id < chunk_num:
        raise ValueError('chunk_id must be in [0, %d), got %r'
                         % (chunk_num, chunk_id))

    print('Chunk:', chunk_id, '/', chunk_num)

    total = bands.shape[0]
    chunk_size = total // chunk_num
    begin = chunk_size * chunk_id
    # The last chunk absorbs the rows left over by the integer division.
    end = total if chunk_id == chunk_num - 1 else begin + chunk_size

    urls = bands['url']
    data = list()
    for row in range(begin, end):
        print('Band:', row - begin, '/', end - begin, '-', int((row - begin) / (end - begin) * 100) , '%          ', end='\r')
        # `row` is a positional offset, hence .iloc rather than label lookup.
        data.append(ma.get_band_data(urls.iloc[row]))
    return data

In [15]:
# Ask which slice of the band list to scrape, fetch the band pages for that
# slice and write them to ma_bands_data_<chunk_id>.csv.
chunk_num = int(input('Number of chunks:'))
chunk_id = int(input('ID of current chunk:'))
data = pd.DataFrame(get_band_data_chunk(chunk_num, chunk_id))
data.to_csv('ma_bands_data_' + str(chunk_id) + '.csv')

Number of chunks:10000
ID of current chunk:0
Chunk: 0 / 10000
Band: 12 / 13 - 92 %          

In [16]:
# Load the band-data sample file (first CSV column is the index) and display it.
data_sample = pd.read_csv('ma_bands_data_sample.csv', index_col=0)
data_sample

Unnamed: 0,name,url,genre,theme,label,country,location,status,date,years
0,!T.O.O.H.!,https://www.metal-archives.com/bands/%21T.O.O....,Progressive Death Metal/Grindcore,"['Misanthropy', ' Violence', ' Rape', ' Deprav...",,Czechia,Prague,Active,1993,"1990-1993 (as),1993-2005,2011-2013,2017-present"
1,!úl..,https://www.metal-archives.com/bands/%21%C3%BA...,Death/Black Metal,"['Destiny', ' Emotions', ' Life']",,Czechia,Prague,Split-up,2002,2002-2011
2,$Greed$,https://www.metal-archives.com/bands/%24Greed%...,Heavy/Thrash Metal,"['Politics', ' Humanity']",Unsigned/independent,United States,"Los Angeles, California",On hold,1992,"1992-1994 (as),1994-1996 (as),1999-?"
3,$ilverdollar,https://www.metal-archives.com/bands/%24ilverd...,Heavy/Power Metal,"['Occult', ' Fantasy', ' Human issues']",,Sweden,"Nyköping, Södermanland",Active,1996,1996-present
4,$lamboy$,https://www.metal-archives.com/bands/%24lamboy...,Death Metal/Grindcore (early); Slam/Brutal Dea...,"['Memes', ' Nonsense']",Unsigned/independent,United States,"Cary, Illinois",Active,2016,"2016-2017,2017-present"
5,$lutrot,https://www.metal-archives.com/bands/%24lutrot...,Slam/Brutal Death Metal,['Gore'],,United States,Nevada / North Carolina / Texas,Split-up,2013,"2013-2015,2017-2019"
6,$uicide $olution,https://www.metal-archives.com/bands/%24uicide...,Heavy Metal,"['Darkness', ' Death', ' Horror']",,Singapore,Singapore,On hold,1998,"1998-2006,2008-2014"
7,'. . . [l]ight am I' . . .,https://www.metal-archives.com/bands/%27._._._...,Black Metal,"['Conquering', ' Destroying', ' Possessing', '...",,Canada,"Cartier/Kitchener, Ontario",Unknown,1996,1996-?
8,'Ain,https://www.metal-archives.com/bands/%27Ain/35...,Symphonic Metal,"['Poetry', ' Emotions', ' Solitude']",Unsigned/independent,Mexico,Mexico City,Active,2010,2010-present
9,'Iisnááhí,https://www.metal-archives.com/bands/%27Iisn%C...,Black Metal,"['Navajo/Native American mythology', "" Diné Ba...",Unsigned/independent,United States,"Albuquerque, New Mexico",Active,2018,2018-present


In [None]:
def get_song_chunk(chunk_num, chunk_id):
    """Search the songs of every band in one contiguous slice of ``bands``.

    The rows of the global ``bands`` frame are divided into ``chunk_num``
    slices of ``len(bands) // chunk_num`` rows each; the last slice
    additionally takes the remainder. Rows are addressed by position, so the
    result does not depend on the frame's index labels.

    Args:
        chunk_num: Total number of chunks (at least 1).
        chunk_id: Zero-based chunk to process, ``0 <= chunk_id < chunk_num``.

    Returns:
        Flat list of the ``ma.search_song`` results of all bands in the chunk.

    Raises:
        ValueError: If ``chunk_num`` or ``chunk_id`` is out of range. This is
            checked before any request is made so that a typo does not
            surface only after part of the chunk has been scraped.
    """
    if chunk_num < 1:
        raise ValueError('chunk_num must be at least 1, got %r' % (chunk_num,))
    if not 0 <= chunk_id < chunk_num:
        raise ValueError('chunk_id must be in [0, %d), got %r'
                         % (chunk_num, chunk_id))

    print('Chunk:', chunk_id, '/', chunk_num)

    total = bands.shape[0]
    chunk_size = total // chunk_num
    begin = chunk_size * chunk_id
    # The last chunk absorbs the rows left over by the integer division.
    end = total if chunk_id == chunk_num - 1 else begin + chunk_size

    names = bands['name']
    songs = list()
    for row in range(begin, end):
        print('Band:', row - begin, '/', end - begin, '-', int((row - begin) / (end - begin) * 100) , '%          ', end='\r')
        # `row` is a positional offset, hence .iloc rather than label lookup.
        songs += ma.search_song(band_name=names.iloc[row])
    return songs

In [None]:
# Ask which slice of the band list to process, search the songs of every band
# in that slice and write them to ma_songs_<chunk_id>.csv.
chunk_num = int(input('Number of chunks:'))
chunk_id = int(input('ID of current chunk:'))
songs = pd.DataFrame(get_song_chunk(chunk_num, chunk_id))
songs.to_csv('ma_songs_' + str(chunk_id) + '.csv')

In [6]:
# Load the song sample file (first CSV column is the index) and display it.
songs_sample = pd.read_csv('ma_songs_sample.csv', index_col=0)
songs_sample

Unnamed: 0,album_url,band_name,album_name,album_type,song_name,song_id
0,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Democratic Solution,Full-length,Aura & Ziata (new version),2667201
1,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Democratic Solution,Full-length,Boubelovo životakončení,2667192
2,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Democratic Solution,Full-length,Demokratické řešení,2667195
3,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Democratic Solution,Full-length,Instrumental,2667203
4,https://www.metal-archives.com/bands/%21T.O.O....,!T.O.O.H.!,Democratic Solution,Full-length,Kokarda pýchy,2667194
...,...,...,...,...,...,...
816,https://www.metal-archives.com/bands/.../35402...,...,Suffering Existence,Demo,I.S.K,1442252
817,https://www.metal-archives.com/bands/.../35402...,...,Suffering Existence,Demo,Leave This Mortals Remains,1442251
818,https://www.metal-archives.com/bands/.../35402...,...,The Path Toward Forgetfulness,Split,Ascending to the Night Sky,2281629
819,https://www.metal-archives.com/bands/.../35402...,...,The Path Toward Forgetfulness,Split,Like Shooting Stars,2281630
