# Scraping various critics' lists of the best albums of the 2010s

___

## Scrape for links to critics' lists

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Summary page that links to every critic's "best of the 2010s" list on albumoftheyear.org.
url = "https://www.albumoftheyear.org/list/summary/2010s/"
r = requests.get(url)
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed; the list pages below are parsed with "html.parser" too.
soup = BeautifulSoup(r.content, "html.parser")

links = []
# Captures the text that precedes a " (20..." or " [20..." suffix in the link label.
p = re.compile('(.*) [([]20.*')
for link in soup.find('div', string="Recent Additions").find_parent().find_all('a'):
    # Separate name so the summary-page `url` above is not overwritten inside the loop.
    list_url = f"https://www.albumoftheyear.org{link.get('href')}"
    label = link.string
    # Fall back to the raw label when it is missing or carries no "20.." suffix,
    # instead of swallowing every possible exception.
    m = p.match(label) if label is not None else None
    source_name = m[1] if m else label
    links.append([source_name, list_url])
# NOTE(review): drops the 40th scraped link purely by position; presumably an unwanted
# entry. The index is tied to the page content at scrape time -- confirm before re-running.
del links[39]
df = pd.DataFrame(links, columns=['source', 'link'])
df.head()

Unnamed: 0,source,link
0,Sputnikmusic,https://www.albumoftheyear.org/list/1490-sputn...
1,SPIN,https://www.albumoftheyear.org/list/1480-spins...
2,Resident Advisor,https://www.albumoftheyear.org/list/1461-resid...
3,Revolver,https://www.albumoftheyear.org/list/1460-revol...
4,Mixmag,https://www.albumoftheyear.org/list/1459-mixma...


In [5]:
# Metacritic's round-up of decade lists; used to find critics that the
# albumoftheyear.org summary (held in `df`) does not already cover.
url = 'https://www.metacritic.com/feature/best-albums-of-the-decade-2010s'
# A browser-like User-Agent is sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "html.parser")

links = []
# Captures the text in front of the first "View" in each critic header cell.
p = re.compile('(.*?)View.*')
# Build the lookup once instead of converting the column to a list on every iteration.
known_sources = set(df.source)
for link in soup.find_all('th', {'class': 'criticname'}):
    source = p.match(link.text)[1].strip()
    if source not in known_sources:
        links.append([source, link.a.get('href')])
# Stored as `df_2`: the site-specific scraping cells read `df_2.link[...]`, and `df`
# must keep the albumoftheyear.org links that `parse_aoty` is applied to.
df_2 = pd.DataFrame(links, columns=['source', 'link'])

## Scrape for rankings in each list

In [2]:
def parse_aoty(url, source):
    """Scrape every page of an albumoftheyear.org list into a list of rows.

    Parameters
    ----------
    url : str
        URL of the first page of the list.
    source : str
        Publication name; stored unchanged as the first field of every row.

    Returns
    -------
    list of list
        One ``[source, rank, artist, album, date, link, blurb]`` entry per
        ``albumListRow`` element, in page order. Pages are followed through the
        "next" control until a page has none.
    """
    p_blurb = re.compile('(.*)Full')
    p_name = re.compile("(.*) - (.*)")
    ranks = []
    page_url = url
    while True:
        r = requests.get(page_url)
        soup = BeautifulSoup(r.content, "html.parser")
        rows = soup.find_all('div', {'class': 'albumListRow'})
        for item in rows:
            position = item.find('span', {'itemprop': 'position'})
            # Rows without a position element fall back to the number of rows on this page.
            rank = position.text if position is not None else len(rows)

            title_link = item.find('a', {'itemprop': 'url'})
            # Greedy first group: the text is split at its last " - ".
            name = p_name.match(title_link.text)
            artist = name[1].strip()
            album = name[2].strip()
            date = item.find('div', {'class': 'albumListDate'}).text
            link = title_link.get('href')

            # Keep the blurb text that precedes "Full"; empty when there is no
            # blurb element or the text does not contain "Full".
            blurb_div = item.find('div', {'class': 'albumListBlurb'})
            blurb_match = p_blurb.match(blurb_div.text) if blurb_div is not None else None
            blurb = blurb_match[1] if blurb_match else ''

            ranks.append([source, rank, artist, album, date, link, blurb])

        next_page = soup.find('div', {'class': 'pageSelect next'})
        if next_page is None:
            break
        # The "next" div sits inside the anchor that carries the relative href.
        page_url = 'https://www.albumoftheyear.org' + next_page.parent.get('href')
    return ranks

In [3]:
ranks = []
# Scrape each list referenced in `df`: every row of `df` becomes one DataFrame of
# rankings, so `parse_result` is a Series of DataFrames.
parse_result = df.apply(
    lambda row: pd.DataFrame(
        parse_aoty(row['link'], row['source']),
        columns=['source', 'rank', 'artist', 'album', 'date', 'link', 'blurb'],
    ),
    axis=1,
)

In [4]:
# Stack the per-list frames into one table with a fresh 0..n-1 index.
df_rank_0 = pd.concat(parse_result.tolist(), ignore_index=True)

Each site has a different layout, so each one needs its own scraping code. Some examples follow:

In [6]:
import numpy as np  # not needed by this cell; retained in case other cells rely on `np`
r = requests.get(df_2.link[0])
soup = BeautifulSoup(r.content, "html.parser")

# Query the paragraphs once rather than re-scanning the document for every index.
paragraphs = soup.find_all('p')
# Blurbs are taken from the odd-numbered paragraphs 1..37 plus paragraph 41.
blurbs = [paragraphs[i].text for i in range(1, 39, 2)]
blurbs.append(paragraphs[41].text)

# "<rank> <artist> – <album>" (en dash separator), read from every <h1> after the first.
p = re.compile("([0-9]*?) (.*?) – (.*)")
ranking, artist, album = [], [], []
for heading in soup.find_all('h1')[1:]:
    match = p.match(heading.text)
    ranking.append(match[1])
    artist.append(match[2].strip())
    album.append(match[3].strip())

df_all_things_aloud = pd.DataFrame({'rank': ranking, 'artist': artist,
                                    'album': album, 'blurb': blurbs})
df_all_things_aloud['source'] = "All Things Aloud"
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank_0, df_all_things_aloud], ignore_index=True)

In [7]:
r = requests.get(df_2.link[1])
soup = BeautifulSoup(r.content, "html.parser")
# "<rank>. <artist> –/- <album> (201x)<blurb>"; raw string so that `\(` is not an
# invalid string escape.
p = re.compile(r"([0-9]*). (.*) [–-] (.*) \((201[0-9])\)(.*)")
# Query the paragraphs once; entries are read from every second paragraph, 3..61.
paragraphs = soup.find_all('p')
items = []
for i in range(3, 62, 2):
    m = p.match(paragraphs[i].text)
    items.append([m[1], m[2], m[3], m[4], m[5]])
df_bills = pd.DataFrame(items, columns=['rank', 'artist', 'album', 'date', 'blurb'])
df_bills['source'] = "Bill’s Indie Basement"
# Any cell that is exactly 'S/T' becomes 'Purple Mountains'
# (presumably a self-titled album -- confirm).
df_bills = df_bills.replace({'S/T': 'Purple Mountains'})
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank, df_bills], ignore_index=True)

In [8]:
r = requests.get(df_2.link[2])
soup = BeautifulSoup(r.content, "html.parser")
# “<album>,” <artist>, 201x — <blurb>
p = re.compile("“(.*),” (.*), (201[0-9]) — (.*)")
items = []
# The entries are read from paragraphs 4..13 of the article.
for paragraph in soup.find_all('p')[4:14]:
    m = p.match(paragraph.text)
    items.append([m[1], m[2], m[3], m[4]])
df_boston = pd.DataFrame(items, columns=['album', 'artist', 'date', 'blurb'])
df_boston['source'] = "Boston Herald"
# No rank is parsed for this source; every entry is given rank 10.
df_boston['rank'] = 10
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank, df_boston], ignore_index=True)

In [10]:
r = requests.get(df_2.link[14])
soup = BeautifulSoup(r.content, "html.parser")
# "<rank>. <artist> – <album>" (en dash separator)
p = re.compile("([0-9]*). (.*) – (.*)")
items = []
# The entries are read from paragraphs 10..19 of the article.
for paragraph in soup.find_all('p')[10:20]:
    m = p.match(paragraph.text)
    items.append([m[1], m[2], m[3]])
df_national = pd.DataFrame(items, columns=['rank', 'artist', 'album'])
# Replace the shortened album title with the full one.
df_national = df_national.replace(
    {"My Beautiful Dark Fantasy": 'My Beautiful Dark Twisted Fantasy'})
df_national['source'] = "National Post"
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank, df_national], ignore_index=True)

In [12]:
r = requests.get(df_2.link[16])
soup = BeautifulSoup(r.content, "html.parser")
# '<rank>. “<album>” <artist> (201x)'; raw string so that `\(` is not an invalid
# string escape.
p = re.compile(r"([0-9]*). “(.*)” (.*) \((201[0-9])\)")
items = []
# The first entry is read from the first <strong> element; the rest come from the
# <h4> headings after the first one.
lorde = p.match(soup.find('strong').text)
# Non-empty paragraphs between the 6th and the last; the 4th of those is discarded.
blurbs = [para.text for para in soup.find_all('p')[5:-1] if para.text != '']
del blurbs[3]
# `[:-1]` drops the last character of the quoted title
# (presumably trailing punctuation inside the quotes -- confirm).
items.append([lorde[1], lorde[2][:-1], lorde[3], lorde[4]])
for heading in soup.find_all('h4')[1:]:
    m = p.match(heading.text)
    items.append([m[1], m[2][:-1], m[3], m[4]])
df_mercury = pd.DataFrame(items, columns=['rank', 'album', 'artist', 'date'])
df_mercury['source'] = "Mercury News"
df_mercury['blurb'] = blurbs
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank, df_mercury], ignore_index=True)

In [13]:
import unicodedata
# The last link in `df_2` is scraped here.
r = requests.get(df_2.link.to_list()[-1])
soup = BeautifulSoup(r.content, "html.parser")
# "<rank>. <artist> –/— <album> –/— ... 201x"
p = re.compile("([0-9]+)[.] (.+?) [–—] (.+?) [–—] .+?(201[0-9])")
items = []
# Skip the first 13 and the last 9 <h3> headings; the rest are treated as entries.
for heading in soup.find_all('h3')[13:-9]:
    # NFKC-normalise the heading text before matching.
    m = p.match(unicodedata.normalize('NFKC', heading.text))
    # The blurb is the first <p> that follows the heading.
    items.append([m[1], m[2], m[3], m[4], heading.find_next('p').text])

df_wxpn = pd.DataFrame(items, columns=['rank', 'artist', 'album', 'date', 'blurb'])
df_wxpn['source'] = "WXPN/The Key"
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
df_rank = pd.concat([df_rank, df_wxpn], ignore_index=True)

## Data cleaning

1. Some artist/album names are stylized differently across different sites, hence the need to standardize them.

In [44]:
# Assign the result back instead of calling replace(inplace=True) on the selected
# column: the chained in-place form does not update `df_rank` under pandas
# Copy-on-Write. Album cells equal to '2017' become '2012 - 2017'.
df_rank['album'] = df_rank['album'].replace({'2017': '2012 - 2017'})
# Exact, whole-cell replacements applied to every column of `df_rank`
# (several keys are artist names rather than album titles).
replace_dict = {'22, a Million': '22, A Million', 'A Crow Looked At Me': 'A Crow Looked at Me',
                'Anti': "ANTI", 'Emotion': 'E•MO•TION', 
                'When We All Fall Asleep, Where Do We Go?': 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?',
                'A I A : Alien Observer': 'A I A', 'A I A : Dream Loss': 'A I A',
                'Born To Die: Paradise Edition': 'Born to Die', 'EL MAL QUERER': 'El Mal Querer', 
                'Idles': 'IDLES', 'Rosalía': 'ROSALÍA', 'Channel Orange': 'channel ORANGE',
                '...Like Clockwork': '…Like Clockwork', 'Against All Logic - 2012': 'Against All Logic', 
                'We Got it From Here…Thank U 4 Your Service': 'We got it from Here... Thank You 4 Your service',
                'In Love With Oblivion': 'In Love with Oblivion', 'mbv': 'm b v', 'Beyonce': 'Beyoncé',
                'House of Balloons': 'House of Balloons / Trilogy', 'Trilogy': 'House of Balloons / Trilogy'}
df_rank = df_rank.replace(replace_dict)
# Ranks scraped as text are converted to integers for scoring.
df_rank["rank"] = df_rank["rank"].astype(int)

2. Some information is present on certain sites but missing from others, so we need to fill in the missing values using other rows with the same artist/album.

In [47]:
def _most_common_known(values):
    """Return the most frequent non-missing value, or NaN if every value is missing."""
    # value_counts() leaves NaN out by default, so a missing value can never win
    # the vote over a real one.
    counts = values.value_counts()
    return counts.index[0] if len(counts) else float('nan')

# For every (artist, album) pair, pick the release date reported most often.
info = df_rank[['artist', 'album','date']].groupby(['artist', 'album'], dropna=False)
info = info.agg(_most_common_known)
# {column name: {(artist, album): value}}
fill_dict = info.to_dict()
def replace(row):
    """Overwrite each column in `fill_dict` with the value chosen for the row's album."""
    key = (row['artist'], row['album'])
    for i in list(fill_dict):
        # Keep the row's own value if its key is absent from the lookup
        # (e.g. a key containing NaN, which does not compare equal to itself).
        row[i] = fill_dict[i].get(key, row[i])
    return row
df_rank = df_rank.apply(replace, axis=1)

## Aggregating data

In [30]:
def score(rank):
    """
    Convert an album's position on one list into points.

    Exact positions 1, 2 and 3 earn 10, 8 and 6 points. Any other rank earns
    5 points if it is at most 10, 3 if at most 25, 2 if at most 50, and 1 otherwise.
    """
    # Podium places are matched by equality only.
    for place, points in ((1, 10), (2, 8), (3, 6)):
        if rank == place:
            return points
    # Remaining ranks fall into the first band whose upper bound they do not exceed.
    for upper_bound, points in ((10, 5), (25, 3), (50, 2)):
        if rank <= upper_bound:
            return points
    return 1

In [73]:
df_rank['score'] = df_rank["rank"].apply(score)
group_keys = ['artist', 'album']

# Number of lists that put each album in each score bucket. The columns are pinned
# to every value `score` can return, in ascending order, so the 13 labels assigned
# below always line up even if a bucket never occurs.
count = (df_rank.groupby(group_keys + ['score'], dropna=False)
                .size().unstack().fillna(0).astype(int)
                .reindex(columns=[1, 2, 3, 5, 6, 8, 10], fill_value=0))
# Number of mentions and summed score per album.
total = df_rank.groupby(group_keys, dropna=False)['score'].agg(["count", "sum"])
# First non-missing release date and link recorded for each album.
meta = df_rank.groupby(group_keys, dropna=False)[['date', 'link']].first()

# NOTE(review): `album_list` is assembled here as buckets + totals + date/link,
# which is what the column labels below describe -- confirm this is the intended table.
album_list = count.join(total).join(meta).reset_index()
album_list.columns = ['Artist', 'Album', '# Other', '# Top 50', '# Top 25', 
                    '# Top 10', '# 3rd Place', '# 2nd Place', '# 1st Place', 
                    'Total # of mentions', 'Total score', 'Released date', 'Link']
# Highest total score first; the remaining keys only break ties.
album_list = album_list.sort_values(by=["Total score", "Total # of mentions", 
                                        '# 1st Place', '# 2nd Place', '# 3rd Place',
                                        '# Top 10', '# Top 25', 
                                        '# Top 50', '# Other', 'Artist', 'Album'], 
                                    ascending=False)
album_list = album_list.reset_index(drop=True)
album_list

Unnamed: 0,Artist,Album,# Other,# Top 50,# Top 25,# Top 10,# 3rd Place,# 2nd Place,# 1st Place,Total # of mentions,Total score,Released date,Link
0,Kendrick Lamar,To Pimp a Butterfly,2,1,6,11,2,5,10,37,229,"March 16, 2015",https://www.albumoftheyear.org/album/29250-ken...
1,Kanye West,My Beautiful Dark Twisted Fantasy,1,1,7,13,2,4,5,33,183,"November 22, 2010",https://www.albumoftheyear.org/album/1998-kany...
2,Beyoncé,Lemonade,3,2,5,11,2,8,2,33,173,"April 23, 2016",https://www.albumoftheyear.org/album/54075-bey...
3,Frank Ocean,channel ORANGE,3,3,7,12,2,1,0,28,110,"July 17, 2012",https://www.albumoftheyear.org/album/3772-fran...
4,Kendrick Lamar,"good kid, m.A.A.d. city",3,5,5,7,2,1,2,25,103,"October 22, 2012",https://www.albumoftheyear.org/album/3840-kend...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,33EMYBW,Arthropods,1,0,0,0,0,0,0,1,1,"October 11, 2019",https://www.albumoftheyear.org/album/194341-33...
1269,21 Savage & Metro Boomin,Savage Mode,1,0,0,0,0,0,0,1,1,"July 14, 2016",https://www.albumoftheyear.org/album/57922-21-...
1270,18+,MIXTAP3,1,0,0,0,0,0,0,1,1,"October 25, 2013",https://www.albumoftheyear.org/album/12107-18-...
1271,100s,Ice Cold Perm,1,0,0,0,0,0,0,1,1,"September 5, 2012",https://www.albumoftheyear.org/album/190338-10...
