# Everynoise.com

*This version: 6 July 2018*
*Updated on 8 April 2019*

Comments: h.datta@tilburguniversity.edu

**Requires Python 3.x**

## Preamble
Loads the packages required for web scraping (they must already be installed). Run this cell before running any of the cells below.

In [12]:
# Setup and install necessary packages
import lxml
import selenium
import pandas
import bs4

# Load packages into memory
import urllib3
import datetime
from lxml import etree 
import time
import codecs
import io
from lxml.cssselect import CSSSelector
from lxml.etree import fromstring
import re
import os

# Function to create a new directory if it does not exist yet
def makedir(dirname):
    """Create the directory ``dirname`` if nothing exists at that path yet.

    Missing parent directories are created as well. If the path already
    exists, nothing is done.
    """
    if not os.path.exists(dirname):
        # exist_ok covers the case where the directory appears between the
        # existence check and the creation call.
        os.makedirs(dirname, exist_ok=True)

from bs4 import BeautifulSoup
import pandas
import urllib.request

In [13]:
# a few functions
def load_page(url):
    """Download ``url`` and return its body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header; when none is declared, UTF-8 is used.
    """
    # The context manager closes the connection even if reading or
    # decoding fails.
    with urllib.request.urlopen(url) as response:
        encoding = response.headers.get_content_charset() or 'UTF-8'
        content = response.read().decode(encoding)
    return(content)

# Collect all available dates

In [14]:
# fetch the listing page
page = load_page('http://everynoise.com/sorting_hat_closet/')

# parse the HTML; the text of every link directly under <body> is
# treated as one available date
tree = etree.fromstring(page, parser=etree.HTMLParser())
elements = tree.xpath('/html/body/a')
dates = [anchor.text for anchor in elements]
print("Found " + str(len(dates)) + " dates.")

HTTPError: HTTP Error 404: Not Found

In [None]:
# Download the release page of every collected date into the 'raw' folder
makedir('raw')

for date in dates:
    # a link without text yields None (or ''), which cannot form a URL
    if not date:
        continue
    url='http://everynoise.com/sorting_hat_closet/spotify_new_releases_'+ date+'.html'
    print('downloading ' + url + '...')
    content=load_page(url)
    # the context manager closes the file even if the write fails
    with io.open('raw/'+date+'.html','w',encoding='UTF-8') as f:
        f.write(content)
    print('...sleeping 10 sec...')
    # pause between consecutive requests
    time.sleep(10)
print('done.')

# Functions to parse content from release website

In [5]:
# extract release IDs
def get_albumid(item):
    """Return the ``albumid`` attribute of the album wrapper inside ``item``.

    The wrapper is the first ``div`` found in ``item`` whose class
    attribute is ``"albumwrapper play"``.
    """
    wrapper = item.find('div', attrs={"class" : "albumwrapper play"})
    return wrapper.get('albumid')

# extract all release IDs in a genre section
def get_albums_from_genre(genresection):
    """Return the album IDs of all album boxes found in ``genresection``.

    An album box is any ``div`` with a class matching ``albumbox``; the ID
    of each box is read with ``get_albumid``.
    """
    boxes = genresection.findAll("div", {'class': re.compile("albumbox")})
    return [get_albumid(box) for box in boxes]

# parses album info from right side of screen
def get_album_info(item):
    """Extract the details of one release element.

    Returns a list
    ``[artistname, releasename, albumid, artistrank, playtrackcount, classname]``:

    * ``artistname``: text of the ``creditartist`` span (required; a
      missing span raises ``AttributeError``).
    * ``releasename``: text of the ``creditrelease`` span; when the span has
      no single text node, its full markup as text; ``''`` when the span is
      missing.
    * ``albumid``: ``albumid`` attribute of the ``play trackcount`` span.
    * ``artistrank``: the element's ``title`` attribute with the
      ``'artist rank: '`` prefix removed, or ``'NA'`` without a title.
    * ``playtrackcount``: text of the ``play trackcount`` span, else of the
      ``play trackcount singletrack`` span, else ``'NA'``.
    * ``classname``: the element's class values joined with ``'-'``.
    """
    count_span = item.find('span', attrs={"class" : "play trackcount"})
    if count_span is not None:
        albumid = count_span.get('albumid')
    else:
        # NOTE(review): this fallback stores the first link element itself
        # (or None), not an ID string -- confirm what downstream expects.
        albumid = item.find('a', href=True)

    artistname = item.find('span', attrs={"class" : "creditartist"}).string

    release_span = item.find('span', attrs={"class" : "creditrelease"})
    if release_span is None:
        releasename = ''
    else:
        releasename = release_span.string
        if releasename is None:
            # Fall back to the span's markup; decode so the value is text
            # rather than a bytes object under Python 3.
            releasename = release_span.encode('UTF-8').decode('UTF-8')

    title = item.get('title')
    artistrank = 'NA' if title is None else title.replace('artist rank: ', '')

    # Prefer the regular track-count span, then the single-track variant.
    track_span = count_span
    if track_span is None:
        track_span = item.find('span', attrs={"class" : "play trackcount singletrack"})
    playtrackcount = 'NA' if track_span is None else track_span.string

    classname = '-'.join(item.get('class'))

    return([artistname, releasename, albumid, artistrank, playtrackcount, classname])


# find all genre sections and save album IDs and relevant genres
def get_all_genres(soup):
    """Pair every album ID in the genre sections with its genre labels.

    A genre section is a ``div`` with class ``"leftgenre genre"``. The text
    of each ``clustergenre`` link in a section is joined into one string,
    with ``'|'`` after every label.

    Returns a DataFrame with the columns ``albumid`` and ``genre``, one row
    per album found in a section. The columns are present even when no
    section is found, so the result can always be merged on ``albumid``.
    """
    genredata = []

    for genre in soup.findAll("div", {"class": "leftgenre genre"}):
        genr_string = ''.join(
            link.string + '|'
            for link in genre.findAll("a", {"class": "clustergenre"})
        )
        genredata.extend(
            [albumid, genr_string] for albumid in get_albums_from_genre(genre)
        )

    return pandas.DataFrame(genredata, columns=['albumid', 'genre'])

# gets all releases from the right side of the screen
def get_all_releases(soup):
    """Collect the release elements of the page into a DataFrame.

    Candidates are all ``div`` elements with a class matching
    ``album|other``. A candidate is kept when its serialized markup
    contains one of four exact class attribute strings. Each kept element
    is parsed with ``get_album_info``.

    Returns a DataFrame with the columns ``artistname``, ``albumname``,
    ``albumid``, ``artistrank``, ``playcountrank`` and ``releasetype``; the
    columns are present even when no element is kept.
    """
    # Tag.encode() returns bytes, so the markers must be bytes as well.
    markers = (
        b'class="other "',
        b'class="other single"',
        b'class="album "',
        b'class="album single"',
    )
    candidates = soup.findAll("div", {'class': re.compile("album|other")})

    items = []
    for candidate in candidates:
        markup = candidate.encode('UTF-8')  # serialize once per element
        # NOTE(review): an element whose markup contains several markers is
        # added once per marker -- confirm whether that is intended.
        for marker in markers:
            if marker in markup:
                items.append(candidate)

    rows = [get_album_info(entry) for entry in items]
    releases = pandas.DataFrame(
        rows,
        columns=['artistname', 'albumname', 'albumid', 'artistrank', 'playcountrank', 'releasetype'],
    )

    return(releases)

In [6]:
# function to process all data: takes downloaded raw html file as input
def process(fn='c:/Users/hanne/Dropbox/Tilburg/Projects/NewReleases/data/raw/2018-06-29.html'):
    """Parse one downloaded release page and write the result as a CSV file.

    ``fn`` is the path of a raw HTML file, read as UTF-8. The date is the
    last ten characters of the text in the page's ``title`` div. The
    releases are left-joined with the genre table on ``albumid`` and
    written tab-separated to ``out/spotify-releases_<date>.csv``. The
    ``out`` directory is created if it does not exist.
    """
    print('processing: '+fn)
    # read the file and turn it into soup; the context manager closes the
    # file even if reading or parsing fails
    with io.open(fn,'r', encoding='UTF-8') as g:
        soup = BeautifulSoup(g.read(), 'html.parser')
    # extract date (used to store in table and file name)
    date=soup.find('div', attrs={"class" : "title"}).string[-10:]
    print('getting genres')
    genres=get_all_genres(soup)
    print('getting all releases')
    releases=get_all_releases(soup)
    # merging genres to all releases
    print('merging')
    df2=releases.merge(genres, how='left', on='albumid')
    # saving
    df2['date'] = date
    # to_csv does not create missing directories
    os.makedirs('out', exist_ok=True)
    df2.to_csv('out/spotify-releases_'+ date+ '.csv', encoding= 'UTF-8',sep='\t', index=False)
    print('done')

In [7]:
# try out for one date: called without arguments, so process() parses the
# file named by its default `fn` path
process()

processing: c:/Users/hanne/Dropbox/Tilburg/Projects/NewReleases/data/raw/2018-06-29.html
getting genres
getting all releases
merging
done


# Parse all files

In [None]:
import pandas

In [None]:
# gather the names of all regular files in the raw download folder
mypath='raw/'
from os import listdir
from os.path import isfile, join
onlyfiles = list(filter(lambda name: isfile(join(mypath, name)), listdir(mypath)))

# parse every collected file
for fn in onlyfiles:
    process(fn='raw/'+fn)