A Jupyter notebook that downloads genre data from https://everynoise.com/

In [1]:
import json
import os

import requests
from bs4 import BeautifulSoup as bs

In [2]:
# download and parse page
# bounded wait + explicit status check: fail here with a clear HTTP error
# instead of hanging or silently parsing an error page
r = requests.get('https://everynoise.com/', timeout=30)
r.raise_for_status()
# NOTE(review): r.text is decoded with whatever charset requests infers for
# the response; non-ASCII genre names depend on that guess -- confirm
soup = bs(r.text, features='lxml')

# get canvas
# everything below reads from a single <div class="canvas">, so insist on exactly one
canvases = soup.find_all('div', attrs={'class': 'canvas'})
assert len(canvases) == 1, f'expected exactly one canvas div, found {len(canvases)}'
canvas = canvases[0]

# get data from styles
# used below as well
def extract_style_elems(element):
    """Parse an element's inline ``style`` attribute into a dict.

    The style string is cut on ``;`` into declarations; each declaration that
    contains a ``:`` is cut at its first ``:`` into a property and a value,
    both stripped of surrounding whitespace. Declarations without a ``:`` are
    ignored, and a repeated property keeps its last value.

    Args:
        element: Any object supporting ``element['style']`` that yields a string.

    Returns:
        dict mapping property name to value, both as strings.
    """
    # partition() gives (before, ':', after) when a colon is present and
    # (chunk, '', '') otherwise, so an empty separator marks a chunk to skip
    declarations = (chunk.partition(':') for chunk in element['style'].split(';'))
    return {
        prop.strip(): val.strip()
        for prop, colon, val in declarations
        if colon
    }

# read the canvas size from its inline style; both values are given in px
canvas_style_elems = extract_style_elems(canvas)
canvas_height, canvas_width = (
    int(canvas_style_elems[dimension].replace('px', ''))
    for dimension in ('height', 'width')
)
print('Canvas Dimensions:', canvas_height, canvas_width)

Canvas Dimensions: 22683 1610


In [3]:
# get data from an element
def extract_genre_details(element, canvas_height, canvas_width):
    """Collect the name, position, colour and font size of one genre element.

    Args:
        element: Tag-like object exposing ``.text`` and an inline ``style``
            with ``left`` and ``top`` in px, a ``#rrggbb`` ``color`` and a
            percentage ``font-size``.
        canvas_height: Canvas height in px; ``top`` is subtracted from it so
            that ``y`` is measured from the bottom of the canvas.
        canvas_width: Not used by this function; kept so existing calls that
            pass it keep working.

    Returns:
        dict with keys ``name``, ``x``, ``y``, ``r``, ``g``, ``b``, ``color``
        and ``fontsize``.

    Raises:
        KeyError: if a required style property is missing.
        ValueError: if a numeric style value or the colour cannot be parsed.
    """
    # The label text carries a U+00BB marker followed by a space. Remove it in
    # its proper form and in the two-character form (U+00C2 U+00BB) it takes
    # when UTF-8 text has been decoded as Latin-1. The mis-decoded form must be
    # removed first, otherwise a stray U+00C2 would be left behind. Escapes
    # keep these literals intact however this file itself is re-encoded.
    name = element.text.replace('\u00c2\u00bb ', '').replace('\u00bb ', '')

    style = extract_style_elems(element)
    x = int(style['left'].replace('px', ''))
    # flip the vertical axis: CSS 'top' grows downwards
    y = canvas_height - int(style['top'].replace('px', ''))

    # split '#rrggbb' into its three 8-bit channels
    color = style['color']
    hex_digits = color.lstrip('#')
    red, green, blue = (int(hex_digits[i:i + 2], 16) for i in (0, 2, 4))

    fontsize = int(style['font-size'].replace('%', ''))
    return {
        'name': name,
        'x': x,
        'y': y,
        'r': red,
        'g': green,
        'b': blue,
        'color': color,
        'fontsize': fontsize,
    }

# build a name -> details lookup from every <div> inside the canvas;
# if two elements share a name, the later one wins
genre_attrs = {
    details['name']: details
    for details in (
        extract_genre_details(div, canvas_height, canvas_width)
        for div in canvas.find_all('div')
    )
}
print('Genre Count:', len(genre_attrs))

Genre Count: 6291


In [4]:
# Define the attributes to normalize
attributes = ['x', 'y', 'r', 'g', 'b']

# Rescale each attribute to the 0-100 range across all genres and store the
# result, rounded to two decimals, next to the raw value as '<attr>_norm'
for attr in attributes:
    # collect the values once instead of rebuilding the list for min and max
    attr_values = [details[attr] for details in genre_attrs.values()]
    min_attr = min(attr_values)
    max_attr = max(attr_values)
    span = max_attr - min_attr
    for genre in genre_attrs:
        value = genre_attrs[genre][attr]
        if span:
            normalized_value = round((value - min_attr) / span * 100, 2)
        else:
            # every genre shares this value; avoid dividing by zero
            normalized_value = 0.0
        genre_attrs[genre][f'{attr}_norm'] = normalized_value


In [5]:
# add a few extra genre names, each pointing at the entry of an existing genre
# (the alias shares the same dict object, including its 'name' field)
extra_aliases = {
    'indie': 'la indie',
    'canadian indie rock': 'indie rock',
    'french house': 'filter house',
}
for alias, existing in extra_aliases.items():
    genre_attrs[alias] = genre_attrs[existing]

In [6]:
# dump data to disk
# create the target directory first so the cell also works on a fresh checkout
os.makedirs('cache', exist_ok=True)
with open('cache/genre_attrs.json', 'w') as file:
    json.dump(genre_attrs, file)